From 4aa84003cad57610d05603367261bce9d27cafe4 Mon Sep 17 00:00:00 2001 From: guyang3532 <62738430+guyang3532@users.noreply.github.com> Date: Wed, 27 Mar 2024 16:10:07 +0800 Subject: [PATCH 1/9] support Pow/Div/Sqrt in PaddingElimination (#20083) --- .../compute_optimizer/padding_elimination.cc | 11 ++++-- .../python/orttraining_test_ortmodule_api.py | 35 +++++++++++++------ 2 files changed, 32 insertions(+), 14 deletions(-) diff --git a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc index d42af92c7c66d..1f65d886a4b8b 100644 --- a/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc +++ b/orttraining/orttraining/core/optimizer/compute_optimizer/padding_elimination.cc @@ -224,8 +224,10 @@ void IterateSubgraphFromNode(Graph& graph, visited.insert(cur); if (graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Add", {7, 13, 14}) || graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "BiasGelu", {1}, kMSDomain) || - graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Sub", {7, 13, 14}) || - graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Mul", {7, 13, 14})) { + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Div", {7, 13, 14}) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Mul", {7, 13, 14}) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Pow", {7, 12, 13, 15}) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Sub", {7, 13, 14})) { ORT_ENFORCE(subgraph.find(cur->MutableInputDefs()[0]) != subgraph.end() || subgraph.find(cur->MutableInputDefs()[1]) != subgraph.end()); if (cur->InputDefs()[0]->Shape() && cur->InputDefs()[1]->Shape()) { @@ -278,7 +280,10 @@ void IterateSubgraphFromNode(Graph& graph, subgraph.insert(cur->MutableOutputDefs()[1]); PushAllOutputNode(graph, to_visit, cur, visited); } else if (graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Cast", {9, 13}) || - graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Gelu", {1}, kMSDomain)) { + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "FastGelu", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Gelu", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "QuickGelu", {1}, kMSDomain) || + graph_utils::IsSupportedOptypeVersionAndDomain(*cur, "Sqrt", {6, 13})) { ORT_ENFORCE(subgraph.find(cur->MutableInputDefs()[0]) != subgraph.end()); subgraph.insert(cur->MutableOutputDefs()[0]); PushAllOutputNode(graph, to_visit, cur, visited); diff --git a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py index da217eb76949c..5078058995281 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py +++ b/orttraining/orttraining/test/python/orttraining_test_ortmodule_api.py @@ -5725,8 +5725,6 @@ def run_step(model, input, target): @pytest.mark.parametrize("label_is_sparse", [False, True]) @pytest.mark.parametrize("rank", [1, 2]) def test_runtime_inspector_label_and_embed_sparsity_detection(embed_is_sparse, label_is_sparse, rank, caplog): - os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] = "1" - class NeuralNetCrossEntropyLoss(torch.nn.Module): def __init__(self, num_embeddings, embedding_dim): super().__init__() @@ -5797,10 +5795,12 @@ def run_step(model, input, positions): "test_cases", [ ("Add", 0), + ("Add", 1), ("Add", 2), ("Add", 3), ("Add", 4), ("Sub", 0), + ("Sub", 1), ("Sub", 2), 
("Sub", 3), ("Sub", 4), @@ -5808,12 +5808,22 @@ def run_step(model, input, positions): ("Mul", 2), ("Mul", 3), ("Mul", 4), + ("Div", 0), + ("Div", 2), + ("Div", 3), + ("Div", 4), + ("Pow", 0), + ("Pow", 1), + ("Pow", 2), + ("Pow", 3), + ("Pow", 4), ("MatMul", 0), ("MatMul", 1), ("Dropout", 0), ("LayerNormalization", 0), ("LayerNormalization", 1), ("Cast", 0), + ("Sqrt", 0), ("BiasGelu", 0), ("Gelu", 0), ("ReduceMean", 0), @@ -5821,7 +5831,6 @@ def run_step(model, input, positions): ], ) def test_ops_for_padding_elimination(test_cases): - os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] = "1" test_op = test_cases[0] case = test_cases[1] @@ -5848,7 +5857,7 @@ def __init__(self, vocab_size, hidden_size, pad_token_id): # pattern should be insert to the arg of [batch_size, 1, hidden_size]. # in case 3, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [1, hidden_size], # the test_op should be included in padding elimination subgraph and a 'Expand + FlattenAndUnpad' - # pattern should be insert to the arg of [batch_size, 1, hidden_size]. + # pattern should be insert to the arg of [1, hidden_size]. # in case 4, the shapes of inputs of test_op are [batch_size, seqlen, hidden_size] and [batch_size, seqlen, hidden_size], # the test_op should be included in padding elimination subgraph and the PadAndUnflatten should be added to # output of test_op. Besides, the other input of Add should be added 'FlattenAndUnpad' to @@ -5858,6 +5867,8 @@ def test_elementwise(self, input_ids): one_input = None if case == 0: one_input = torch.ones(self.hidden_size, dtype=torch.long).to(device) + elif case == 1: + one_input = 1 elif case == 2: one_input = torch.ones((input_shape[0], 1, self.hidden_size), dtype=torch.long).to(device) elif case == 3: @@ -5872,6 +5883,10 @@ def test_elementwise(self, input_ids): output = one_input - inputs_embeds elif test_op == "Mul": output = one_input * inputs_embeds + elif test_op == "Div": + output = inputs_embeds / one_input + elif test_op == "Pow": + output = inputs_embeds ** (one_input * 2) else: output = None return output @@ -5911,6 +5926,8 @@ def test_other(self, input_ids): output = torch.nn.functional.gelu(inputs_embeds + bias) elif test_op == "Gelu": output = torch.nn.functional.gelu(inputs_embeds) + elif test_op == "Sqrt": + output = torch.sqrt(inputs_embeds) elif test_op == "ReduceMean": # In case 0, the inputs_embeds are reduced at last dimension, the ReduceMean should be included in padding # elimination subgraph and the PadAndUnflatten should be added to output of ReduceMean. 
@@ -5924,7 +5941,7 @@ def test_other(self, input_ids): return output def forward(self, input_ids): - if test_op in ["Add", "Mul", "Sub"]: + if test_op in ["Add", "Mul", "Sub", "Div", "Pow"]: output = self.test_elementwise(input_ids) elif test_op == "MatMul": output = self.test_matmul(input_ids) @@ -5953,7 +5970,7 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): model(x) training_model = model._torch_module._execution_manager(True)._onnx_models.optimized_model - if test_op == "Sub": + if test_op == "Sub" or test_op == "Pow": assert len([node.op_type for node in training_model.graph.node if node.op_type == "Sub"]) == 2 else: assert len([node.op_type for node in training_model.graph.node if node.op_type == "Sub"]) == 1 @@ -5974,7 +5991,7 @@ def find_input_node_type(model, arg): return result[0].op_type if len(result) == 1 else None recover_pad_input_optypes = [find_input_node_type(training_model, arg) for arg in recover_pad_node.input] - if test_op == "Add" or test_op == "Mul" or test_op == "Sub": + if test_op == "Add" or test_op == "Mul" or test_op == "Sub" or test_op == "Div" or test_op == "Pow": assert test_op in recover_pad_input_optypes else: if case == 0: @@ -5982,11 +5999,8 @@ def find_input_node_type(model, arg): else: assert "ATen" in recover_pad_input_optypes - del os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] - def test_e2e_padding_elimination(): - os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] = "1" seed = 5033 random.seed(seed) np.random.seed(seed) @@ -6129,7 +6143,6 @@ def generate_inputs(batch_size, max_seq_length, vocab_size): training_model = ort_model._torch_module._execution_manager(True)._onnx_models.optimized_model assert "FlattenAndUnpad" in [node.op_type for node in training_model.graph.node] assert "PadAndUnflatten" in [node.op_type for node in training_model.graph.node] - del os.environ["ORTMODULE_ENABLE_EMBEDDING_SPARSE_OPTIMIZER"] @pytest.mark.skipif( From ca465dc087990682adc1db9eeaa0bcc4a1464287 Mon Sep 17 00:00:00 2001 From: Nanashi Date: Wed, 27 Mar 2024 18:07:00 +0900 Subject: [PATCH 2/9] [js] Make error friendly when isOrtFormat is undefined (#19958) ### Description Make the error message friendly when isOrtFormat is undefined (i.e. when `onnxruntime.InferenceSession.create` is called with an ArrayBuffer or Uint8Array). ### Motivation and Context I was trying to run my onnx model with the WebGL EP, but it gave me the error "Cannot read properties of null (reading 'irVersion')". I used the debugger to find that the actual error was `int64 is not supported`, but that error was invisible to me. So I changed the code to show both errors when isOrtFormat is undefined. I haven't written a unit test yet, so I'm marking this as a draft. (I have no idea how to test this, though...)
[d62d942](https://github.com/microsoft/onnxruntime/pull/19958/commits/d62d9425ba7b9e5ff0d0a2ae6998dd53817d5db9) --- js/web/lib/onnxjs/model.ts | 12 +++++++++++- js/web/test/e2e/browser-test-webgl.js | 13 +++++++++++++ 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/js/web/lib/onnxjs/model.ts b/js/web/lib/onnxjs/model.ts index f9a1b6e76089d..8e689626011be 100644 --- a/js/web/lib/onnxjs/model.ts +++ b/js/web/lib/onnxjs/model.ts @@ -16,6 +16,7 @@ export class Model { constructor() {} load(buf: Uint8Array, graphInitializer?: Graph.Initializer, isOrtFormat?: boolean): void { + let onnxError: Error|undefined; if (!isOrtFormat) { // isOrtFormat === false || isOrtFormat === undefined try { @@ -25,10 +26,19 @@ export class Model { if (isOrtFormat !== undefined) { throw e; } + onnxError = e; } } - this.loadFromOrtFormat(buf, graphInitializer); + try { + this.loadFromOrtFormat(buf, graphInitializer); + } catch (e) { + if (isOrtFormat !== undefined) { + throw e; + } + // Tried both formats and failed (when isOrtFormat === undefined) + throw new Error(`Failed to load model as ONNX format: ${onnxError}\nas ORT format: ${e}`); + } } private loadFromOnnxFormat(buf: Uint8Array, graphInitializer?: Graph.Initializer): void { diff --git a/js/web/test/e2e/browser-test-webgl.js b/js/web/test/e2e/browser-test-webgl.js index e503f38ae5735..974c81d064c89 100644 --- a/js/web/test/e2e/browser-test-webgl.js +++ b/js/web/test/e2e/browser-test-webgl.js @@ -6,3 +6,16 @@ it('Browser E2E testing - WebGL backend', async function() { await testFunction(ort, {executionProviders: ['webgl']}); }); + +it('Browser E2E testing - invalid buffer', async () => { + try { + await ort.InferenceSession.create( + new Uint8Array(Array.from({length: 100}, () => 42)), {executionProviders: ['webgl']}); + + // Should not reach here. 
+ assert(false); + } catch (e) { + assert(e.message.includes('as ONNX format')); + assert(e.message.includes('as ORT format')); + } +}); From 47903e701a4b20b8f43ba2c77392db8c31a3f664 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Wed, 27 Mar 2024 10:35:43 -0700 Subject: [PATCH 3/9] fix condition in web CI YAML (#20095) ### Description fix condition in web CI YAML --- .../github/azure-pipelines/templates/win-web-ci.yml | 12 ++++++------ .../templates/win-web-multi-browsers.yml | 6 +++--- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml index b7aee559cf73c..fa6103fb8a59d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-ci.yml @@ -155,7 +155,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\01 npm test -- -e=chrome -b=webgl,wasm --user-data-dir=$(Agent.TempDirectory)\web\test\01 @@ -165,7 +165,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\02 npm test -- -e=chrome -b=webgl,wasm,webgpu $(webgpuCommandlineExtraFlags) --user-data-dir=$(Agent.TempDirectory)\web\test\02 @@ -175,7 +175,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\03 npm test -- suite1 -e=chrome -b=webgpu --io-binding=gpu-tensor $(webgpuCommandlineExtraFlags) --user-data-dir=$(Agent.TempDirectory)\web\test\03 @@ -185,7 +185,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\04 npm test -- suite1 -e=chrome -b=webgpu --io-binding=gpu-location $(webgpuCommandlineExtraFlags) --user-data-dir=$(Agent.TempDirectory)\web\test\04 @@ -195,7 +195,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\05 npm test -- --webgl.pack -b=webgl -e=chrome --user-data-dir=$(Agent.TempDirectory)\web\test\05 @@ -204,7 +204,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | 
Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test\06 npm test -- --wasm.proxy -b=wasm -e=chrome --user-data-dir=$(Agent.TempDirectory)\web\test\06 diff --git a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml index 00109b348e8cb..a0af221607dc8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-web-multi-browsers.yml @@ -70,7 +70,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'chrome.exe'\" | Format-List CommandLine" displayName: 'Check active Chrome processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test_multi_browsers\01 npm test -- suite0 -e=chrome -b=wasm,webgl --wasm.initTimeout=30000 --file-cache --user-data-dir=$(Agent.TempDirectory)\web\test_multi_browsers\01 @@ -79,7 +79,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'firefox.exe'\" | Format-List CommandLine" displayName: 'Check active Firefox processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test_multi_browsers\02 npm test -- suite0 -b=wasm,webgl -e=firefox --wasm.initTimeout=30000 --file-cache --user-data-dir=$(Agent.TempDirectory)\web\test_multi_browsers\02 @@ -88,7 +88,7 @@ jobs: - script: | powershell "Get-WmiObject Win32_Process -Filter \"name = 'msedge.exe'\" | Format-List CommandLine" displayName: 'Check active Edge processes (before test)' - condition: and(succeeded(), eq('$(Agent.Diagnostic)', 'true')) + condition: and(succeeded(), eq(variables['Agent.Diagnostic'], 'true')) - script: | mkdir $(Agent.TempDirectory)\web\test_multi_browsers\03 npm test -- suite0 -b=wasm,webgl -e=edge --wasm.initTimeout=30000 --file-cache --user-data-dir=$(Agent.TempDirectory)\web\test_multi_browsers\03 From c8676ffbff5218e226a25651b5b0c981dc5da798 Mon Sep 17 00:00:00 2001 From: Xiaoyu <85524621+xiaoyu-work@users.noreply.github.com> Date: Wed, 27 Mar 2024 10:40:08 -0700 Subject: [PATCH 4/9] Add ModelProto support for quantize api (#20018) ### Description Add ModelProto support for `quantize` api ### Motivation and Context Currently, the `quantize` API only accepts a model path as the input model. However, for large models, saving and loading from disk can be time-consuming. By adding `ModelProto` as an input option to the `quantize` API, significant time can be saved. 
--- .../execution_providers/qnn/preprocess.py | 8 +-- .../execution_providers/qnn/quant_config.py | 8 ++- .../python/tools/quantization/quantize.py | 45 +++++++++++---- .../tools/quantization/shape_inference.py | 55 ++++++++++++++----- 4 files changed, 83 insertions(+), 33 deletions(-) diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py index e584a65574520..85f5d967f9ee3 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/preprocess.py @@ -16,8 +16,8 @@ def qnn_preprocess_model( - model_input: Path, - model_output: Path, + model_input: str | Path | onnx.ModelProto, + model_output: str | Path, fuse_layernorm: bool = False, save_as_external_data: bool = False, all_tensors_to_one_file: bool = False, @@ -37,7 +37,7 @@ def qnn_preprocess_model( - (Optional) Fuse ReduceMean sequence into a single LayerNormalization node. Args: - model_input: Path to the input model file. + model_input: Path to the input model file or ModelProto. model_output: Path the output model file, which is only created if this method returns True. fuse_layernorm: True if ReduceMean sequences should be fused into LayerNormalization nodes. Defaults to False. @@ -82,7 +82,7 @@ def qnn_preprocess_model( to cancel out. """ modified = False - model = onnx.load_model(model_input) + model = model_input if isinstance(model_input, onnx.ModelProto) else onnx.load_model(model_input) onnx_model = ONNXModel(model) # Fuse Erf sequence into a single Gelu diff --git a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py index 479eaf5b0c542..3a217fdfaaffd 100644 --- a/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py +++ b/onnxruntime/python/tools/quantization/execution_providers/qnn/quant_config.py @@ -39,7 +39,7 @@ def warn_unable_to_override( def get_qnn_qdq_config( - model_input: Path, + model_input: str | Path | onnx.ModelProto, calibration_data_reader: CalibrationDataReader, calibrate_method=CalibrationMethod.MinMax, activation_type=QuantType.QUInt8, @@ -56,7 +56,11 @@ def get_qnn_qdq_config( if weight_symmetric is None: weight_symmetric = weight_type in {QuantType.QInt8, QuantType.QInt16} - model = onnx.load_model(model_input, load_external_data=False) + model = ( + model_input + if isinstance(model_input, onnx.ModelProto) + else onnx.load_model(model_input, load_external_data=False) + ) op_types = set() model_has_external_data = False diff --git a/onnxruntime/python/tools/quantization/quantize.py b/onnxruntime/python/tools/quantization/quantize.py index 9b0c15e4b4dde..9ebd7bf3c408a 100644 --- a/onnxruntime/python/tools/quantization/quantize.py +++ b/onnxruntime/python/tools/quantization/quantize.py @@ -6,6 +6,9 @@ import logging import tempfile from pathlib import Path +from typing import Union + +import onnx from .calibrate import CalibrationDataReader, CalibrationMethod, TensorsData, create_calibrator from .onnx_quantizer import ONNXQuantizer @@ -16,6 +19,7 @@ QuantType, load_model_with_shape_infer, model_has_pre_process_metadata, + save_and_reload_model_with_shape_infer, ) from .registry import IntegerOpsRegistry, QDQRegistry, QLinearOpsRegistry @@ -280,8 +284,8 @@ def check_static_quant_arguments(quant_format: QuantFormat, activation_type: Qua def quantize_static( - model_input, - 
model_output, + model_input: Union[str, Path, onnx.ModelProto], + model_output: Union[str, Path], calibration_data_reader: CalibrationDataReader, quant_format=QuantFormat.QDQ, op_types_to_quantize=None, @@ -304,7 +308,7 @@ def quantize_static( Args: - model_input: file path of model to quantize + model_input: file path of model or ModelProto to quantize model_output: file path of quantized model calibration_data_reader: a calibration data reader. It enumerates calibration data and generates inputs for the @@ -435,7 +439,11 @@ def quantize_static( qdq_ops = list(QDQRegistry.keys()) op_types_to_quantize = list(set(q_linear_ops + qdq_ops)) - model = load_model_with_shape_infer(Path(model_input)) + model = ( + save_and_reload_model_with_shape_infer(model_input) + if isinstance(model_input, onnx.ModelProto) + else load_model_with_shape_infer(Path(model_input)) + ) pre_processed: bool = model_has_pre_process_metadata(model) if not pre_processed: @@ -485,6 +493,15 @@ def inc_dataloader(): model = load_model_with_shape_infer(Path(model_input)) # use smooth quant model for calibration with tempfile.TemporaryDirectory(prefix="ort.quant.") as quant_tmp_dir: + if isinstance(model_input, onnx.ModelProto): + output_path = str(Path(quant_tmp_dir) / "model_input.onnx") + onnx.save_model( + model_input, + output_path, + save_as_external_data=True, + ) + model_input = output_path + calibrator = create_calibrator( Path(model_input), op_types_to_quantize, @@ -546,8 +563,8 @@ def inc_dataloader(): def quantize_dynamic( - model_input: Path, - model_output: Path, + model_input: Union[str, Path, onnx.ModelProto], + model_output: Union[str, Path], op_types_to_quantize=None, per_channel=False, reduce_range=False, @@ -560,7 +577,7 @@ def quantize_dynamic( """Given an onnx model, create a quantized onnx model and save it into a file Args: - model_input: file path of model to quantize + model_input: file path of model or ModelProto to quantize model_output: file path of quantized model op_types_to_quantize: specify the types of operators to quantize, like ['Conv'] to quantize Conv only. @@ -609,7 +626,11 @@ def quantize_dynamic( if not op_types_to_quantize or len(op_types_to_quantize) == 0: op_types_to_quantize = list(IntegerOpsRegistry.keys()) - model = load_model_with_shape_infer(Path(model_input)) + model = ( + save_and_reload_model_with_shape_infer(model_input) + if isinstance(model_input, onnx.ModelProto) + else load_model_with_shape_infer(Path(model_input)) + ) pre_processed: bool = model_has_pre_process_metadata(model) if not pre_processed: @@ -642,15 +663,15 @@ def quantize_dynamic( def quantize( - model_input: Path, - model_output: Path, + model_input: Union[str, Path, onnx.ModelProto], + model_output: Union[str, Path], quant_config: QuantConfig, ): """Quantize a model with QuantConfig. Args: - model_input (Path): Path to the model to quantize. - model_output (Path): Path to save the quantized model. + model_input (str | Path | ModelProto): Path to the model or ModelProto to quantize. + model_output (str | Path): Path to save the quantized model. quant_config (QuantConfig): Quantization Configuration. 
""" diff --git a/onnxruntime/python/tools/quantization/shape_inference.py b/onnxruntime/python/tools/quantization/shape_inference.py index b7d4726610387..7368304837a96 100644 --- a/onnxruntime/python/tools/quantization/shape_inference.py +++ b/onnxruntime/python/tools/quantization/shape_inference.py @@ -9,12 +9,13 @@ import tempfile import traceback from pathlib import Path -from typing import Optional +from typing import Optional, Union import onnx import onnxruntime from onnxruntime.tools.symbolic_shape_infer import SymbolicShapeInference +from onnxruntime.transformers.onnx_utils import extract_raw_data_from_model, has_external_data from .quant_utils import add_pre_process_metadata @@ -22,8 +23,8 @@ def quant_pre_process( - input_model_path: str, - output_model_path: str, + input_model: Union[str, Path, onnx.ModelProto], + output_model_path: Union[str, Path], skip_optimization: bool = False, skip_onnx_shape: bool = False, skip_symbolic_shape: bool = False, @@ -39,7 +40,7 @@ def quant_pre_process( """Shape inference and model optimization, in preparation for quantization. Args: - input_model_path: Path to the input model file") + input_model: Path to the input model file or ModelProto output_model_path: Path to the output model file skip_optimization: Skip model optimization step if true. This may result in ONNX shape inference failure for some models. @@ -68,8 +69,9 @@ def quant_pre_process( if not skip_symbolic_shape: logger.info("Performing symbolic shape inference...") + loaded_model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model) model = SymbolicShapeInference.infer_shapes( - onnx.load(input_model_path), + loaded_model, int_max, auto_merge, guess_output_rank, @@ -80,18 +82,18 @@ def quant_pre_process( # Use ORT optimizers (native code) to optimize model if not skip_symbolic_shape: # Need to save the inferenced model to file so as to run the optimizer - input_model_path = str(temp_path / "symbolic_shape_inferred.onnx") + input_model = str(temp_path / "symbolic_shape_inferred.onnx") if save_as_external_data: onnx.save_model( model, - input_model_path, + input_model, save_as_external_data=True, all_tensors_to_one_file=all_tensors_to_one_file, size_threshold=external_data_size_threshold, convert_attribute=False, ) else: - onnx.save(model, input_model_path) + onnx.save(model, input_model) model = None opt_model_path = str(temp_path / "optimized.onnx") @@ -99,7 +101,19 @@ def quant_pre_process( sess_option = onnxruntime.SessionOptions() sess_option.optimized_model_filepath = opt_model_path sess_option.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_BASIC - sess = onnxruntime.InferenceSession(input_model_path, sess_option, providers=["CPUExecutionProvider"]) + # For large model, extract external data from model and add to session options + if isinstance(input_model, onnx.ModelProto): + if has_external_data(input_model): + raise ValueError( + "ModelProto has external data not loaded into memory, ORT cannot create session. " + "Please load external data before calling this function. " + "See https://onnx.ai/onnx/repo-docs/ExternalData.html for more information." 
+ ) + external_names, external_values = extract_raw_data_from_model(input_model) + sess_option.add_external_initializers(list(external_names), list(external_values)) + input_model = input_model.SerializeToString() + + sess = onnxruntime.InferenceSession(input_model, sess_option, providers=["CPUExecutionProvider"]) # Close the session to avoid the cleanup error on Windows for temp folders # https://github.com/microsoft/onnxruntime/issues/17627 del sess @@ -109,7 +123,7 @@ def quant_pre_process( ) logger.error(traceback.format_exc()) - input_model_path = opt_model_path + input_model = opt_model_path if not skip_onnx_shape: # ONNX shape inference. @@ -117,26 +131,37 @@ def quant_pre_process( # If the skip optimization is specified, we could be dealing with a # large model. So be on the safe side, save the model if model is not None: - input_model_path = str(temp_path / "symbolic_shape_inferred.onnx") + input_model = str(temp_path / "symbolic_shape_inferred.onnx") if save_as_external_data: onnx.save_model( model, - input_model_path, + input_model, save_as_external_data=True, all_tensors_to_one_file=all_tensors_to_one_file, size_threshold=external_data_size_threshold, convert_attribute=False, ) else: - onnx.save(model, input_model_path) + onnx.save(model, input_model) model = None + if isinstance(input_model, onnx.ModelProto): + input_model = str(Path(quant_tmp_dir) / "model_input.onnx") + onnx.save_model( + model, + input_model, + save_as_external_data=True, + all_tensors_to_one_file=all_tensors_to_one_file, + size_threshold=external_data_size_threshold, + convert_attribute=False, + ) + inferred_model_path = str(temp_path / "onnx_shape_inferred.onnx") - onnx.shape_inference.infer_shapes_path(input_model_path, inferred_model_path) + onnx.shape_inference.infer_shapes_path(input_model, inferred_model_path) model = onnx.load(inferred_model_path) if model is None: - model = onnx.load(input_model_path) + model = input_model if isinstance(input_model, onnx.ModelProto) else onnx.load(input_model) add_pre_process_metadata(model) From 4df9d16f98cd5ac60e1c6151207dfbf9f5a165a4 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 28 Mar 2024 03:20:57 +0800 Subject: [PATCH 5/9] [Fix] TSAUpload task must be in building stage (#20098) ### Description In #20085, TSAUpload was in testing stage so main branch failed. 
--- .../github/azure-pipelines/templates/py-win-gpu.yml | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml index e200fb9e93bee..59387a0de4cd1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml @@ -204,6 +204,17 @@ stages: inputs: AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll;-:file|$(Build.ArtifactStagingDirectory)\**\DirectML.dll' + - task: TSAUpload@2 + displayName: 'TSA upload' + condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + inputs: + GdnPublishTsaOnboard: false + GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' + + - template: component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + - stage: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests dependsOn: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build jobs: From ab2eaedfaa1e1abe5c128cfbccf8c6a45b75480e Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 28 Mar 2024 03:29:34 +0800 Subject: [PATCH 6/9] Install ONNX by building source code in Windows DML stage (#20079) ### Description In #20073, I pinned the onnx version to unblock the whole PR CI. In fact, we can use the onnx that is installed by building it from source, so that the onnx version is controlled by deps.txt. For historical reasons, the DML stage installed onnx from PyPI. Now onnx can be installed the same way as in the other stages. Also add an option to skip installing onnx in win-ci-prebuild-steps.yml. --- tools/ci_build/build.py | 7 ------- .../templates/jobs/win-ci-prebuild-steps.yml | 5 +++++ tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml | 5 ++--- 3 files changed, 7 insertions(+), 10 deletions(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index e1649ae251d88..7dfdbc301622a 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2083,13 +2083,6 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): # For CUDA or DML enabled builds test IOBinding feature if args.use_cuda or args.use_dml: log.info("Testing IOBinding feature") - if args.use_dml: - run_subprocess( - [sys.executable, "-m", "pip", "uninstall", "--yes", "onnx"], cwd=cwd, dll_path=dll_path - ) - run_subprocess( - [sys.executable, "-m", "pip", "install", "-q", "onnx==1.15.0"], cwd=cwd, dll_path=dll_path - ) run_subprocess([sys.executable, "onnxruntime_test_python_iobinding.py"], cwd=cwd, dll_path=dll_path) if args.use_cuda: diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml index 864513bc4d671..67a2543bfb50e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-prebuild-steps.yml @@ -13,6 +13,10 @@ parameters: type: boolean default: false +- name: InstallONNX + type: boolean + default: true + - name: WITHCACHE type: boolean default: false @@ -106,6 +110,7 @@ steps: displayName: Install ccache and update PATH to use linked versions of gcc, cc, etc +- ${{ if eq(parameters.InstallONNX, true) }}: - ${{ if eq(parameters.WITHCACHE, true) }}: - task: Cache@2 # machinepool is used to ensure the compiler is same diff --git
a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml index 53eea1d69fb0e..c333c7ef084d0 100644 --- a/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-ci-pipeline.yml @@ -68,6 +68,7 @@ stages: BuildConfig: Debug MachinePool: 'onnxruntime-Win-CPU-2022' WithCache: false + InstallONNX: false Today: $(TODAY) - task: PythonScript@0 @@ -155,7 +156,7 @@ stages: GenerateDocumentation: false WITH_CACHE: false MachinePool: 'onnxruntime-Win-CPU-2022' - + - stage: x86_release dependsOn: [] jobs: @@ -256,5 +257,3 @@ stages: GenerateDocumentation: false WITH_CACHE: false MachinePool: 'onnxruntime-Win-CPU-2022' - - From b95fd4e644775a4343c13435bd729bd64f411752 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Wed, 27 Mar 2024 13:32:36 -0700 Subject: [PATCH 7/9] Enable CUDA EP unit testing on Windows (#20039) ### Description Address build issues and source code discrepancies. Fix cuda_test_provider gtest argument stack corruption. ### Motivation and Context The `OpTester` class that is widely used for kernel testing is not suitable for testing the internal classes of EPs that are built as shared objects. Currently, CUDA EP tests run only on Linux. We want to enable testing and development on Windows, and create a usable pattern for testing other EPs' internals. Alternatives considered: abstracting EP unit tests into a separate test executable such as `onnxruntime_test_all`. This alternative was rejected as it would create a lot more changes in the established patterns, and could potentially interfere with CUDA functionality while making source code maintenance more complex. --- cmake/CMakeLists.txt | 2 +- cmake/onnxruntime_providers_cuda.cmake | 2 +- cmake/onnxruntime_unittests.cmake | 7 + .../core/framework/execution_provider.h | 2 +- .../onnxruntime/core/framework/run_options.h | 2 +- .../core/mickey/blk_q4/f16_prepack_sm80.h | 16 +- .../threadblock/quantb_mma_multistage.h | 2 +- .../quantb_meta_mma_tensor_op_tile_iterator.h | 25 ++- .../shared_library/provider_wrappedtypes.h | 15 +- onnxruntime/core/util/matrix_layout.h | 2 +- .../test/cuda_host/blkq4_fp16_quant_sm80.h | 7 +- .../cuda/test_cases/beam_search_topk.cc | 3 +- .../cuda/test_cases/blkq4_fp16_gemm_sm80.h | 22 ++- .../test_cases/blkq4_fp16_gemm_sm80_test.cc | 16 +- .../test_cases/blkq4_fp16_gemm_sm80_testcu.cu | 187 +++++++++--------- .../cuda_execution_provider_test.cc | 11 +- .../cuda/test_cases/cuda_test_provider.cc | 32 ++- .../cuda/test_cases/cuda_utils_test.cc | 7 +- .../cuda/test_cases/gemm_options_test.cc | 4 +- .../test_cases/reduction_functions_test.cc | 4 +- .../azure-pipelines/linux-gpu-ci-pipeline.yml | 4 +- .../azure-pipelines/win-gpu-ci-pipeline.yml | 18 +- 22 files changed, 216 insertions(+), 174 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index ee1959bb357fe..3293506141689 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -76,7 +76,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF) # Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through # OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical # use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
-cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF) +cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF) option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF) option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Usefuel for a very minial TRT build" OFF) diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index aeeac10ead27d..1346a9ce968c6 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -122,7 +122,7 @@ endif() if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) # cuda_provider_interface.cc is removed from the object target: onnxruntime_providers_cuda_obj and - # add to the lib onnxruntime_providers_cuda separatedly. + # added to the lib onnxruntime_providers_cuda separately. # onnxruntime_providers_cuda_ut can share all the object files with onnxruntime_providers_cuda except cuda_provider_interface.cc. set(cuda_provider_interface_src ${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_provider_interface.cc) list(REMOVE_ITEM onnxruntime_providers_cuda_src ${cuda_provider_interface_src}) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 1ffb838328643..4a351dcf90d45 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -779,6 +779,13 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS) onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock) target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey) target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common) + if (MSVC) + # Cutlass code has an issue with the following: + # warning C4100: 'magic': unreferenced formal parameter + target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$:SHELL:--compiler-options /wd4100>" + "$<$>:/wd4100>") + endif() + list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut) endif() diff --git a/include/onnxruntime/core/framework/execution_provider.h b/include/onnxruntime/core/framework/execution_provider.h index 40ca96a19aef1..16ad943a5f47e 100644 --- a/include/onnxruntime/core/framework/execution_provider.h +++ b/include/onnxruntime/core/framework/execution_provider.h @@ -53,7 +53,7 @@ struct NodeComputeInfo { DestroyFunctionStateFunc release_state_func; }; -using RunOptions = OrtRunOptions; +using RunOptions = ::OrtRunOptions; enum class DataLayout { NCHW, diff --git a/include/onnxruntime/core/framework/run_options.h b/include/onnxruntime/core/framework/run_options.h index 5444c825d7991..789c3b13f2c3e 100644 --- a/include/onnxruntime/core/framework/run_options.h +++ b/include/onnxruntime/core/framework/run_options.h @@ -45,5 +45,5 @@ struct OrtRunOptions { }; namespace onnxruntime { -using RunOptions = OrtRunOptions; +using RunOptions = ::OrtRunOptions; } // namespace onnxruntime diff --git a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h index a08cfb97eed4a..c81b4967d2719 100644 --- a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h +++ b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h @@ -110,8 +110,8 @@ struct BlockwiseQuantization { static void 
prepack_weights( int rows, int columns, - const gsl::span& weights, // <- int4 weights, column major - const gsl::span& weights_prepacked // <- int4 prepacked weights tensor, same size buffer + gsl::span weights, // <- int4 weights, column major + gsl::span weights_prepacked // <- int4 prepacked weights tensor, same size buffer ) { ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0 && (rows % QuantBlocking::kRow) == 0 && @@ -171,10 +171,10 @@ struct BlockwiseQuantization { static void prepack_quant_scales( size_t rows, size_t columns, - const gsl::span& scales, // <- quant scales, column major layout - const gsl::span& scales_prepacked // <- quant scales prepacked, same size buffer + gsl::span scales, // <- quant scales, column major layout + gsl::span scales_prepacked // <- quant scales prepacked, same size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE(scales.size() == size_t(meta_shape.product()), "Quantization scale tensor shape mismatch!"); ORT_ENFORCE(scales_prepacked.size() == size_t(meta_shape.product()), @@ -241,10 +241,10 @@ struct BlockwiseQuantization { static void prepack_quant_offsets( size_t rows, size_t columns, - const gsl::span& offsets, // <- quant offsets, int4, column major layout - const gsl::span& offsets_prepacked // <- quant offsets prepacked, double size buffer + gsl::span offsets, // <- quant offsets, int4, column major layout + gsl::span offsets_prepacked // <- quant offsets prepacked, double size buffer ) { - auto meta_shape = get_quant_meta_shape(rows, columns); + auto meta_shape = get_quant_meta_shape(static_cast(rows), static_cast(columns)); ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0, "Does not support odd number of rows or columns!"); diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h index 8b6bac8c5099a..28364cc34f2d7 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/threadblock/quantb_mma_multistage.h @@ -132,7 +132,7 @@ struct DummyType{ } CUTLASS_HOST_DEVICE - std::monostate& operator[](int idx) { + std::monostate& operator[](int /*idx */) { return dummy_; } }; diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h index 4ba39dda3db8d..26239161cf8a3 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h @@ -437,7 +437,7 @@ class QuantBMetaMmaTensorOpTileIterator const &weights, Array& dest){ static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm."); @@ -453,19 +453,18 @@ class QuantBMetaMmaTensorOpTileIterator(dest.data()); const b64* scales_ptr = reinterpret_cast(scales.data()); - const ElementOffset* offsets_ptr = nullptr; - if constexpr(kHasOffset) { offsets_ptr = offsets.data(); } + [[maybe_unused]] const ElementOffset* fragment_offsets_ptr = nullptr; + if constexpr(kHasOffset) { fragment_offsets_ptr = fragment_offsets.data(); } CUTLASS_PRAGMA_UNROLL for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){ // dequantize: d = scale * (weight - offset) // to use FMA, d = scale * weight + (scale * (-offset)) - b64 offsets; - if 
constexpr(kHasOffset){ - const uint32_t* p = reinterpret_cast(offsets_ptr); - + [[maybe_unused]] b64 offsets{0}; + if constexpr(kHasOffset) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* p = reinterpret_cast(fragment_offsets_ptr); asm volatile( "{\n\t" " .reg .b32 rb0, rb1;\n" // b32 regs for fp16x2 mul operands @@ -486,7 +485,7 @@ class QuantBMetaMmaTensorOpTileIterator= 800)) asm volatile( @@ -541,7 +540,7 @@ class QuantBMetaMmaTensorOpTileIterator(-16 - int(offsets[idx])); + offset = s * static_cast(-16 - static_cast(fragment_offsets[idx])); } else { offset = s * static_cast(-16-8); } @@ -795,13 +794,13 @@ class QuantBMetaMmaTensorOpTileIterator(scales.data()); - uint32_t* addon_ptr = reinterpret_cast(addon); - if constexpr (kHasOffset){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* scales_ptr = reinterpret_cast(scales.data()); + uint32_t* addon_ptr = reinterpret_cast(addon); // possible buffer over read 2 bytes here. const uint32_t* p = reinterpret_cast(offsets.data()); -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( "{\n\t" " .reg .b32 rb0, rb1, rb2;\n" diff --git a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h index bdad18c7edec0..3bb938c1a3197 100644 --- a/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h +++ b/onnxruntime/core/providers/shared_library/provider_wrappedtypes.h @@ -394,14 +394,6 @@ struct ConfigOptions final { PROVIDER_DISALLOW_ALL(ConfigOptions) }; -struct OrtRunOptions final { - const ConfigOptions& GetConfigOptions() const { - return g_host->RunOptions__GetConfigOptions(this); - } - - PROVIDER_DISALLOW_ALL(OrtRunOptions) -}; - struct ComputeCapability final { static std::unique_ptr Create(std::unique_ptr t_sub_graph) { return g_host->ComputeCapability__construct(std::move(t_sub_graph)); } static void operator delete(void* p) { g_host->ComputeCapability__operator_delete(reinterpret_cast(p)); } @@ -1283,3 +1275,10 @@ template <> inline gsl::span Tensor::DataAsSpan() const { return g_host->Tensor__DataAsSpan_int64(this); } } // namespace onnxruntime + +struct OrtRunOptions final { + const onnxruntime::ConfigOptions& GetConfigOptions() const { + return onnxruntime::g_host->RunOptions__GetConfigOptions(this); + } + PROVIDER_DISALLOW_ALL(OrtRunOptions) +}; diff --git a/onnxruntime/core/util/matrix_layout.h b/onnxruntime/core/util/matrix_layout.h index 783a29d8a2055..43843da3fb96e 100644 --- a/onnxruntime/core/util/matrix_layout.h +++ b/onnxruntime/core/util/matrix_layout.h @@ -378,7 +378,7 @@ class MatrixRef { MatrixRef( NonConstMatrixRef const& ref, ///< MatrixRef to non-const data /// SFINAE trick to avoid creating a copy-constructor when Element_ is already non-const - _Magic magic = (typename std::enable_if::type)0 + [[maybe_unused]] _Magic magic = (typename std::enable_if::type)0 ) : data_(ref.data()), shape_(ref.shape()), layout_(Layout::packed(ref.shape())) {} ORT_FORCEINLINE diff --git a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h index 6ea8b55505214..942b1c4d2c2ad 100644 --- a/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h +++ b/onnxruntime/test/cuda_host/blkq4_fp16_quant_sm80.h @@ -74,7 +74,8 @@ inline void sm80_prepack_quant_scales_ref( int columns, const MatrixRef& tensor_scale, const MatrixRef& tensor_scale_prepacked) { - ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] 
== (columns / QuantBlocking::kColumn), + ORT_ENFORCE(tensor_scale.shape()[0] == (rows / QuantBlocking::kRow) && tensor_scale.shape()[1] == + (columns / QuantBlocking::kColumn), "Unexpected tensor_scale shape! Expected: (", rows / QuantBlocking::kRow, ", ", columns / QuantBlocking::kColumn, ")"); ORT_ENFORCE(tensor_scale_prepacked.shape() == tensor_scale.shape()); @@ -84,7 +85,9 @@ inline void sm80_prepack_quant_scales_ref( // 2 B operand tiles per mma instruction stacked on k dimension // (1,n) quantization blocking if constexpr (sizeof(ScaleElementT) != 2 || QuantBlocking::kRow != 1) { - ORT_THROW("sm80_prepack_quant_scales_ref should only be called for row-wise block quantization on 16b float values."); + ORT_THROW( + "sm80_prepack_quant_scales_ref should only be called for " + " row-wise block quantization on 16b float values."); } // In Ampere tensor op, each operand B tile is 8 x 8, in a warp of 32 threads, each thread diff --git a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc index 9fecec9f7e8bb..a0d115c41c14b 100644 --- a/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc +++ b/onnxruntime/test/providers/cuda/test_cases/beam_search_topk.cc @@ -80,7 +80,8 @@ TEST(TestBeamSearch, TopK) { std::vector top_k_values_ref(batch_size * k); std::vector top_k_tokens_ref(batch_size * k); std::vector top_k_indices_ref(batch_size * k); - ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, beam_size, vocab_size, k); + ComputeTopKReference(values, top_k_values_ref, top_k_tokens_ref, top_k_indices_ref, batch_size, + beam_size, vocab_size, k); const int32_t max_vocab_parts = 128; size_t buffer_size = batch_x_beam_x_vocab * 4 // input diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h index bbe370675fc48..f0dfaf1a58612 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h @@ -14,12 +14,14 @@ #pragma once +#include "test/cuda_host/blkq4_fp16_quant_sm80.h" + #include +#include -#include "core/util/matrix_layout.h" #include "core/common/common.h" #include "core/mickey/blk_q4/f16_prepack_sm80.h" -#include "test/cuda_host/blkq4_fp16_quant_sm80.h" +#include "core/util/matrix_layout.h" namespace onnxruntime { namespace cuda { @@ -48,10 +50,10 @@ Status sm80_supported(); template inline void blkq4_weights_gen( int rows, int columns, - std::vector& dequants, - std::vector& q_weights, - std::vector& q_scales, - std::vector& q_zp) { + thrust::host_vector& dequants, + thrust::host_vector& q_weights, + thrust::host_vector& q_scales, + thrust::host_vector& q_zp) { using Base = onnxruntime::cuda::BlockwiseQuantization< ElementT, block_size, @@ -74,7 +76,7 @@ inline void blkq4_weights_gen( const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns); const auto meta_shape = Base::get_quant_meta_shape(rows, columns); - const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); + [[maybe_unused]] const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); // // For testing quantization and dequantization, it is not straight @@ -120,9 +122,9 @@ inline void blkq4_weights_gen( q_scales.resize(meta_shape.product()); for (size_t i = 0; i < q_scales.size(); i++) { - uint32_t v = dis(gen); - uint32_t m = (v % 63) + 1; - uint32_t e = (v >> 6) % 4; + 
uint32_t vl = dis(gen); + uint32_t m = (vl % 63) + 1; + uint32_t e = (vl >> 6) % 4; q_scales[i] = ElementT(m / static_cast(1 << (2 + e))); } MatrixRef tensor_scale( diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc index e687ae73e66f2..e7fa0dae02fda 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc @@ -11,15 +11,15 @@ * well with CUTLASS headers. */ +#include "blkq4_fp16_gemm_sm80.h" + +#include "gtest/gtest.h" +#include #include #include "core/framework/float16.h" #include "core/mlas/inc/mlas_q4.h" -#include "blkq4_fp16_gemm_sm80.h" - -#include "gtest/gtest.h" - namespace onnxruntime { namespace test { @@ -43,10 +43,10 @@ void testPrepack(int rows, int columns) { const auto meta_shape = Base::get_quant_meta_shape(rows, columns); const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]); - std::vector q_weights; - std::vector q_scales; - std::vector q_zp; - std::vector dequants; + thrust::host_vector q_weights; + thrust::host_vector q_scales; + thrust::host_vector q_zp; + thrust::host_vector dequants; onnxruntime::cuda::test::blkq4_weights_gen( rows, columns, dequants, q_weights, q_scales, q_zp); diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu index 69c929d446ce4..210c33933d90d 100644 --- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu +++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_testcu.cu @@ -11,9 +11,11 @@ * well with gtest headers. */ +#include "blkq4_fp16_gemm_sm80.h" + #include -#include #include +#include #include "core/mickey/blk_q4/f16_gemm_sm80.h" @@ -26,13 +28,11 @@ #include "core/common/common.h" -#include "blkq4_fp16_gemm_sm80.h" - namespace onnxruntime { -namespace cuda{ -namespace test{ +namespace cuda { +namespace test { -Status sm80_supported(){ +Status sm80_supported() { cudaDeviceProp props; cudaError_t error = cudaGetDeviceProperties(&props, 0); @@ -55,27 +55,25 @@ Status sm80_supported(){ * Copied directly from cutlass util/reference/device/gemm.h * for the strange reason that compiler insists on asking * for explicit stream argument in kernel launch. -*/ + */ template < - typename ElementA, - typename LayoutA, - typename ElementB, - typename LayoutB, - typename ElementC, - typename LayoutC, - typename ScalarType, - typename AccumulatorType -> + typename ElementA, + typename LayoutA, + typename ElementB, + typename LayoutB, + typename ElementC, + typename LayoutC, + typename ScalarType, + typename AccumulatorType> void compute_gemm_ref( - cutlass::gemm::GemmCoord problem_size, - ScalarType alpha, - cutlass::TensorRef tensor_a, - cutlass::TensorRef tensor_b, - ScalarType beta, - cutlass::TensorRef tensor_c, - cutlass::TensorRef tensor_d, - AccumulatorType initial_accum = AccumulatorType(0)) { - + cutlass::gemm::GemmCoord problem_size, + ScalarType alpha, + cutlass::TensorRef tensor_a, + cutlass::TensorRef tensor_b, + ScalarType beta, + cutlass::TensorRef tensor_c, + cutlass::TensorRef tensor_d, + AccumulatorType initial_accum = AccumulatorType(0)) { // Blocking structure potentially improves performance of reference implementation // with a minor increase in complexity. 
// @@ -85,30 +83,27 @@ void compute_gemm_ref( dim3 block(16, 8); dim3 grid( - (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), - (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn) - ); + (problem_size.m() + block.x * OutputTile::kRow - 1) / (block.x * OutputTile::kRow), + (problem_size.n() + block.y * OutputTile::kColumn - 1) / (block.y * OutputTile::kColumn)); // Launch a GEMM kernel cutlass::reference::device::kernel::Gemm< - cutlass::TensorRef, - cutlass::TensorRef, - cutlass::TensorRef, - ScalarType, - AccumulatorType, - OutputTile, - cutlass::multiply_add, - cutlass::NumericConverter - ><<>>( - problem_size, - alpha, - tensor_a, - tensor_b, - beta, - tensor_c, - tensor_d, - initial_accum - ); + cutlass::TensorRef, + cutlass::TensorRef, + cutlass::TensorRef, + ScalarType, + AccumulatorType, + OutputTile, + cutlass::multiply_add, + cutlass::NumericConverter><<>>( + problem_size, + alpha, + tensor_a, + tensor_b, + beta, + tensor_c, + tensor_d, + initial_accum); } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -117,28 +112,31 @@ void compute_gemm_ref( // template < - typename Element, - typename LayoutCutlass, - typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> - > + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, + ColumnMajorLayout, RowMajorLayout>> __forceinline__ -MatrixRef make_MatrixRef(cutlass::HostTensor const& tensor) { - static_assert(std::is_same::value - || std::is_same::value); + MatrixRef + make_MatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value || + std::is_same::value); auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); - auto* ptr = const_cast::type *>(tensor.host_data()); + auto* ptr = const_cast::type*>(tensor.host_data()); return MatrixRef(ptr, tensor.capacity(), shape); } template < - typename Element, - typename LayoutCutlass, - typename Layout = std::conditional_t::value, ColumnMajorLayout, RowMajorLayout> - > + typename Element, + typename LayoutCutlass, + typename Layout = std::conditional_t::value, + ColumnMajorLayout, RowMajorLayout>> __forceinline__ -MatrixRef make_ConstMatrixRef(cutlass::HostTensor const& tensor) { - static_assert(std::is_same::value - || std::is_same::value); + MatrixRef + make_ConstMatrixRef(cutlass::HostTensor const& tensor) { + static_assert(std::is_same::value || + std::is_same::value); auto shape = make_Position(tensor.extent().row(), tensor.extent().column()); return MatrixRef(tensor.host_data(), tensor.capacity(), shape); } @@ -147,7 +145,7 @@ MatrixRef make_ConstMatrixRef(cutlass::HostTensor, - cutlass::MatrixShape<1, block_size>>::type; + typename std::conditional, + cutlass::MatrixShape<1, block_size>>::type; using GemmRunner = BlkQ4F16GemmImpl; @@ -181,17 +179,18 @@ void run_blkq4_gemm(int m, int n, int k) { using LayoutInputQScale = typename GemmRunner::LayoutInputQScale; const cutlass::gemm::GemmCoord problem_size = {m, n, k}; - const auto q_weight_shape = cutlass::make_Coord(problem_size.k()/2, problem_size.n()); - const auto meta_shape = cutlass::make_Coord(problem_size.k()/QuantBlocking::kRow, problem_size.n()/QuantBlocking::kColumn); + const auto q_weight_shape = cutlass::make_Coord(problem_size.k() / 2, problem_size.n()); + const auto meta_shape = cutlass::make_Coord(problem_size.k() / QuantBlocking::kRow, problem_size.n() / + QuantBlocking::kColumn); // // Generate 
quantized and dequantizeed input matrix B [K, N] // static_assert(std::is_same::value); - std::vector q_weights; - std::vector q_scales; - std::vector q_zp; - std::vector dequants; + thrust::host_vector q_weights; + thrust::host_vector q_scales; + thrust::host_vector q_zp; + thrust::host_vector dequants; onnxruntime::cuda::test::blkq4_weights_gen( problem_size.k(), problem_size.n(), dequants, q_weights, q_scales, q_zp); @@ -201,11 +200,11 @@ void run_blkq4_gemm(int m, int n, int k) { 4, column_wise_blocking>; - std::vector packed_w(q_weight_shape.product()); + thrust::host_vector packed_w(q_weight_shape.product()); PrepackT::prepack_weights(problem_size.k(), problem_size.n(), q_weights, packed_w); - std::vector packed_scales(meta_shape.product()); + thrust::host_vector packed_scales(meta_shape.product()); PrepackT::prepack_quant_scales(problem_size.k(), problem_size.n(), q_scales, packed_scales); - std::vector packed_zp; + thrust::host_vector packed_zp; if constexpr (has_offsets) { packed_zp.resize(meta_shape.product()); PrepackT::prepack_quant_offsets(problem_size.k(), problem_size.n(), q_zp, packed_zp); @@ -240,16 +239,16 @@ void run_blkq4_gemm(int m, int n, int k) { // thrust::device_vector d_packed_w(packed_w); cutlass::TensorRef ref_W( - reinterpret_cast(d_packed_w.data().get()), - LayoutInputWPack::packed({problem_size.k()/2, problem_size.n()/2})); + reinterpret_cast(d_packed_w.data().get()), + LayoutInputWPack::packed({problem_size.k() / 2, problem_size.n() / 2})); thrust::device_vector d_packed_scales(packed_scales); cutlass::TensorRef ref_scales( - d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape)); + d_packed_scales.data().get(), LayoutInputQScale::packed(meta_shape)); thrust::device_vector d_packed_zp(packed_zp); cutlass::TensorRef ref_zp( - d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape)); + d_packed_zp.data().get(), LayoutInputQScale::packed(meta_shape)); tensor_a.sync_device(); tensor_c.sync_device(); @@ -257,16 +256,16 @@ void run_blkq4_gemm(int m, int n, int k) { // run GEMM cutlass::Status status; - if constexpr (has_offsets){ + if constexpr (has_offsets) { status = GemmRunner::run( - nullptr, problem_size, tensor_a.device_ref(), ref_W, - ref_scales, ref_zp, - tensor_c.device_ref(), tensor_d.device_ref()); + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, ref_zp, + tensor_c.device_ref(), tensor_d.device_ref()); } else { status = GemmRunner::run( - nullptr, problem_size, tensor_a.device_ref(), ref_W, - ref_scales, - tensor_c.device_ref(), tensor_d.device_ref()); + nullptr, problem_size, tensor_a.device_ref(), ref_W, + ref_scales, + tensor_c.device_ref(), tensor_d.device_ref()); } ORT_ENFORCE(status == cutlass::Status::kSuccess, "Kernel execution failed: ", cutlassGetStatusString(status)); @@ -275,7 +274,7 @@ void run_blkq4_gemm(int m, int n, int k) { using LayoutInputB = cutlass::layout::ColumnMajor; thrust::device_vector d_dequants(dequants); cutlass::TensorRef ref_B( - d_dequants.data().get(), LayoutInputB::packed(problem_size.kn())); + d_dequants.data().get(), LayoutInputB::packed(problem_size.kn())); cutlass::HostTensor tensor_ref_d( problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from // reference kernel @@ -289,9 +288,9 @@ void run_blkq4_gemm(int m, int n, int k) { ElementComputeEpilogue beta = ElementComputeEpilogue(0); compute_gemm_ref( + ElementInputB, LayoutInputB, + ElementOutput, LayoutOutput, + ElementComputeEpilogue, ElementAccumulator>( problem_size, alpha, 
tensor_a.device_ref(), @@ -300,17 +299,17 @@ void run_blkq4_gemm(int m, int n, int k) { tensor_c.device_ref(), tensor_ref_d.device_ref()); - // Wait for kernels to finish + //// Wait for kernels to finish cudaDeviceSynchronize(); - // Copy output data from CUTLASS and reference kernel to host for comparison + //// Copy output data from CUTLASS and reference kernel to host for comparison tensor_d.sync_host(); tensor_ref_d.sync_host(); - // Check if output from CUTLASS kernel and reference kernel are equal or not + //// Check if output from CUTLASS kernel and reference kernel are equal or not bool passed = cutlass::reference::host::TensorEquals( - tensor_d.host_view(), - tensor_ref_d.host_view()); + tensor_d.host_view(), + tensor_ref_d.host_view()); ORT_ENFORCE(passed, "Gemm kernel result wrong!"); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc index 8dfaaedcbb378..72357ec7e02d2 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_execution_provider_test.cc @@ -5,11 +5,14 @@ // extra code in the core of CUDA EP and that code may // 1. slow down performance critical applications and // 2. increase binary size of ORT. + +#include "gtest/gtest.h" #include -#include "core/providers/cuda/cuda_execution_provider.h" + +#include "core/framework/run_options.h" #include "core/providers/cuda/cuda_allocator.h" +#include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_stream_handle.h" -#include "gtest/gtest.h" namespace onnxruntime { namespace cuda { @@ -22,7 +25,7 @@ TEST(TestDeferredRelease, WithArena) { CUDAExecutionProvider ep(info); AllocatorPtr gpu_alloctor = ep.CreatePreferredAllocators()[0]; - RunOptions run_opts; + onnxruntime::RunOptions run_opts; run_opts.run_tag = "log1"; // Allocator for call cudaMallocHost and cudaFreeHost // For details, see CUDAPinnedAllocator in cuda_allocator.cc. 
@@ -54,7 +57,7 @@ TEST(TestDeferredRelease, WithoutArena) { CUDAExecutionProviderInfo info; CUDAExecutionProvider ep(info); - RunOptions run_opts; + onnxruntime::RunOptions run_opts; run_opts.run_tag = "log1"; OrtDevice pinned_device{OrtDevice::CPU, OrtDevice::MemType::CUDA_PINNED, DEFAULT_CPU_ALLOCATOR_DEVICE_ID}; diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc index 96c1e173316de..d8384b432786b 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_test_provider.cc @@ -6,12 +6,11 @@ #include "core/providers/cuda/cuda_provider_factory_creator.h" #include "core/providers/cuda/cuda_provider_options.h" +#include "gtest/gtest.h" #include #include #include "core/common/gsl.h" -#include "gtest/gtest.h" - #include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_execution_provider_info.h" #include "core/providers/cuda/cuda_allocator.h" @@ -64,8 +63,15 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA { void cuda__Impl_Cast(void*, const float*, double*, size_t) override {} - Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { return CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); } - void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, const char* msg, const char* file, const int line) override { CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); } + Status CudaCall_false(int retCode, const char* exprString, const char* libName, int successCode, + const char* msg, const char* file, const int line) override { + return CudaCall(cudaError(retCode), exprString, libName, + cudaError(successCode), msg, file, line); + } + void CudaCall_true(int retCode, const char* exprString, const char* libName, int successCode, + const char* msg, const char* file, const int line) override { + CudaCall(cudaError(retCode), exprString, libName, cudaError(successCode), msg, file, line); + } void CopyGpuToCpu(void*, const void*, const size_t, const OrtMemoryInfo&, const OrtMemoryInfo&) override {} @@ -93,19 +99,27 @@ struct ProviderInfo_CUDA_TestImpl : ProviderInfo_CUDA { return nullptr; } - std::shared_ptr CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, const OrtArenaCfg*) override { + std::shared_ptr CreateCudaAllocator(int16_t, size_t, onnxruntime::ArenaExtendStrategy, + onnxruntime::CUDAExecutionProviderExternalAllocatorInfo&, + const OrtArenaCfg*) override { return nullptr; } void TestAll() override { - // TestAll is the entry point of CUDA EP's insternal tests. + // TestAll is the entry point of CUDA EP's internal tests. // Those internal tests are not directly callable from onnxruntime_test_all // because CUDA EP is a shared library now. // Instead, this is a test provider that implements all the test cases. // onnxruntime_test_all is calling this function through TryGetProviderInfo_CUDA_Test. - int argc = 1; - std::string mock_exe_name = "onnxruntime_providers_cuda_ut"; - char* argv[] = {const_cast(mock_exe_name.data())}; + char mock_exe_name[] = "onnxruntime_providers_cuda_ut"; + + // InitGoogleTest decrements argc and removes args from argv if + // recognized. 
By doing so it decrements argc and shifts argv, + // to do so, from the code comments it expects argc + 1 with the last one always being nullptr + // otherwise, windows diagnostics reports stack corruption. when + int argc = 1; // Change argc to 2 and edit the filter below if necessary + char* argv[] = {mock_exe_name, nullptr}; + // char* argv[] = {mock_exe_name, "--gtest_filter=ReductionFunctionsTest.*", nullptr}; ::testing::InitGoogleTest(&argc, argv); ORT_ENFORCE(RUN_ALL_TESTS() == 0); } diff --git a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc index 9d20bc545df5f..7468a5718425e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/cuda_utils_test.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "gtest/gtest.h" + #include #include -#include "gtest/gtest.h" - #include "core/common/common.h" #include "core/providers/cuda/shared_inc/cuda_call.h" #include "core/providers/cuda/shared_inc/cuda_utils.h" @@ -32,7 +32,8 @@ void TestFillCorrectness(size_t num_elements, TElement value) { Fill(nullptr, buffer.get(), value, num_elements); auto cpu_buffer = std::make_unique(num_elements); - CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), cudaMemcpyKind::cudaMemcpyDeviceToHost)); + CUDA_CALL_THROW(cudaMemcpy(cpu_buffer.get(), buffer.get(), num_elements * sizeof(TElement), + cudaMemcpyKind::cudaMemcpyDeviceToHost)); std::vector expected_data(num_elements, value); EXPECT_EQ(std::memcmp(cpu_buffer.get(), expected_data.data(), num_elements * sizeof(TElement)), 0); diff --git a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc index 4917701e5197d..6636e15040393 100644 --- a/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/gemm_options_test.cc @@ -1,11 +1,11 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include "gtest/gtest.h" + #include "core/common/common.h" #include "core/providers/cuda/cuda_common.h" -#include "gtest/gtest.h" - namespace onnxruntime { namespace cuda { namespace test { diff --git a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc index c460e806c1a80..ec7e98528504e 100644 --- a/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc +++ b/onnxruntime/test/providers/cuda/test_cases/reduction_functions_test.cc @@ -1,10 +1,10 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. 
-#include - #include "gtest/gtest.h" +#include + #include "core/providers/cuda/shared_inc/cuda_utils.h" #include "core/common/optional.h" #include "core/providers/cuda/reduction/reduction_functions.h" diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index b7232e9dc4ba1..0e885b71b486a 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -150,7 +150,9 @@ stages: --enable_cuda_profiling --enable_cuda_nhwc_ops \ --enable_pybind --build_java \ --use_cache \ - --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75; \ + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 \ + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON \ + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON; \ ccache -sv; \ ccache -z" workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml index eee38ac04b355..291e2f4e19401 100644 --- a/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-gpu-ci-pipeline.yml @@ -42,7 +42,12 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --enable_cuda_profiling --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: >- + --enable_pybind --build_java --build_nodejs --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --enable_cuda_profiling + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=ON + --cmake_extra_defines onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS=ON msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -59,7 +64,10 @@ stages: BuildConfig: 'RelWithDebInfo' EnvSetupScript: setup_env_cuda.bat buildArch: x64 - additionalBuildFlags: --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --skip_onnx_tests --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + additionalBuildFlags: >- + --enable_pybind --enable_training --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" + --skip_onnx_tests + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo @@ -95,7 +103,11 @@ stages: EnvSetupScript: setup_env_cuda.bat buildArch: x64 # note: need to specify `--gen_doc` when creating the build config so it has to be in additionalBuildFlags - additionalBuildFlags: --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda --cuda_home="$(Agent.TempDirectory)\v11.8" --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF + additionalBuildFlags: >- + --gen_doc validate --skip_tests --enable_pybind --use_dml --use_cuda + --cuda_home="$(Agent.TempDirectory)\v11.8" + --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=86 + --cmake_extra_defines onnxruntime_BUILD_UNIT_TESTS=OFF msbuildPlatform: x64 isX86: false job_name_suffix: x64_RelWithDebInfo From a9d9b083e425f102b1e1fbc9ed4d715afc392aab Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Wed, 27 Mar 2024 15:59:35 -0700 Subject: [PATCH 8/9] Fix py package pipeline (#20065) ### Description ### Motivation and Context Fixes #20068 --- 
 onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
index 5e6e484567988..c299cdcfe6a3d 100644
--- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
+++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu
@@ -656,7 +656,9 @@ inline __device__ float4 operator*(const float4 a, const float4 b) {
   return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w);
 }
 
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530
+// TODO(wy): use cuda common header and investigate pipeline build issue.
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 && \
+    ((__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 2)))
 inline __device__ half operator*(const half a, const half b) {
   return __float2half(__half2float(a) * __half2float(b));
 }
@@ -666,8 +668,10 @@ inline __device__ half2 operator*(const half2 a, const half2 b) {
 }
 #endif
 
+// TODO(wy): use cuda common header and investigate pipeline build issue.
 inline __device__ Half4 operator*(const Half4 a, const Half4 b) {
-#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 && \
+    ((__CUDACC_VER_MAJOR__ < 12) || ((__CUDACC_VER_MAJOR__ == 12) && (__CUDACC_VER_MINOR__ < 2)))
   Half4 result;
   result.x = a.x * b.x;
   result.y = a.y * b.y;

From 55f63a48ca07ceb7480756b68a7795a221d3a45d Mon Sep 17 00:00:00 2001
From: pengwa
Date: Thu, 28 Mar 2024 08:40:34 +0800
Subject: [PATCH 9/9] Keep original name during fusion (#20097)

### Keep original name during fusion

Keeping the original node name during fusion makes it easy to tell where a fused node came from, which is very useful when debugging execution order issues between different transformer layers. For example (see the sketch after this list for how the suffixes compose):

- A node named `/_original_module/model/layers.1/self_attn/MatMul/MatmulTransposeFusion//MatMulScaleFusion/` went through two fusion passes in the 1st transformer layer, namely `MatmulTransposeFusion` and `MatMulScaleFusion`.
- `/_original_module/model/layers.2/post_attention_layernorm/Mul_1/SimplifiedLayerNormFusion/` is a node fused by `SimplifiedLayerNormFusion`.
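The snippet below is a minimal, self-contained sketch of the naming convention only, not the optimizer code itself: `FusedNodeName` is a hypothetical helper that mirrors the `node.Name() + "/<FusionName>/"` pattern used by the fusions in this patch, and it shows how the suffixes stack up when a node goes through more than one fusion pass.

```cpp
// Sketch only: shows how fusion-pass suffixes accumulate on a node name.
#include <iostream>
#include <string>

// Hypothetical helper mirroring node.Name() + "/<FusionName>/" from the diff.
std::string FusedNodeName(const std::string& original, const std::string& fusion) {
  return original + "/" + fusion + "/";
}

int main() {
  std::string name = "/_original_module/model/layers.1/self_attn/MatMul";
  name = FusedNodeName(name, "MatmulTransposeFusion");
  name = FusedNodeName(name, "MatMulScaleFusion");
  // Prints /_original_module/model/layers.1/self_attn/MatMul/MatmulTransposeFusion//MatMulScaleFusion/
  // (the "//" appears because the second pass appends to a name that already ends with "/").
  std::cout << name << "\n";
  return 0;
}
```

In most of the fusions touched here the composed string is additionally passed through `graph.GenerateNodeName(...)`, which keeps the final name unique within the graph.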
### Motivation and Context --- onnxruntime/core/optimizer/gather_fusion.cc | 2 +- onnxruntime/core/optimizer/gemm_transpose_fusion.cc | 2 +- onnxruntime/core/optimizer/layer_norm_fusion.cc | 4 ++-- onnxruntime/core/optimizer/matmul_scale_fusion.cc | 2 +- onnxruntime/core/optimizer/matmul_transpose_fusion.cc | 6 +++--- onnxruntime/core/optimizer/quick_gelu_fusion.cc | 2 +- onnxruntime/test/optimizer/graph_transform_test.cc | 6 +++--- .../orttraining/core/optimizer/concat_replacement.cc | 2 +- 8 files changed, 13 insertions(+), 13 deletions(-) diff --git a/onnxruntime/core/optimizer/gather_fusion.cc b/onnxruntime/core/optimizer/gather_fusion.cc index 90cabff88122c..1f2b31526c6b8 100644 --- a/onnxruntime/core/optimizer/gather_fusion.cc +++ b/onnxruntime/core/optimizer/gather_fusion.cc @@ -273,7 +273,7 @@ Status GatherSliceToSplitFusion::ApplyImpl(Graph& graph, bool& modified, int gra split_initializer_proto.add_dims(static_cast(split_values.size())); split_initializer_proto.mutable_int64_data()->Add(split_values.begin(), split_values.end()); NodeArg* split_initializer_arg = &graph_utils::AddInitializer(graph, split_initializer_proto); - Node& split_node = graph.AddNode(graph.GenerateNodeName("Split"), "Split", "Split for Fused Gather nodes", + Node& split_node = graph.AddNode(nodes_to_fuse[0].get().Name() + "/GatherSliceToSplitFusion/", "Split", "Split for Fused Gather nodes", {graph.GetNodeArg(node_arg->Name()), split_initializer_arg}, split_outputs); split_node.AddAttribute("axis", axis); split_node.SetExecutionProviderType(nodes_to_fuse[0].get().GetExecutionProviderType()); diff --git a/onnxruntime/core/optimizer/gemm_transpose_fusion.cc b/onnxruntime/core/optimizer/gemm_transpose_fusion.cc index b97cce9c2e785..a52517d23db86 100644 --- a/onnxruntime/core/optimizer/gemm_transpose_fusion.cc +++ b/onnxruntime/core/optimizer/gemm_transpose_fusion.cc @@ -75,7 +75,7 @@ Status GemmTransposeFusion::Apply(Graph& graph, Node& node, RewriteRuleEffect& m nodes_to_remove.push_back(output_node); } - Node& new_gemm_node = graph.AddNode(graph.GenerateNodeName(gemm_node.Name() + "_transformed"), + Node& new_gemm_node = graph.AddNode(graph.GenerateNodeName(gemm_node.Name() + "/GemmTransposeFusion/"), gemm_node.OpType(), "Fused Gemm with Transpose", new_gemm_input_defs, diff --git a/onnxruntime/core/optimizer/layer_norm_fusion.cc b/onnxruntime/core/optimizer/layer_norm_fusion.cc index ce696154adb6d..48edf4854fbbb 100644 --- a/onnxruntime/core/optimizer/layer_norm_fusion.cc +++ b/onnxruntime/core/optimizer/layer_norm_fusion.cc @@ -455,7 +455,7 @@ Status LayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, } InlinedVector layer_norm_input_defs{x_input, scale, bias}; - Node& layer_norm_node = graph.AddNode(graph.GenerateNodeName("LayerNormalization"), + Node& layer_norm_node = graph.AddNode(graph.GenerateNodeName(mul_node.Name() + "/LayerNormFusion/"), "LayerNormalization", "fused LayerNorm subgraphs ", layer_norm_input_defs, @@ -705,7 +705,7 @@ Status SimplifiedLayerNormFusion::ApplyImpl(Graph& graph, bool& modified, int gr InlinedVector layer_norm_input_defs{x_input, scale}; Node& layer_norm_node = - graph.AddNode(graph.GenerateNodeName("SimplifiedLayerNormalization"), "SimplifiedLayerNormalization", + graph.AddNode(graph.GenerateNodeName(mul_node.Name() + "/SimplifiedLayerNormFusion/"), "SimplifiedLayerNormalization", "fused LayerNorm subgraphs ", layer_norm_input_defs, {}, {}, kOnnxDomain); // Get constant "epsilon" from "Add" node if available. Else, default value will be used. 
diff --git a/onnxruntime/core/optimizer/matmul_scale_fusion.cc b/onnxruntime/core/optimizer/matmul_scale_fusion.cc index b04d794cc9469..e4cdeadbf54d7 100644 --- a/onnxruntime/core/optimizer/matmul_scale_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_scale_fusion.cc @@ -245,7 +245,7 @@ Status ProcessNode( } Node& matmul_scale_node = graph.AddNode( - graph.GenerateNodeName(node.Name() + "_FusedMatMulAndScale"), + graph.GenerateNodeName(node.Name() + "/MatMulScaleFusion/"), "FusedMatMul", "Fused MatMul and Scale", fused_node_inputs, diff --git a/onnxruntime/core/optimizer/matmul_transpose_fusion.cc b/onnxruntime/core/optimizer/matmul_transpose_fusion.cc index 789466778edc6..8eb224013618d 100644 --- a/onnxruntime/core/optimizer/matmul_transpose_fusion.cc +++ b/onnxruntime/core/optimizer/matmul_transpose_fusion.cc @@ -154,14 +154,14 @@ static Node* ReorderCastAndTranspose(Graph& graph, Node* cast, const ONNX_NAMESPACE::TensorProto_DataType element_type = static_cast(cast_output->TypeAsProto()->tensor_type().elem_type()); new_cast_output_type_proto.mutable_tensor_type()->set_elem_type(element_type); - auto& new_cast_output = graph.GetOrCreateNodeArg(cast_output->Name() + "_transformed", &new_cast_output_type_proto); + auto& new_cast_output = graph.GetOrCreateNodeArg(cast_output->Name() + "/MatmulTransposeFusion/", &new_cast_output_type_proto); const std::array new_cast_input_defs{transpose_input}; const std::array new_cast_output_defs{&new_cast_output}; const std::array new_transpose_input_defs = {&new_cast_output}; const std::array new_transpose_output_defs = {cast_output}; - Node& new_cast = graph.AddNode(graph.GenerateNodeName(cast->Name() + "_transformed"), + Node& new_cast = graph.AddNode(graph.GenerateNodeName(cast->Name() + "/MatmulTransposeFusion/"), cast->OpType(), "Created a new Cast node to interchange Cast and Transpose nodes", new_cast_input_defs, @@ -385,7 +385,7 @@ Status MatmulTransposeFusion::ApplyImpl(Graph& graph, bool& modified, int graph_ const std::array input_defs{left_input, right_input}; const std::array output_defs{node.MutableOutputDefs()[0]}; - Node& matmul_node = graph.AddNode(graph.GenerateNodeName("MatMul_With_Transpose"), + Node& matmul_node = graph.AddNode(graph.GenerateNodeName(node.Name() + "/MatmulTransposeFusion/"), "FusedMatMul", "fused MatMul and Transpose ", input_defs, diff --git a/onnxruntime/core/optimizer/quick_gelu_fusion.cc b/onnxruntime/core/optimizer/quick_gelu_fusion.cc index 6e5eb5612a701..b09ef1c460b8e 100644 --- a/onnxruntime/core/optimizer/quick_gelu_fusion.cc +++ b/onnxruntime/core/optimizer/quick_gelu_fusion.cc @@ -88,7 +88,7 @@ Status QuickGeluFusion::ApplyImpl(Graph& graph, bool& modified, int graph_level, NodeArg* quick_gelu_output_arg = mul_node.MutableOutputDefs()[0]; Node& quick_gelu_node = - graph.AddNode(graph.GenerateNodeName("QuickGelu"), "QuickGelu", "QuickGelu", std::array{quick_gelu_input_arg}, + graph.AddNode(graph.GenerateNodeName(mul_node.Name() + "/QuickGeluFusion/"), "QuickGelu", "QuickGelu", std::array{quick_gelu_input_arg}, std::array{quick_gelu_output_arg}, {}, kMSDomain); quick_gelu_node.AddAttribute("alpha", alpha); quick_gelu_node.SetExecutionProviderType(node.GetExecutionProviderType()); diff --git a/onnxruntime/test/optimizer/graph_transform_test.cc b/onnxruntime/test/optimizer/graph_transform_test.cc index 97f1feaaa612d..0d1f213618e54 100755 --- a/onnxruntime/test/optimizer/graph_transform_test.cc +++ b/onnxruntime/test/optimizer/graph_transform_test.cc @@ -2724,7 +2724,7 @@ TEST_F(GraphTransformationTests, 
GemmTransposeFusion2OutputsFromTranspose) { auto gemm_node = std::find_if( graph.Nodes().cbegin(), graph.Nodes().cend(), - [](const Node& node) { return node.Name() == "Gemm_transformed"; }); + [](const Node& node) { return node.Name() == "Gemm/GemmTransposeFusion/"; }); auto& node = *gemm_node; ASSERT_TRUE(node.OpType() == "Gemm"); @@ -2760,7 +2760,7 @@ TEST_F(GraphTransformationTests, GemmTransposeFusion2OutputsFromTransposeTo2Gemm auto gemm1_node = std::find_if( graph.Nodes().cbegin(), graph.Nodes().cend(), - [](const Node& node) { return node.Name() == "Gemm1_transformed"; }); + [](const Node& node) { return node.Name() == "Gemm1/GemmTransposeFusion/"; }); auto& node1 = *gemm1_node; ASSERT_TRUE(node1.OpType() == "Gemm"); @@ -2773,7 +2773,7 @@ TEST_F(GraphTransformationTests, GemmTransposeFusion2OutputsFromTransposeTo2Gemm auto gemm2_node = std::find_if( graph.Nodes().cbegin(), graph.Nodes().cend(), - [](const Node& node) { return node.Name() == "Gemm2_transformed"; }); + [](const Node& node) { return node.Name() == "Gemm2/GemmTransposeFusion/"; }); auto& node2 = *gemm2_node; ASSERT_TRUE(node2.OpType() == "Gemm"); diff --git a/orttraining/orttraining/core/optimizer/concat_replacement.cc b/orttraining/orttraining/core/optimizer/concat_replacement.cc index 37d302765cda8..2c919591ec081 100644 --- a/orttraining/orttraining/core/optimizer/concat_replacement.cc +++ b/orttraining/orttraining/core/optimizer/concat_replacement.cc @@ -23,7 +23,7 @@ Status ConcatReplacement::Apply(Graph& graph, Node& concat_node, RewriteRuleEffe concat_outputs.push_back(&ip_shape_op); - Node& concat_training_node = graph.AddNode(graph.GenerateNodeName("ConcatTraining"), + Node& concat_training_node = graph.AddNode(graph.GenerateNodeName(concat_node.Name() + "/ConcatReplacement/"), "ConcatTraining", "Concat with extra output", concat_inputs,