From 6a424ccf8c2f9cd7f191c843547d5f37ef409493 Mon Sep 17 00:00:00 2001
From: Ye Wang <52801275+wangyems@users.noreply.github.com>
Date: Wed, 24 Jan 2024 03:33:49 +0000
Subject: [PATCH 1/3] Fix AMD pipeline test failures (#19250)

### Description
<!-- Describe your changes. -->

Fix amd test failure

### Motivation and Context
<!-- - Why is this change required? What problem does it solve?
- If it fixes an open issue, please link to the issue here. -->
---
 onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu | 5 +++--
 onnxruntime/contrib_ops/rocm/bert/multihead_attention.h  | 1 +
 2 files changed, 4 insertions(+), 2 deletions(-)
diff --git a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu
index 6f98312e4067d..09e7d61b71db9 100644
--- a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu
+++ b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu
@@ -68,6 +68,7 @@ MultiHeadAttention<T>::MultiHeadAttention(const OpKernelInfo& info)
   scale_ = info.GetAttrOrDefault<float>("scale", 0.0f);
 
   past_present_share_buffer_ = info.GetAttrOrDefault<int64_t>("past_present_share_buffer", 0LL) != 0LL;
+  is_unidirectional_ = info.GetAttrOrDefault<int64_t>("unidirectional", 0) == 1;
 
   using HipT = typename ToHipType<T>::MappedType;
   using AttentionTunableOp = GemmSoftmaxGemmPermuteTunableOp<HipT>;
@@ -121,8 +122,8 @@ Status MultiHeadAttention<T>::ComputeInternal(OpKernelContext* context) const {
           query, key, value, bias,
           key_padding_mask, relative_position_bias,
           past_key, past_value, past_seq_len,
-          &attn,
-          num_heads_, mask_filter_value_, scale_,
+          &attn, num_heads_, 
+          mask_filter_value_, scale_, false, /*is_unidirectional_*/ 
           past_present_share_buffer_, false, device_prop.maxThreadsPerBlock));
 
   if (attn_type_ == kDecoderMaskedMultiHeadAttention && attn.sequence_length != 1) {
diff --git a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h
index 84d8b76bbfebe..1d676d7a7bcac 100644
--- a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h
+++ b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h
@@ -25,6 +25,7 @@ class MultiHeadAttention final : public RocmKernel {
   float mask_filter_value_;
   float scale_;
   bool past_present_share_buffer_{false};
+  bool is_unidirectional_{false};
 
   // type-erased GemmSoftmaxGemmPermuteTunableOp<HipT>, the reason for this is:
   //   1. We don't want to include the cuh file where GemmSoftmaxGemmPermuteTunableOp<HipT> is defined.

From c10be1848cafa7575ba298cbcc01e89dcd841851 Mon Sep 17 00:00:00 2001
From: Chi Lo <54722500+chilo-ms@users.noreply.github.com>
Date: Tue, 23 Jan 2024 21:30:22 -0800
Subject: [PATCH 2/3] [TensorRT EP] Avoid calling unavailable function with cpu
 python package (#19251)

C.register_tensorrt_plugins_as_custom_ops() is only available in gpu
python package.
Add condition to avoid calling it in cpu python package.
---
 .../python/onnxruntime_inference_collection.py       | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py
index 1a3e22142f80e..09f768f53ea65 100644
--- a/onnxruntime/python/onnxruntime_inference_collection.py
+++ b/onnxruntime/python/onnxruntime_inference_collection.py
@@ -466,7 +466,7 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi
 
         session_options = self._sess_options if self._sess_options else C.get_default_session_options()
 
-        self._register_ep_custom_ops(session_options, providers, provider_options)
+        self._register_ep_custom_ops(session_options, providers, provider_options, available_providers)
 
         if self._model_path:
             sess = C.InferenceSession(session_options, self._model_path, True, self._read_config_from_model)
@@ -510,11 +510,15 @@ def _reset_session(self, providers, provider_options):
         self._sess_options = self._sess_options_initial
         self._create_inference_session(providers, provider_options)
 
-    def _register_ep_custom_ops(self, session_options, providers, provider_options):
+    def _register_ep_custom_ops(self, session_options, providers, provider_options, available_providers):
         for i in range(len(providers)):
-            if providers[i] == "TensorrtExecutionProvider":
+            if providers[i] in available_providers and providers[i] == "TensorrtExecutionProvider":
                 C.register_tensorrt_plugins_as_custom_ops(session_options, provider_options[i])
-            elif isinstance(providers[i], tuple) and providers[i][0] == "TensorrtExecutionProvider":
+            elif (
+                isinstance(providers[i], tuple)
+                and providers[i][0] in available_providers
+                and providers[i][0] == "TensorrtExecutionProvider"
+            ):
                 C.register_tensorrt_plugins_as_custom_ops(session_options, providers[i][1])
 
 

From d7aebf9ea8a4a651088384f219292bae9062439b Mon Sep 17 00:00:00 2001
From: Yi Zhang <zhanyi@microsoft.com>
Date: Wed, 24 Jan 2024 14:15:07 +0800
Subject: [PATCH 3/3] Move Nuget Test from T4 to A10 to reduce release duration
 (#19253)

### Description
<!-- Describe your changes. -->



### Motivation and Context
Running release process is very painful and boring because some GPU jobs
have to wait so long time.

![image](https://github.com/microsoft/onnxruntime/assets/16190118/1c5c981e-68d4-4678-9758-443fbf362802)

![image](https://github.com/microsoft/onnxruntime/assets/16190118/ba0d79ba-1554-4c7a-93dd-6ea8144c9295)

![image](https://github.com/microsoft/onnxruntime/assets/16190118/36cab833-71c1-4ff5-bca5-f4caa9aee0c9)
On the one hand, we could move some T4 from PR process since some jobs
are not using T4 any more and on the other hand, we can continue to
change some jobs' agent from T4 to A4 too.

In the future, T4 will mainly be used for the scenarioes that big GPU
memory is needed, multiple GPU cards or some special cases.


Test runs:

https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=401786&view=logs&j=8048494c-e6eb-5e47-5e87-ff0aa863325d

cc @YUNQIUGUO @snnn
---
 .../c-api-noopenmp-packaging-pipelines.yml                | 8 ++++----
 .../github/azure-pipelines/cuda-packaging-pipeline.yml    | 8 ++++----
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
index aa1a75bfcda45..5a50a9964bead 100644
--- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
+++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml
@@ -1023,7 +1023,7 @@ stages:
 
 - template: nuget/templates/test_win.yml
   parameters:
-    AgentPool : 'onnxruntime-Win2022-GPU-T4'
+    AgentPool : 'onnxruntime-Win2022-GPU-A10'
     NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu'
     ArtifactSuffix: 'GPU'
     StageSuffix: 'GPU'
@@ -1034,7 +1034,7 @@ stages:
 
 - template: nuget/templates/test_win.yml
   parameters:
-    AgentPool : 'onnxruntime-Win2022-GPU-T4'
+    AgentPool : 'onnxruntime-Win2022-GPU-A10'
     NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu.Windows'
     ArtifactSuffix: 'GPU'
     StageSuffix: 'GPU'
@@ -1046,7 +1046,7 @@ stages:
 
 - template: nuget/templates/test_linux.yml
   parameters:
-    AgentPool : Onnxruntime-Linux-GPU
+    AgentPool : Onnxruntime-Linux-GPU-A10
     ArtifactSuffix: 'GPU'
     StageSuffix: 'GPU'
     NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu'
@@ -1055,7 +1055,7 @@ stages:
 
 - template: nuget/templates/test_linux.yml
   parameters:
-    AgentPool : Onnxruntime-Linux-GPU
+    AgentPool : Onnxruntime-Linux-GPU-A10
     ArtifactSuffix: 'GPU'
     StageSuffix: 'GPU'
     MoreSuffix: '_Linux'
diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
index 1d2ba88652f48..0c24d4897ddf1 100644
--- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
+++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml
@@ -151,7 +151,7 @@ stages:
   # Testing
   - template: nuget/templates/test_win.yml
     parameters:
-      AgentPool : 'onnxruntime-Win2022-GPU-T4'
+      AgentPool : 'onnxruntime-Win2022-GPU-A10'
       NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu'
       ArtifactSuffix: 'GPU'
       StageSuffix: 'GPU'
@@ -162,7 +162,7 @@ stages:
 
   - template: nuget/templates/test_win.yml
     parameters:
-      AgentPool : 'onnxruntime-Win2022-GPU-T4'
+      AgentPool : 'onnxruntime-Win2022-GPU-A10'
       NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu.Windows'
       ArtifactSuffix: 'GPU'
       StageSuffix: 'GPU'
@@ -174,7 +174,7 @@ stages:
 
   - template: nuget/templates/test_linux.yml
     parameters:
-      AgentPool : Onnxruntime-Linux-GPU
+      AgentPool : Onnxruntime-Linux-GPU-A10
       ArtifactSuffix: 'GPU'
       StageSuffix: 'GPU'
       NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu'
@@ -184,7 +184,7 @@ stages:
 
   - template: nuget/templates/test_linux.yml
     parameters:
-      AgentPool : Onnxruntime-Linux-GPU
+      AgentPool : Onnxruntime-Linux-GPU-A10
       ArtifactSuffix: 'GPU'
       StageSuffix: 'GPU'
       MoreSuffix: '_Linux'