From 6a424ccf8c2f9cd7f191c843547d5f37ef409493 Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> Date: Wed, 24 Jan 2024 03:33:49 +0000 Subject: [PATCH 1/3] Fix AMD pipeline test failures (#19250) ### Description Fix amd test failure ### Motivation and Context --- onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu | 5 +++-- onnxruntime/contrib_ops/rocm/bert/multihead_attention.h | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu index 6f98312e4067d..09e7d61b71db9 100644 --- a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu +++ b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.cu @@ -68,6 +68,7 @@ MultiHeadAttention::MultiHeadAttention(const OpKernelInfo& info) scale_ = info.GetAttrOrDefault("scale", 0.0f); past_present_share_buffer_ = info.GetAttrOrDefault("past_present_share_buffer", 0LL) != 0LL; + is_unidirectional_ = info.GetAttrOrDefault("unidirectional", 0) == 1; using HipT = typename ToHipType::MappedType; using AttentionTunableOp = GemmSoftmaxGemmPermuteTunableOp; @@ -121,8 +122,8 @@ Status MultiHeadAttention::ComputeInternal(OpKernelContext* context) const { query, key, value, bias, key_padding_mask, relative_position_bias, past_key, past_value, past_seq_len, - &attn, - num_heads_, mask_filter_value_, scale_, + &attn, num_heads_, + mask_filter_value_, scale_, false, /*is_unidirectional_*/ past_present_share_buffer_, false, device_prop.maxThreadsPerBlock)); if (attn_type_ == kDecoderMaskedMultiHeadAttention && attn.sequence_length != 1) { diff --git a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h index 84d8b76bbfebe..1d676d7a7bcac 100644 --- a/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h +++ b/onnxruntime/contrib_ops/rocm/bert/multihead_attention.h @@ -25,6 +25,7 @@ class MultiHeadAttention final : 
public RocmKernel { float mask_filter_value_; float scale_; bool past_present_share_buffer_{false}; + bool is_unidirectional_{false}; // type-erased GemmSoftmaxGemmPermuteTunableOp, the reason for this is: // 1. We don't want to include the cuh file where GemmSoftmaxGemmPermuteTunableOp is defined. From c10be1848cafa7575ba298cbcc01e89dcd841851 Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Tue, 23 Jan 2024 21:30:22 -0800 Subject: [PATCH 2/3] [TensorRT EP] Avoid calling unavailable function with cpu python package (#19251) C.register_tensorrt_plugins_as_custom_ops() is only available in gpu python package. Add condition to avoid calling it in cpu python package. --- .../python/onnxruntime_inference_collection.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/onnxruntime/python/onnxruntime_inference_collection.py b/onnxruntime/python/onnxruntime_inference_collection.py index 1a3e22142f80e..09f768f53ea65 100644 --- a/onnxruntime/python/onnxruntime_inference_collection.py +++ b/onnxruntime/python/onnxruntime_inference_collection.py @@ -466,7 +466,7 @@ def _create_inference_session(self, providers, provider_options, disabled_optimi session_options = self._sess_options if self._sess_options else C.get_default_session_options() - self._register_ep_custom_ops(session_options, providers, provider_options) + self._register_ep_custom_ops(session_options, providers, provider_options, available_providers) if self._model_path: sess = C.InferenceSession(session_options, self._model_path, True, self._read_config_from_model) @@ -510,11 +510,15 @@ def _reset_session(self, providers, provider_options): self._sess_options = self._sess_options_initial self._create_inference_session(providers, provider_options) - def _register_ep_custom_ops(self, session_options, providers, provider_options): + def _register_ep_custom_ops(self, session_options, providers, provider_options, available_providers): for i in 
range(len(providers)): - if providers[i] == "TensorrtExecutionProvider": + if providers[i] in available_providers and providers[i] == "TensorrtExecutionProvider": C.register_tensorrt_plugins_as_custom_ops(session_options, provider_options[i]) - elif isinstance(providers[i], tuple) and providers[i][0] == "TensorrtExecutionProvider": + elif ( + isinstance(providers[i], tuple) + and providers[i][0] in available_providers + and providers[i][0] == "TensorrtExecutionProvider" + ): C.register_tensorrt_plugins_as_custom_ops(session_options, providers[i][1]) From d7aebf9ea8a4a651088384f219292bae9062439b Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 24 Jan 2024 14:15:07 +0800 Subject: [PATCH 3/3] Move Nuget Test from T4 to A10 to reduce release duration (#19253) ### Description ### Motivation and Context Running the release process is very painful and boring because some GPU jobs have to wait for a very long time. ![image](https://github.com/microsoft/onnxruntime/assets/16190118/1c5c981e-68d4-4678-9758-443fbf362802) ![image](https://github.com/microsoft/onnxruntime/assets/16190118/ba0d79ba-1554-4c7a-93dd-6ea8144c9295) ![image](https://github.com/microsoft/onnxruntime/assets/16190118/36cab833-71c1-4ff5-bca5-f4caa9aee0c9) On the one hand, we could move some T4 agents out of the PR process, since some jobs are no longer using T4; on the other hand, we can continue to switch some jobs' agents from T4 to A10 too. In the future, T4 will mainly be used for scenarios that need large GPU memory or multiple GPU cards, and for some special cases.
Test runs: https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=401786&view=logs&j=8048494c-e6eb-5e47-5e87-ff0aa863325d cc @YUNQIUGUO @snnn --- .../c-api-noopenmp-packaging-pipelines.yml | 8 ++++---- .../github/azure-pipelines/cuda-packaging-pipeline.yml | 8 ++++---- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index aa1a75bfcda45..5a50a9964bead 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -1023,7 +1023,7 @@ stages: - template: nuget/templates/test_win.yml parameters: - AgentPool : 'onnxruntime-Win2022-GPU-T4' + AgentPool : 'onnxruntime-Win2022-GPU-A10' NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu' ArtifactSuffix: 'GPU' StageSuffix: 'GPU' @@ -1034,7 +1034,7 @@ stages: - template: nuget/templates/test_win.yml parameters: - AgentPool : 'onnxruntime-Win2022-GPU-T4' + AgentPool : 'onnxruntime-Win2022-GPU-A10' NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu.Windows' ArtifactSuffix: 'GPU' StageSuffix: 'GPU' @@ -1046,7 +1046,7 @@ stages: - template: nuget/templates/test_linux.yml parameters: - AgentPool : Onnxruntime-Linux-GPU + AgentPool : Onnxruntime-Linux-GPU-A10 ArtifactSuffix: 'GPU' StageSuffix: 'GPU' NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu' @@ -1055,7 +1055,7 @@ stages: - template: nuget/templates/test_linux.yml parameters: - AgentPool : Onnxruntime-Linux-GPU + AgentPool : Onnxruntime-Linux-GPU-A10 ArtifactSuffix: 'GPU' StageSuffix: 'GPU' MoreSuffix: '_Linux' diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index 1d2ba88652f48..0c24d4897ddf1 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ 
b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -151,7 +151,7 @@ stages: # Testing - template: nuget/templates/test_win.yml parameters: - AgentPool : 'onnxruntime-Win2022-GPU-T4' + AgentPool : 'onnxruntime-Win2022-GPU-A10' NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu' ArtifactSuffix: 'GPU' StageSuffix: 'GPU' @@ -162,7 +162,7 @@ stages: - template: nuget/templates/test_win.yml parameters: - AgentPool : 'onnxruntime-Win2022-GPU-T4' + AgentPool : 'onnxruntime-Win2022-GPU-A10' NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu.Windows' ArtifactSuffix: 'GPU' StageSuffix: 'GPU' @@ -174,7 +174,7 @@ stages: - template: nuget/templates/test_linux.yml parameters: - AgentPool : Onnxruntime-Linux-GPU + AgentPool : Onnxruntime-Linux-GPU-A10 ArtifactSuffix: 'GPU' StageSuffix: 'GPU' NugetPackageName : 'Microsoft.ML.OnnxRuntime.Gpu' @@ -184,7 +184,7 @@ stages: - template: nuget/templates/test_linux.yml parameters: - AgentPool : Onnxruntime-Linux-GPU + AgentPool : Onnxruntime-Linux-GPU-A10 ArtifactSuffix: 'GPU' StageSuffix: 'GPU' MoreSuffix: '_Linux'