From 54bd87b3e4d97db8ac1f58d9e9095f5d79b4a7f0 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 7 Oct 2024 17:34:02 -0700 Subject: [PATCH 01/65] rename cuda to gpu --- .../c-api-noopenmp-packaging-pipelines.yml | 170 +----------------- .../cuda-packaging-pipeline.yml | 2 +- .../nuget-cuda-publishing-pipeline.yml | 2 +- ...-stage.yml => nuget-combine-gpu-stage.yml} | 4 +- ...tage.yml => nuget-gpu-packaging-stage.yml} | 0 ...age.yml => nuget-gpu-publishing-stage.yml} | 0 ....yml => nuget-win-gpu-packaging-stage.yml} | 0 7 files changed, 6 insertions(+), 172 deletions(-) rename tools/ci_build/github/azure-pipelines/stages/{nuget-combine-cuda-stage.yml => nuget-combine-gpu-stage.yml} (96%) rename tools/ci_build/github/azure-pipelines/stages/{nuget-cuda-packaging-stage.yml => nuget-gpu-packaging-stage.yml} (100%) rename tools/ci_build/github/azure-pipelines/stages/{nuget-cuda-publishing-stage.yml => nuget-gpu-publishing-stage.yml} (100%) rename tools/ci_build/github/azure-pipelines/stages/{nuget-win-cuda-packaging-stage.yml => nuget-win-gpu-packaging-stage.yml} (100%) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index e2d977bd60986..d06bfb3710c10 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -118,7 +118,7 @@ stages: SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} -- template: stages/nuget-combine-cuda-stage.yml +- template: stages/nuget-combine-gpu-stage.yml parameters: DoCompliance: ${{ parameters.DoCompliance }} CudaVersion: 11.8 @@ -131,170 +131,4 @@ stages: buildJava: true buildNodejs: true SpecificArtifact: ${{ parameters.SpecificArtifact }} - BuildId: ${{ parameters.BuildId }} - - -- template: nuget/templates/dml-vs-2022.yml - parameters: - AgentPool: 
'onnxruntime-Win2022-GPU-dml-A10' - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - ArtifactName: 'drop-nuget-dml' - StageName: 'Windows_CI_GPU_DML_Dev' - BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --build_nodejs --cmake_generator "Visual Studio 17 2022" - BuildArch: 'x64' - msbuildArchitecture: 'amd64' - EnvSetupScript: 'setup_env.bat' - sln_platform: 'x64' - DoDebugBuild: 'false' - DoNugetPack: 'true' - DoCompliance: 'false' - DoEsrp: ${{ parameters.DoEsrp }} - NuPackScript: | - msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} /p:CurrentData=$(BuildDate) /p:CurrentTime=$(BuildTime) - copy $(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\*.nupkg $(Build.ArtifactStagingDirectory) - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\*.nupkg $(Build.ArtifactStagingDirectory) - mkdir $(Build.ArtifactStagingDirectory)\testdata - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata - -- template: nuget/templates/dml-vs-2022.yml - parameters: - AgentPool: 'onnxruntime-Win2022-GPU-dml-A10' - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - ArtifactName: 'drop-win-dml-x86-zip' - StageName: 'Windows_CI_GPU_DML_Dev_x86' - BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" - BuildArch: 'x86' - EnvSetupScript: 'setup_env_x86.bat' - sln_platform: 'Win32' - DoDebugBuild: 'false' - DoNugetPack: 'true' - DoCompliance: ${{ parameters.DoCompliance }} - DoEsrp: ${{ parameters.DoEsrp }} - RunTests: 'false' - NuPackScript: | - 
msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=x86 /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} - cd $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\ - ren Microsoft.ML.OnnxRuntime.DirectML.* win-dml-x86.zip - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\win-dml-x86.zip $(Build.ArtifactStagingDirectory) - mkdir $(Build.ArtifactStagingDirectory)\testdata - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata - -- template: nuget/templates/dml-vs-2022.yml - parameters: - AgentPool: 'onnxruntime-Win2022-GPU-dml-A10' - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - ArtifactName: 'drop-win-dml-arm64-zip' - StageName: 'Windows_CI_GPU_DML_Dev_arm64' - BuildCommand: --build_dir $(Build.BinariesDirectory) --arm64 --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --build_nodejs --cmake_generator "Visual Studio 17 2022" - BuildArch: 'x64' - EnvSetupScript: 'setup_env.bat' - sln_platform: 'arm64' - DoDebugBuild: 'false' - DoNugetPack: 'true' - DoCompliance: ${{ parameters.DoCompliance }} - DoEsrp: ${{ parameters.DoEsrp }} - RunTests: 'false' - NuPackScript: | - msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=arm64 /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} - cd $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\ - ren Microsoft.ML.OnnxRuntime.DirectML.* win-dml-arm64.zip - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\win-dml-arm64.zip $(Build.ArtifactStagingDirectory) - mkdir $(Build.ArtifactStagingDirectory)\testdata - copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* 
$(Build.ArtifactStagingDirectory)\testdata - -- stage: NuGet_Packaging_DML - dependsOn: - - Windows_CI_GPU_DML_Dev - - Windows_CI_GPU_DML_Dev_x86 - - Windows_CI_GPU_DML_Dev_arm64 - condition: succeeded() - jobs: - - job: NuGet_Packaging_DML - workspace: - clean: all - pool: 'onnxruntime-Win2022-GPU-dml-A10' - steps: - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML' - inputs: - artifactName: 'drop-nuget-dml' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML x86' - inputs: - artifactName: 'drop-win-dml-x86-zip' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet DirectML arm64' - inputs: - artifactName: 'drop-win-dml-arm64-zip' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' - - - script: | - pushd $(Build.BinariesDirectory)\nuget-artifact-dml - dir - powershell -Command "Invoke-WebRequest http://stahlworks.com/dev/unzip.exe -OutFile unzip.exe" - powershell -Command "Invoke-WebRequest http://stahlworks.com/dev/zip.exe -OutFile zip.exe" - set PATH=%CD%;%PATH% - SETLOCAL EnableDelayedExpansion - FOR /R %%i IN (*.nupkg) do ( - set filename=%%~ni - IF NOT "!filename:~25,7!"=="Managed" ( - rename %%~ni.nupkg %%~ni.zip - unzip %%~ni.zip -d %%~ni - del /Q %%~ni.zip - - unzip win-dml-x86.zip -d win-x86 - mkdir %%~ni\runtimes\win-x86 - mkdir %%~ni\runtimes\win-x86\native - - move win-x86\runtimes\win-x86\native\onnxruntime.dll %%~ni\runtimes\win-x86\native\onnxruntime.dll - move win-x86\runtimes\win-x86\native\onnxruntime.lib %%~ni\runtimes\win-x86\native\onnxruntime.lib - move win-x86\runtimes\win-x86\native\onnxruntime.pdb %%~ni\runtimes\win-x86\native\onnxruntime.pdb 
- - unzip win-dml-arm64.zip -d win-arm64 - mkdir %%~ni\runtimes\win-arm64 - mkdir %%~ni\runtimes\win-arm64\native - - move win-arm64\runtimes\win-arm64\native\onnxruntime.dll %%~ni\runtimes\win-arm64\native\onnxruntime.dll - move win-arm64\runtimes\win-arm64\native\onnxruntime.lib %%~ni\runtimes\win-arm64\native\onnxruntime.lib - move win-arm64\runtimes\win-arm64\native\onnxruntime.pdb %%~ni\runtimes\win-arm64\native\onnxruntime.pdb - - - pushd %%~ni - zip -r ..\%%~ni.zip . - popd - move %%~ni.zip %%~ni.nupkg - ) - ) - popd - copy $(Build.BinariesDirectory)\nuget-artifact-dml\Microsoft.ML.OnnxRuntime.DirectML*nupkg $(Build.ArtifactStagingDirectory) - displayName: 'Bundle DML NuGet and other binaries' - - - template: templates/esrp_nuget.yml - parameters: - DisplayName: 'ESRP - sign NuGet package' - FolderPath: '$(Build.ArtifactStagingDirectory)' - DoEsrp: ${{ parameters.DoEsrp }} - - - template: templates/validate-package.yml - parameters: - PackageType: 'nuget' - PackagePath: '$(Build.ArtifactStagingDirectory)' - PackageName: 'Microsoft.ML.OnnxRuntime.DirectML*nupkg' - PlatformsSupported: 'win-x64,win-x86,win-arm64' - VerifyNugetSigning: ${{ parameters.DoEsrp }} - - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline NuGet Artifact' - inputs: - artifactName: 'drop-signed-nuget-dml' - targetPath: '$(Build.ArtifactStagingDirectory)' - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' + BuildId: ${{ parameters.BuildId }} \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index 7118e85e9ea4b..6a245eed44828 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -105,7 +105,7 @@ stages: PackageNodeJS: false # Nuget Packaging - - template: stages/nuget-combine-cuda-stage.yml + 
- template: stages/nuget-combine-gpu-stage.yml parameters: DoCompliance: ${{ parameters.DoCompliance }} CudaVersion: ${{ parameters.CudaVersion }} diff --git a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml index aeb250e1e0cbc..0e17bef4c8f73 100644 --- a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml @@ -22,7 +22,7 @@ variables: value: onnxruntime-cuda-12 stages: - - template: stages/nuget-cuda-publishing-stage.yml + - template: stages/nuget-gpu-publishing-stage.yml parameters: artifact_feed: $(ArtifactFeed) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-gpu-stage.yml similarity index 96% rename from tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml rename to tools/ci_build/github/azure-pipelines/stages/nuget-combine-gpu-stage.yml index 9c7fbc24ab1b6..f10d700e7d1d9 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-gpu-stage.yml @@ -42,7 +42,7 @@ stages: buildJava: ${{ parameters.buildJava }} buildNodejs: ${{ parameters.buildNodejs }} -- template: nuget-win-cuda-packaging-stage.yml +- template: nuget-win-gpu-packaging-stage.yml parameters: RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} @@ -51,7 +51,7 @@ stages: win_cuda_home: ${{ parameters.win_cuda_home }} buildJava: ${{ parameters.buildJava }} -- template: nuget-cuda-packaging-stage.yml +- template: nuget-gpu-packaging-stage.yml parameters: DoCompliance: ${{ parameters.DoCompliance }} DoEsrp: ${{ parameters.DoEsrp }} diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml 
b/tools/ci_build/github/azure-pipelines/stages/nuget-gpu-packaging-stage.yml similarity index 100% rename from tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml rename to tools/ci_build/github/azure-pipelines/stages/nuget-gpu-packaging-stage.yml diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-publishing-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-gpu-publishing-stage.yml similarity index 100% rename from tools/ci_build/github/azure-pipelines/stages/nuget-cuda-publishing-stage.yml rename to tools/ci_build/github/azure-pipelines/stages/nuget-gpu-publishing-stage.yml diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml similarity index 100% rename from tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml rename to tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml From df06c82c8282e77f78540b4f46dd9c13ee56d3a9 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 7 Oct 2024 17:34:36 -0700 Subject: [PATCH 02/65] rename cuda to gpu --- tools/ci_build/github/azure-pipelines/publish-nuget.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/publish-nuget.yml b/tools/ci_build/github/azure-pipelines/publish-nuget.yml index b78d586288ba3..42ead98209505 100644 --- a/tools/ci_build/github/azure-pipelines/publish-nuget.yml +++ b/tools/ci_build/github/azure-pipelines/publish-nuget.yml @@ -33,7 +33,7 @@ stages: - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-dml\*" $(Build.BinariesDirectory)\nuget-artifact\final-package # Publish CUDA 11 Nuget/Java pkgs to ADO feed - - template: stages/nuget-cuda-publishing-stage.yml + - template: stages/nuget-gpu-publishing-stage.yml parameters: artifact_feed: $(ArtifactFeed) From 984e123e8df6e69f2dbb62b958d29b8c80305d3b Mon Sep 17 00:00:00 
2001 From: Jian Chen Date: Thu, 10 Oct 2024 17:01:14 -0700 Subject: [PATCH 03/65] Use Cuda with use dml --- .../azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml index 445066f08995a..bf5fc661fd70c 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml @@ -50,7 +50,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu From e5315a479e20dc971c34afb1eab0cd0b2200c0f6 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Thu, 10 Oct 2024 17:06:12 -0700 Subject: [PATCH 04/65] revert c-api-noopenmp-packaging-pipelines.yml --- .../c-api-noopenmp-packaging-pipelines.yml | 595 +++++++++++++++++- 1 file changed, 592 insertions(+), 3 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index d06bfb3710c10..a24e40c1957ea 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -62,7 +62,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.26.0.240828 + 
default: 2.22.0.240425 resources: repositories: @@ -83,7 +83,7 @@ variables: value: 11.8 - name: win_trt_home - value: $(Agent.TempDirectory)\TensorRT-10.4.0.26.Windows10.x86_64.cuda-11.8 + value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 - name: win_cuda_home value: $(Agent.TempDirectory)\v11.8 @@ -94,6 +94,28 @@ stages: PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} +- stage: Debug + dependsOn: Setup + jobs: + - job: D1 + pool: + name: 'onnxruntime-Ubuntu2204-AMD-CPU' + variables: + MyVar: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] + BuildDate: $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Date.BuildDate']] + BuildTime: $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Time.BuildTime']] + steps: + - checkout: none + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + - bash: echo $(MyVar) + - bash: echo $(BuildTime) + - bash: echo $(BuildDate) + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' + - template: stages/download-java-tools-stage.yml - template: templates/c-api-cpu.yml @@ -112,6 +134,17 @@ stages: SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} +- template: templates/ondevice-training-cpu-packaging-pipeline.yml + parameters: + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Training' + AdditionalBuildFlags: '--enable_training_apis' + AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos' + BuildVariant: 'default' + - template: 
stages/java-cuda-packaging-stage.yml parameters: CudaVersion: 11.8 @@ -122,6 +155,7 @@ stages: parameters: DoCompliance: ${{ parameters.DoCompliance }} CudaVersion: 11.8 + docker_base_image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubi8' RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} win_trt_home: ${{ variables.win_trt_home }} @@ -131,4 +165,559 @@ stages: buildJava: true buildNodejs: true SpecificArtifact: ${{ parameters.SpecificArtifact }} - BuildId: ${{ parameters.BuildId }} \ No newline at end of file + BuildId: ${{ parameters.BuildId }} + +# ROCm +- stage: Linux_C_API_Packaging_ROCm_x64 + dependsOn: [] + jobs: + - job: Linux_C_API_Packaging_ROCm_x64 + workspace: + clean: all + timeoutInMinutes: 120 + pool: onnxruntime-Ubuntu2204-AMD-CPU + variables: + RocmVersion: '5.6' + steps: + - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime + submodules: recursive + - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux, for get-docker-image-steps.yml + submodules: false + + # get-docker-image-steps.yml will move the $(Build.SourcesDirectory)/manylinux into $(Build.SourcesDirectory)/onnxruntime, + # then rename $(Build.SourcesDirectory)/onnxruntime as $(Build.SourcesDirectory) + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: >- + --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur + --build-arg BUILD_UID=$(id -u) + --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 + --build-arg ROCM_VERSION=$(RocmVersion) + --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root + --build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin: + --build-arg 
LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib + Repository: onnxruntimetrainingrocmbuild-rocm$(RocmVersion) + CheckOutManyLinux: true + + - template: templates/set-version-number-variables-step.yml + + - task: Bash@3 + displayName: 'Build' + inputs: + targetType: filePath + filePath: tools/ci_build/github/linux/build_rocm_c_api_package.sh + arguments: >- + -S $(Build.SourcesDirectory) + -B $(Build.BinariesDirectory) + -V $(RocmVersion) + -I onnxruntimetrainingrocmbuild-rocm$(RocmVersion) + -P python3.10 + + - script: | + set -e -x + mkdir $(Build.ArtifactStagingDirectory)/testdata + cp $(Build.BinariesDirectory)/Release/libcustom_op_library.so* $(Build.ArtifactStagingDirectory)/testdata + ls -al $(Build.ArtifactStagingDirectory) + displayName: 'Create Artifacts for CustomOp' # libcustom_op_library.so from cpu build is built with fp8, ROCm does not support it. + + - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml + parameters: + buildConfig: 'Release' + artifactName: 'onnxruntime-linux-x64-rocm-$(OnnxRuntimeVersion)' + artifactNameNoVersionString: 'onnxruntime-linux-x64-rocm' + libraryName: 'libonnxruntime.so.$(OnnxRuntimeVersion)' + + - template: templates/component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + - template: templates/clean-agent-build-directory-step.yml + + +- stage: NuGet_Packaging_ROCm + dependsOn: + - Setup + - Linux_C_API_Packaging_ROCm_x64 + condition: succeeded() + jobs: + - job: NuGet_Packaging_ROCm + workspace: + clean: all + # we need to use the 2022 pool to create the nuget package with both pre-net6+Xamarin and net6 targets. 
+ # VS2019 has no support for net6 and we need to use msbuild (from the VS install) to do the packing + pool: 'Onnxruntime-Win-CPU-2022' + variables: + breakCodesignValidationInjection: ${{ parameters.DoEsrp }} + ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] + BuildDate : $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Date.BuildDate']] + BuildTime : $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Time.BuildTime']] + + steps: + - checkout: self + submodules: true + fetchDepth: 1 + + - template: templates/flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Pipeline Artifact - NuGet' + ArtifactName: 'onnxruntime-linux-x64-rocm' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - task: PowerShell@2 + displayName: 'Reconstruct Build Directory' + inputs: + targetType: inline + script: | + Get-ChildItem $(Build.BinariesDirectory)\nuget-artifact -Filter *.tgz | % { + # *.tar will be created after *.tgz is extracted + $cmd = "7z.exe x $($_.FullName) -y -o$(Build.BinariesDirectory)\nuget-artifact" + Write-Output $cmd + Invoke-Expression -Command $cmd + } + + Get-ChildItem $(Build.BinariesDirectory)\nuget-artifact -Filter *.tar | % { + $cmd = "7z.exe x $($_.FullName) -y -o$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" + Write-Output $cmd + Invoke-Expression -Command $cmd + } + + $ort_dirs = Get-ChildItem -Path $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-* -Directory + foreach ($ort_dir in $ort_dirs) + { + $dirname = Split-Path -Path $ort_dir -Leaf + $dirname = $dirname.SubString(0, $dirname.LastIndexOf('-')) + Write-Output "Renaming $ort_dir to $dirname" + Rename-Item -Path $ort_dir -NewName $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\$dirname + } + + Copy-Item 
-Path $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-linux-x64-rocm\lib\* -Destination $(Build.BinariesDirectory)\RelWithDebInfo + + - script: | + tree /F + workingDirectory: '$(Build.BinariesDirectory)' + displayName: 'Inspect Build Binaries Directory' + + - script: | + mklink /D /J models C:\local\models + workingDirectory: '$(Build.BinariesDirectory)' + displayName: 'Create models link' + + - task: NuGetToolInstaller@0 + displayName: Use Nuget 6.2.1 + inputs: + versionSpec: 6.2.1 + + - task: PowerShell@2 + displayName: Build .NET 6 targets using dotnet + inputs: + targetType: 'inline' + # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path + # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ + # which is inconsistent with the msbuild output path for the pre-.net6 targets + # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 + # and makes it harder to do the packing + # + # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. 
+ script: | + dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj ` + -p:SelectedTargets=Net6 ` + /p:Net6Targets=net6.0 ` + -p:Configuration=RelWithDebInfo ` + -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" ` + -p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm" ` + -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} ` + -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + platform: 'Any CPU' + configuration: RelWithDebInfo + msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm"' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Build C# for pre-.net6 targets' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + configuration: RelWithDebInfo + platform: 'Any CPU' + msbuildArguments: > + -p:SelectedTargets=PreNet6 + -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" + -p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm" + -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} + -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) + -p:IsLinuxBuild=true + -p:IsWindowsBuild=false + -p:IsMacOSBuild=false + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - template: templates/win-esrp-dll.yml + parameters: + FolderPath: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' + DisplayName: 'ESRP - Sign C# dlls' + DoEsrp: ${{ parameters.DoEsrp }} + + - task: MSBuild@1 + displayName: Update projects.assets.json with combined list of all target frameworks + inputs: + solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' + platform: 'Any CPU' + configuration: RelWithDebInfo + msbuildArguments: 
'-t:restore -p:SelectedTargets=All -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Build Nuget Packages' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj' + configuration: RelWithDebInfo + platform: 'Any CPU' + msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) -p:CurrentTime=$(BuildTime) -p:CurrentDate=$(BuildDate)' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: CopyFiles@2 + displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' + Contents: '*.snupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: CopyFiles@2 + displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' + Contents: '*.nupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: CopyFiles@2 + displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' + Contents: '*.nupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - template: templates/esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)' + DoEsrp: ${{ parameters.DoEsrp }} + + - template: templates/validate-package.yml + parameters: + PackageType: 'nuget' + PackagePath: '$(Build.ArtifactStagingDirectory)' + PackageName: 'Microsoft.ML.OnnxRuntime.*nupkg' + PlatformsSupported: 'linux-x64' + VerifyNugetSigning: false + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline NuGet Artifact' 
+ inputs: + artifactName: 'drop-signed-nuget-ROCm' + targetPath: '$(Build.ArtifactStagingDirectory)' + + - task: MSBuild@1 + displayName: 'Clean C#' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + platform: 'Any CPU' + configuration: RelWithDebInfo + msbuildArguments: '-t:Clean -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: RoslynAnalyzers@2 + displayName: 'Run Roslyn Analyzers' + inputs: + userProvideBuildInfo: msBuildInfo + msBuildCommandline: > + "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\msbuild.exe" + $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln + -p:configuration="RelWithDebInfo" + -p:Platform="Any CPU" + -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" + -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm + -p:IsLinuxBuild=true + -p:IsWindowsBuild=false + -p:IsMacOSBuild=false + condition: and(succeeded(), eq('${{ parameters.DoCompliance }}', true)) + + - template: templates/component-governance-component-detection-steps.yml + parameters : + condition : 'succeeded' + + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + +- template: nuget/templates/test_linux.yml + parameters: + AgentPool: AMD-GPU + ArtifactSuffix: 'ROCm' + StageSuffix: 'ROCm' + NugetPackageName: 'Microsoft.ML.OnnxRuntime.ROCm' + SpecificArtifact: ${{ parameters.specificArtifact }} + CustomOpArtifactName: 'onnxruntime-linux-x64-rocm' + BuildId: ${{ parameters.BuildId }} + +- template: nuget/templates/dml-vs-2022.yml + parameters: + AgentPool: 'onnxruntime-Win2022-GPU-dml-A10' + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + ArtifactName: 'drop-nuget-dml' + StageName: 'Windows_CI_GPU_DML_Dev' + BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib 
--enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --build_nodejs --cmake_generator "Visual Studio 17 2022" + BuildArch: 'x64' + msbuildArchitecture: 'amd64' + EnvSetupScript: 'setup_env.bat' + sln_platform: 'x64' + DoDebugBuild: 'false' + DoNugetPack: 'true' + DoCompliance: 'false' + DoEsrp: ${{ parameters.DoEsrp }} + NuPackScript: | + msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} /p:CurrentData=$(BuildDate) /p:CurrentTime=$(BuildTime) + copy $(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\*.nupkg $(Build.ArtifactStagingDirectory) + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\*.nupkg $(Build.ArtifactStagingDirectory) + mkdir $(Build.ArtifactStagingDirectory)\testdata + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata + +- template: nuget/templates/dml-vs-2022.yml + parameters: + AgentPool: 'onnxruntime-Win2022-GPU-dml-A10' + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + ArtifactName: 'drop-win-dml-x86-zip' + StageName: 'Windows_CI_GPU_DML_Dev_x86' + BuildCommand: --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --cmake_generator "Visual Studio 17 2022" + BuildArch: 'x86' + EnvSetupScript: 'setup_env_x86.bat' + sln_platform: 'Win32' + DoDebugBuild: 'false' + DoNugetPack: 'true' + DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + RunTests: 'false' + NuPackScript: | + msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=x86 /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} + cd 
$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\ + ren Microsoft.ML.OnnxRuntime.DirectML.* win-dml-x86.zip + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\win-dml-x86.zip $(Build.ArtifactStagingDirectory) + mkdir $(Build.ArtifactStagingDirectory)\testdata + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata + +- template: nuget/templates/dml-vs-2022.yml + parameters: + AgentPool: 'onnxruntime-Win2022-GPU-dml-A10' + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + ArtifactName: 'drop-win-dml-arm64-zip' + StageName: 'Windows_CI_GPU_DML_Dev_arm64' + BuildCommand: --build_dir $(Build.BinariesDirectory) --arm64 --skip_submodule_sync --build_shared_lib --enable_onnx_tests --enable_wcos --use_telemetry --use_dml --use_winml --build_nodejs --cmake_generator "Visual Studio 17 2022" + BuildArch: 'x64' + EnvSetupScript: 'setup_env.bat' + sln_platform: 'arm64' + DoDebugBuild: 'false' + DoNugetPack: 'true' + DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + RunTests: 'false' + NuPackScript: | + msbuild $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj /p:Configuration=RelWithDebInfo /p:TargetArchitecture=arm64 /t:CreatePackage /p:OrtPackageId=Microsoft.ML.OnnxRuntime.DirectML /p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} + cd $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\ + ren Microsoft.ML.OnnxRuntime.DirectML.* win-dml-arm64.zip + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\win-dml-arm64.zip $(Build.ArtifactStagingDirectory) + mkdir $(Build.ArtifactStagingDirectory)\testdata + copy $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\custom_op_library.* $(Build.ArtifactStagingDirectory)\testdata + +- stage: NuGet_Packaging_DML + dependsOn: + - Windows_CI_GPU_DML_Dev + - Windows_CI_GPU_DML_Dev_x86 + - Windows_CI_GPU_DML_Dev_arm64 + condition: succeeded() + jobs: + - job: NuGet_Packaging_DML + 
workspace: + clean: all + pool: 'onnxruntime-Win2022-GPU-dml-A10' + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - NuGet DirectML' + inputs: + artifactName: 'drop-nuget-dml' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - NuGet DirectML x86' + inputs: + artifactName: 'drop-win-dml-x86-zip' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - NuGet DirectML arm64' + inputs: + artifactName: 'drop-win-dml-arm64-zip' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact-dml' + + - script: | + pushd $(Build.BinariesDirectory)\nuget-artifact-dml + dir + powershell -Command "Invoke-WebRequest http://stahlworks.com/dev/unzip.exe -OutFile unzip.exe" + powershell -Command "Invoke-WebRequest http://stahlworks.com/dev/zip.exe -OutFile zip.exe" + set PATH=%CD%;%PATH% + SETLOCAL EnableDelayedExpansion + FOR /R %%i IN (*.nupkg) do ( + set filename=%%~ni + IF NOT "!filename:~25,7!"=="Managed" ( + rename %%~ni.nupkg %%~ni.zip + unzip %%~ni.zip -d %%~ni + del /Q %%~ni.zip + + unzip win-dml-x86.zip -d win-x86 + mkdir %%~ni\runtimes\win-x86 + mkdir %%~ni\runtimes\win-x86\native + + move win-x86\runtimes\win-x86\native\onnxruntime.dll %%~ni\runtimes\win-x86\native\onnxruntime.dll + move win-x86\runtimes\win-x86\native\onnxruntime.lib %%~ni\runtimes\win-x86\native\onnxruntime.lib + move win-x86\runtimes\win-x86\native\onnxruntime.pdb %%~ni\runtimes\win-x86\native\onnxruntime.pdb + + unzip win-dml-arm64.zip -d win-arm64 + mkdir %%~ni\runtimes\win-arm64 + mkdir %%~ni\runtimes\win-arm64\native + + move win-arm64\runtimes\win-arm64\native\onnxruntime.dll %%~ni\runtimes\win-arm64\native\onnxruntime.dll + move 
win-arm64\runtimes\win-arm64\native\onnxruntime.lib %%~ni\runtimes\win-arm64\native\onnxruntime.lib + move win-arm64\runtimes\win-arm64\native\onnxruntime.pdb %%~ni\runtimes\win-arm64\native\onnxruntime.pdb + + + pushd %%~ni + zip -r ..\%%~ni.zip . + popd + move %%~ni.zip %%~ni.nupkg + ) + ) + popd + copy $(Build.BinariesDirectory)\nuget-artifact-dml\Microsoft.ML.OnnxRuntime.DirectML*nupkg $(Build.ArtifactStagingDirectory) + displayName: 'Bundle DML NuGet and other binaries' + + - template: templates/esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)' + DoEsrp: ${{ parameters.DoEsrp }} + + - template: templates/validate-package.yml + parameters: + PackageType: 'nuget' + PackagePath: '$(Build.ArtifactStagingDirectory)' + PackageName: 'Microsoft.ML.OnnxRuntime.DirectML*nupkg' + PlatformsSupported: 'win-x64,win-x86,win-arm64' + VerifyNugetSigning: ${{ parameters.DoEsrp }} + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline NuGet Artifact' + inputs: + artifactName: 'drop-signed-nuget-dml' + targetPath: '$(Build.ArtifactStagingDirectory)' + - template: templates/component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + +- template: templates/qnn-ep-win.yml + parameters: + qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' + QnnSdk: ${{ parameters.QnnSdk }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + DoEsrp: ${{ parameters.DoEsrp }} + ArtifactName: 'drop-nuget-qnn-x64' + StageName: 'OnnxRuntime_QNN_Nuget_Win_x64' + build_config: 'RelWithDebInfo' +- template: templates/qnn-ep-win.yml + parameters: + qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' + QnnSdk: ${{ parameters.QnnSdk }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + DoEsrp: ${{ parameters.DoEsrp }} + ArtifactName: 'drop-nuget-qnn-arm64' + buildParameter: '--arm64' + buildPlatform: 'ARM64' + buildArch: 'ARM64' + StageName: 
'OnnxRuntime_QNN_Nuget_Win_Arm64' + build_config: 'RelWithDebInfo' + +- stage: NuGet_Packaging_QNN + pool: 'Onnxruntime-QNNEP-Windows-2022-CPU' + dependsOn: + - OnnxRuntime_QNN_Nuget_Win_x64 + - OnnxRuntime_QNN_Nuget_Win_Arm64 + condition: succeeded() + jobs: + - job: NuGet_Packaging_QNN + workspace: + clean: all + steps: + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - QNN NuGet x64' + inputs: + artifactName: 'drop-nuget-qnn-x64' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact-x64' + + - task: DownloadPipelineArtifact@0 + displayName: 'Download Pipeline Artifact - QNN NuGet arm64' + inputs: + artifactName: 'drop-nuget-qnn-arm64' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64' + + - task: PowerShell@2 + displayName: 'Bundle NuGet' + inputs: + targetType: 'inline' + script: | + + $x64_nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/nuget-artifact-x64 -Filter Microsoft.ML.OnnxRuntime.QNN*.nupkg -Recurse) + $nuget_package_name = $x64_nupkgs[0].Name + $x64_nuget_package = $x64_nupkgs[0].FullName + + $nupkg_unzipped_directory = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'nuget_unzip_merged', [System.IO.Path]::GetFileNameWithoutExtension($nuget_package_name)) + + $x64_unzip_cmd = "7z.exe x $x64_nuget_package -y -o$nupkg_unzipped_directory" + Invoke-Expression -Command $x64_unzip_cmd + + $arm64_nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/nuget-artifact-arm64 -Filter Microsoft.ML.OnnxRuntime.QNN*.nupkg -Recurse) + $arm64_nuget_package = $arm64_nupkgs[0].FullName + + $arm64_unzip_cmd = "7z.exe x $arm64_nuget_package -y -o$nupkg_unzipped_directory" + Invoke-Expression -Command $arm64_unzip_cmd + + $merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'nuget-artifact-merged') + if (!(Test-Path $merged_nuget_path)) { + New-Item -Path $merged_nuget_path -ItemType Directory + } + + $merged_zip = [System.IO.Path]::Combine($merged_nuget_path, 'qnn_nuget.zip') + 
$zip_cmd = "7z.exe a -r $merged_zip $nupkg_unzipped_directory/*" + Invoke-Expression -Command $zip_cmd + + $merged_nuget = [System.IO.Path]::Combine($merged_nuget_path, $nuget_package_name) + move $merged_zip $merged_nuget + workingDirectory: $(Build.BinariesDirectory) + + - template: templates/esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' + DoEsrp: ${{ parameters.DoEsrp }} + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline NuGet Artifact' + inputs: + artifactName: 'drop-signed-nuget-qnn' + targetPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' From bd5eb99252637fb5559eda1ce38167ed68705f92 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Thu, 10 Oct 2024 17:08:42 -0700 Subject: [PATCH 05/65] revert c-api-noopenmp-packaging-pipelines.yml --- .../c-api-noopenmp-packaging-pipelines.yml | 427 +----------------- 1 file changed, 2 insertions(+), 425 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index a24e40c1957ea..ba111f804a55e 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -62,7 +62,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.22.0.240425 + default: 2.26.0.240828 resources: repositories: @@ -83,7 +83,7 @@ variables: value: 11.8 - name: win_trt_home - value: $(Agent.TempDirectory)\TensorRT-10.0.1.6.Windows10.x86_64.cuda-11.8 + value: $(Agent.TempDirectory)\TensorRT-10.4.0.26.Windows10.x86_64.cuda-11.8 - name: win_cuda_home value: $(Agent.TempDirectory)\v11.8 @@ -94,28 +94,6 @@ stages: PreReleaseVersionSuffixString: ${{ parameters.PreReleaseVersionSuffixString }} PreReleaseVersionSuffixNumber: ${{ parameters.PreReleaseVersionSuffixNumber }} 
-- stage: Debug - dependsOn: Setup - jobs: - - job: D1 - pool: - name: 'onnxruntime-Ubuntu2204-AMD-CPU' - variables: - MyVar: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] - BuildDate: $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Date.BuildDate']] - BuildTime: $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Time.BuildTime']] - steps: - - checkout: none - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - bash: echo $(MyVar) - - bash: echo $(BuildTime) - - bash: echo $(BuildDate) - - template: templates/component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' - - template: stages/download-java-tools-stage.yml - template: templates/c-api-cpu.yml @@ -134,17 +112,6 @@ stages: SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} -- template: templates/ondevice-training-cpu-packaging-pipeline.yml - parameters: - RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} - DoCompliance: ${{ parameters.DoCompliance }} - DoEsrp: ${{ parameters.DoEsrp }} - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - OrtNugetPackageId: 'Microsoft.ML.OnnxRuntime.Training' - AdditionalBuildFlags: '--enable_training_apis' - AdditionalWinBuildFlags: '--enable_onnx_tests --enable_wcos' - BuildVariant: 'default' - - template: stages/java-cuda-packaging-stage.yml parameters: CudaVersion: 11.8 @@ -155,7 +122,6 @@ stages: parameters: DoCompliance: ${{ parameters.DoCompliance }} CudaVersion: 11.8 - docker_base_image: 'nvidia/cuda:11.8.0-cudnn8-devel-ubi8' RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} win_trt_home: ${{ variables.win_trt_home }} @@ -167,304 +133,6 @@ stages: SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} -# ROCm -- 
stage: Linux_C_API_Packaging_ROCm_x64 - dependsOn: [] - jobs: - - job: Linux_C_API_Packaging_ROCm_x64 - workspace: - clean: all - timeoutInMinutes: 120 - pool: onnxruntime-Ubuntu2204-AMD-CPU - variables: - RocmVersion: '5.6' - steps: - - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime - submodules: recursive - - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux, for get-docker-image-steps.yml - submodules: false - - # get-docker-image-steps.yml will move the $(Build.SourcesDirectory)/manylinux into $(Build.SourcesDirectory)/onnxruntime, - # then rename $(Build.SourcesDirectory)/onnxruntime as $(Build.SourcesDirectory) - - template: templates/get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_rocm - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: >- - --build-arg INSTALL_DEPS_EXTRA_ARGS=-tmur - --build-arg BUILD_UID=$(id -u) - --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 - --build-arg ROCM_VERSION=$(RocmVersion) - --build-arg DEVTOOLSET_ROOTPATH=/opt/rh/gcc-toolset-12/root - --build-arg PREPEND_PATH=/opt/rh/gcc-toolset-12/root/usr/bin: - --build-arg LD_LIBRARY_PATH_ARG=/opt/rh/gcc-toolset-12/root/usr/lib64:/opt/rh/gcc-toolset-12/root/usr/lib:/opt/rh/gcc-toolset-12/root/usr/lib64/dyninst:/opt/rh/gcc-toolset-12/root/usr/lib/dyninst:/usr/local/lib64:/usr/local/lib - Repository: onnxruntimetrainingrocmbuild-rocm$(RocmVersion) - CheckOutManyLinux: true - - - template: templates/set-version-number-variables-step.yml - - - task: Bash@3 - displayName: 'Build' - inputs: - targetType: filePath - filePath: tools/ci_build/github/linux/build_rocm_c_api_package.sh - arguments: >- - -S $(Build.SourcesDirectory) - -B $(Build.BinariesDirectory) - -V $(RocmVersion) - -I onnxruntimetrainingrocmbuild-rocm$(RocmVersion) - -P python3.10 - - - script: | - set 
-e -x - mkdir $(Build.ArtifactStagingDirectory)/testdata - cp $(Build.BinariesDirectory)/Release/libcustom_op_library.so* $(Build.ArtifactStagingDirectory)/testdata - ls -al $(Build.ArtifactStagingDirectory) - displayName: 'Create Artifacts for CustomOp' # libcustom_op_library.so from cpu build is built with fp8, ROCm does not support it. - - - template: templates/c-api-artifacts-package-and-publish-steps-posix.yml - parameters: - buildConfig: 'Release' - artifactName: 'onnxruntime-linux-x64-rocm-$(OnnxRuntimeVersion)' - artifactNameNoVersionString: 'onnxruntime-linux-x64-rocm' - libraryName: 'libonnxruntime.so.$(OnnxRuntimeVersion)' - - - template: templates/component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - template: templates/clean-agent-build-directory-step.yml - - -- stage: NuGet_Packaging_ROCm - dependsOn: - - Setup - - Linux_C_API_Packaging_ROCm_x64 - condition: succeeded() - jobs: - - job: NuGet_Packaging_ROCm - workspace: - clean: all - # we need to use the 2022 pool to create the nuget package with both pre-net6+Xamarin and net6 targets. 
- # VS2019 has no support for net6 and we need to use msbuild (from the VS install) to do the packing - pool: 'Onnxruntime-Win-CPU-2022' - variables: - breakCodesignValidationInjection: ${{ parameters.DoEsrp }} - ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] - BuildDate : $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Date.BuildDate']] - BuildTime : $[stageDependencies.Setup.Set_Variables.outputs['Set_Build_Time.BuildTime']] - - steps: - - checkout: self - submodules: true - fetchDepth: 1 - - - template: templates/flex-downloadPipelineArtifact.yml - parameters: - StepName: 'Download Pipeline Artifact - NuGet' - ArtifactName: 'onnxruntime-linux-x64-rocm' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' - SpecificArtifact: ${{ parameters.specificArtifact }} - BuildId: ${{ parameters.BuildId }} - - - task: PowerShell@2 - displayName: 'Reconstruct Build Directory' - inputs: - targetType: inline - script: | - Get-ChildItem $(Build.BinariesDirectory)\nuget-artifact -Filter *.tgz | % { - # *.tar will be created after *.tgz is extracted - $cmd = "7z.exe x $($_.FullName) -y -o$(Build.BinariesDirectory)\nuget-artifact" - Write-Output $cmd - Invoke-Expression -Command $cmd - } - - Get-ChildItem $(Build.BinariesDirectory)\nuget-artifact -Filter *.tar | % { - $cmd = "7z.exe x $($_.FullName) -y -o$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts" - Write-Output $cmd - Invoke-Expression -Command $cmd - } - - $ort_dirs = Get-ChildItem -Path $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-* -Directory - foreach ($ort_dir in $ort_dirs) - { - $dirname = Split-Path -Path $ort_dir -Leaf - $dirname = $dirname.SubString(0, $dirname.LastIndexOf('-')) - Write-Output "Renaming $ort_dir to $dirname" - Rename-Item -Path $ort_dir -NewName $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\$dirname - } - - Copy-Item 
-Path $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\nuget-artifacts\onnxruntime-linux-x64-rocm\lib\* -Destination $(Build.BinariesDirectory)\RelWithDebInfo - - - script: | - tree /F - workingDirectory: '$(Build.BinariesDirectory)' - displayName: 'Inspect Build Binaries Directory' - - - script: | - mklink /D /J models C:\local\models - workingDirectory: '$(Build.BinariesDirectory)' - displayName: 'Create models link' - - - task: NuGetToolInstaller@0 - displayName: Use Nuget 6.2.1 - inputs: - versionSpec: 6.2.1 - - - task: PowerShell@2 - displayName: Build .NET 6 targets using dotnet - inputs: - targetType: 'inline' - # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ - # which is inconsistent with the msbuild output path for the pre-.net6 targets - # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 - # and makes it harder to do the packing - # - # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. 
- script: | - dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj ` - -p:SelectedTargets=Net6 ` - /p:Net6Targets=net6.0 ` - -p:Configuration=RelWithDebInfo ` - -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" ` - -p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm" ` - -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} ` - -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: MSBuild@1 - displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm"' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: MSBuild@1 - displayName: 'Build C# for pre-.net6 targets' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' - configuration: RelWithDebInfo - platform: 'Any CPU' - msbuildArguments: > - -p:SelectedTargets=PreNet6 - -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" - -p:OrtPackageId="Microsoft.ML.OnnxRuntime.ROCm" - -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} - -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) - -p:IsLinuxBuild=true - -p:IsWindowsBuild=false - -p:IsMacOSBuild=false - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - template: templates/win-esrp-dll.yml - parameters: - FolderPath: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' - DisplayName: 'ESRP - Sign C# dlls' - DoEsrp: ${{ parameters.DoEsrp }} - - - task: MSBuild@1 - displayName: Update projects.assets.json with combined list of all target frameworks - inputs: - solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: 
'-t:restore -p:SelectedTargets=All -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: MSBuild@1 - displayName: 'Build Nuget Packages' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj' - configuration: RelWithDebInfo - platform: 'Any CPU' - msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) -p:CurrentTime=$(BuildTime) -p:CurrentDate=$(BuildDate)' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: CopyFiles@2 - displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - Contents: '*.snupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: CopyFiles@2 - displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - Contents: '*.nupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: CopyFiles@2 - displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' - Contents: '*.nupkg' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - template: templates/esrp_nuget.yml - parameters: - DisplayName: 'ESRP - sign NuGet package' - FolderPath: '$(Build.ArtifactStagingDirectory)' - DoEsrp: ${{ parameters.DoEsrp }} - - - template: templates/validate-package.yml - parameters: - PackageType: 'nuget' - PackagePath: '$(Build.ArtifactStagingDirectory)' - PackageName: 'Microsoft.ML.OnnxRuntime.*nupkg' - PlatformsSupported: 'linux-x64' - VerifyNugetSigning: false - - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline NuGet Artifact' 
- inputs: - artifactName: 'drop-signed-nuget-ROCm' - targetPath: '$(Build.ArtifactStagingDirectory)' - - - task: MSBuild@1 - displayName: 'Clean C#' - inputs: - solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' - platform: 'Any CPU' - configuration: RelWithDebInfo - msbuildArguments: '-t:Clean -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm' - workingDirectory: '$(Build.SourcesDirectory)\csharp' - - - task: RoslynAnalyzers@2 - displayName: 'Run Roslyn Analyzers' - inputs: - userProvideBuildInfo: msBuildInfo - msBuildCommandline: > - "C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\msbuild.exe" - $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln - -p:configuration="RelWithDebInfo" - -p:Platform="Any CPU" - -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" - -p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm - -p:IsLinuxBuild=true - -p:IsWindowsBuild=false - -p:IsMacOSBuild=false - condition: and(succeeded(), eq('${{ parameters.DoCompliance }}', true)) - - - template: templates/component-governance-component-detection-steps.yml - parameters : - condition : 'succeeded' - - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - -- template: nuget/templates/test_linux.yml - parameters: - AgentPool: AMD-GPU - ArtifactSuffix: 'ROCm' - StageSuffix: 'ROCm' - NugetPackageName: 'Microsoft.ML.OnnxRuntime.ROCm' - SpecificArtifact: ${{ parameters.specificArtifact }} - CustomOpArtifactName: 'onnxruntime-linux-x64-rocm' - BuildId: ${{ parameters.BuildId }} - template: nuget/templates/dml-vs-2022.yml parameters: @@ -630,94 +298,3 @@ stages: - template: templates/component-governance-component-detection-steps.yml parameters: condition: 'succeeded' - -- template: templates/qnn-ep-win.yml - parameters: - qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' - QnnSdk: ${{ 
parameters.QnnSdk }} - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - DoEsrp: ${{ parameters.DoEsrp }} - ArtifactName: 'drop-nuget-qnn-x64' - StageName: 'OnnxRuntime_QNN_Nuget_Win_x64' - build_config: 'RelWithDebInfo' -- template: templates/qnn-ep-win.yml - parameters: - qnn_ep_build_pool_name: 'Onnxruntime-QNNEP-Windows-2022-CPU' - QnnSdk: ${{ parameters.QnnSdk }} - IsReleaseBuild: ${{ parameters.IsReleaseBuild }} - DoEsrp: ${{ parameters.DoEsrp }} - ArtifactName: 'drop-nuget-qnn-arm64' - buildParameter: '--arm64' - buildPlatform: 'ARM64' - buildArch: 'ARM64' - StageName: 'OnnxRuntime_QNN_Nuget_Win_Arm64' - build_config: 'RelWithDebInfo' - -- stage: NuGet_Packaging_QNN - pool: 'Onnxruntime-QNNEP-Windows-2022-CPU' - dependsOn: - - OnnxRuntime_QNN_Nuget_Win_x64 - - OnnxRuntime_QNN_Nuget_Win_Arm64 - condition: succeeded() - jobs: - - job: NuGet_Packaging_QNN - workspace: - clean: all - steps: - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - QNN NuGet x64' - inputs: - artifactName: 'drop-nuget-qnn-x64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-x64' - - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - QNN NuGet arm64' - inputs: - artifactName: 'drop-nuget-qnn-arm64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact-arm64' - - - task: PowerShell@2 - displayName: 'Bundle NuGet' - inputs: - targetType: 'inline' - script: | - - $x64_nupkgs = (Get-ChildItem $(Build.BinariesDirectory)/nuget-artifact-x64 -Filter Microsoft.ML.OnnxRuntime.QNN*.nupkg -Recurse) - $nuget_package_name = $x64_nupkgs[0].Name - $x64_nuget_package = $x64_nupkgs[0].FullName - - $nupkg_unzipped_directory = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'nuget_unzip_merged', [System.IO.Path]::GetFileNameWithoutExtension($nuget_package_name)) - - $x64_unzip_cmd = "7z.exe x $x64_nuget_package -y -o$nupkg_unzipped_directory" - Invoke-Expression -Command $x64_unzip_cmd - - $arm64_nupkgs = (Get-ChildItem 
$(Build.BinariesDirectory)/nuget-artifact-arm64 -Filter Microsoft.ML.OnnxRuntime.QNN*.nupkg -Recurse) - $arm64_nuget_package = $arm64_nupkgs[0].FullName - - $arm64_unzip_cmd = "7z.exe x $arm64_nuget_package -y -o$nupkg_unzipped_directory" - Invoke-Expression -Command $arm64_unzip_cmd - - $merged_nuget_path = [System.IO.Path]::Combine($Env:BUILD_ARTIFACTSTAGINGDIRECTORY, 'nuget-artifact-merged') - if (!(Test-Path $merged_nuget_path)) { - New-Item -Path $merged_nuget_path -ItemType Directory - } - - $merged_zip = [System.IO.Path]::Combine($merged_nuget_path, 'qnn_nuget.zip') - $zip_cmd = "7z.exe a -r $merged_zip $nupkg_unzipped_directory/*" - Invoke-Expression -Command $zip_cmd - - $merged_nuget = [System.IO.Path]::Combine($merged_nuget_path, $nuget_package_name) - move $merged_zip $merged_nuget - workingDirectory: $(Build.BinariesDirectory) - - - template: templates/esrp_nuget.yml - parameters: - DisplayName: 'ESRP - sign NuGet package' - FolderPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' - DoEsrp: ${{ parameters.DoEsrp }} - - - task: PublishPipelineArtifact@0 - displayName: 'Publish Pipeline NuGet Artifact' - inputs: - artifactName: 'drop-signed-nuget-qnn' - targetPath: '$(Build.ArtifactStagingDirectory)/nuget-artifact-merged' From c162c88d0d136454f12c8915c19ea300a7308348 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 11 Oct 2024 10:24:54 -0700 Subject: [PATCH 06/65] adding --use_winml and --parallel --- .../azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml index bf5fc661fd70c..77c1e1470ea31 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml @@ -50,7 +50,7 @@ stages: msbuildPlatform: 
x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --use_winml --build_csharp --parallel runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu @@ -68,7 +68,7 @@ stages: msbuildPlatform: x64 CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --parallel runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu From e229292c363cd178bb17278d160ff06219f697f7 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Fri, 11 Oct 2024 14:05:18 -0700 Subject: [PATCH 07/65] Remove --use_winml --- .../azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml index 77c1e1470ea31..77701c8ff934d 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml @@ -50,7 +50,7 @@ stages: msbuildPlatform: x64 
packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --use_winml --build_csharp --parallel + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu From fb68a59f15320d32ed007981b3368aac540191ad Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Mon, 21 Oct 2024 21:21:53 -0400 Subject: [PATCH 08/65] remove --test from build --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 52547fd9a796b..9eb9e9558405a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -214,7 +214,7 @@ stages: condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' workingDirectory: 
'$(Build.BinariesDirectory)' - ${{ else }}: - powershell: | @@ -389,7 +389,7 @@ stages: condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests $(TelemetryOption) ' workingDirectory: '$(Build.BinariesDirectory)' # Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine - ${{ if eq(parameters.buildJava, 'true') }}: From 04717a87ba7d817b00283fd3233af0a655f6f94c Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 22 Oct 2024 14:14:34 -0400 Subject: [PATCH 09/65] Update --- .../github/azure-pipelines/cuda-packaging-pipeline.yml | 2 +- .../github/azure-pipelines/nuget-cuda-publishing-pipeline.yml | 2 +- tools/ci_build/github/azure-pipelines/publish-nuget.yml | 2 +- ...get-combine-gpu-stage.yml => nuget-combine-cuda-stage.yml} | 4 ++-- ...gpu-packaging-stage.yml => nuget-cuda-packaging-stage.yml} | 0 ...u-publishing-stage.yml => nuget-cuda-publishing-stage.yml} | 0 ...packaging-stage.yml => nuget-win-cuda-packaging-stage.yml} | 4 ++-- 7 files changed, 7 insertions(+), 7 deletions(-) rename tools/ci_build/github/azure-pipelines/stages/{nuget-combine-gpu-stage.yml => nuget-combine-cuda-stage.yml} (96%) rename tools/ci_build/github/azure-pipelines/stages/{nuget-gpu-packaging-stage.yml => nuget-cuda-packaging-stage.yml} (100%) rename tools/ci_build/github/azure-pipelines/stages/{nuget-gpu-publishing-stage.yml => nuget-cuda-publishing-stage.yml} (100%) rename 
tools/ci_build/github/azure-pipelines/stages/{nuget-win-gpu-packaging-stage.yml => nuget-win-cuda-packaging-stage.yml} (98%) diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index 6a245eed44828..7118e85e9ea4b 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -105,7 +105,7 @@ stages: PackageNodeJS: false # Nuget Packaging - - template: stages/nuget-combine-gpu-stage.yml + - template: stages/nuget-combine-cuda-stage.yml parameters: DoCompliance: ${{ parameters.DoCompliance }} CudaVersion: ${{ parameters.CudaVersion }} diff --git a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml index 0e17bef4c8f73..aeb250e1e0cbc 100644 --- a/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/nuget-cuda-publishing-pipeline.yml @@ -22,7 +22,7 @@ variables: value: onnxruntime-cuda-12 stages: - - template: stages/nuget-gpu-publishing-stage.yml + - template: stages/nuget-cuda-publishing-stage.yml parameters: artifact_feed: $(ArtifactFeed) diff --git a/tools/ci_build/github/azure-pipelines/publish-nuget.yml b/tools/ci_build/github/azure-pipelines/publish-nuget.yml index 42ead98209505..b78d586288ba3 100644 --- a/tools/ci_build/github/azure-pipelines/publish-nuget.yml +++ b/tools/ci_build/github/azure-pipelines/publish-nuget.yml @@ -33,7 +33,7 @@ stages: - script: move "$(Pipeline.Workspace)\build\drop-signed-nuget-dml\*" $(Build.BinariesDirectory)\nuget-artifact\final-package # Publish CUDA 11 Nuget/Java pkgs to ADO feed - - template: stages/nuget-gpu-publishing-stage.yml + - template: stages/nuget-cuda-publishing-stage.yml parameters: artifact_feed: $(ArtifactFeed) diff --git 
a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-gpu-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml similarity index 96% rename from tools/ci_build/github/azure-pipelines/stages/nuget-combine-gpu-stage.yml rename to tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml index f10d700e7d1d9..9c7fbc24ab1b6 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-gpu-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml @@ -42,7 +42,7 @@ stages: buildJava: ${{ parameters.buildJava }} buildNodejs: ${{ parameters.buildNodejs }} -- template: nuget-win-gpu-packaging-stage.yml +- template: nuget-win-cuda-packaging-stage.yml parameters: RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} @@ -51,7 +51,7 @@ stages: win_cuda_home: ${{ parameters.win_cuda_home }} buildJava: ${{ parameters.buildJava }} -- template: nuget-gpu-packaging-stage.yml +- template: nuget-cuda-packaging-stage.yml parameters: DoCompliance: ${{ parameters.DoCompliance }} DoEsrp: ${{ parameters.DoEsrp }} diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml similarity index 100% rename from tools/ci_build/github/azure-pipelines/stages/nuget-gpu-packaging-stage.yml rename to tools/ci_build/github/azure-pipelines/stages/nuget-cuda-packaging-stage.yml diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-gpu-publishing-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-cuda-publishing-stage.yml similarity index 100% rename from tools/ci_build/github/azure-pipelines/stages/nuget-gpu-publishing-stage.yml rename to tools/ci_build/github/azure-pipelines/stages/nuget-cuda-publishing-stage.yml diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml 
b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml similarity index 98% rename from tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml rename to tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index 77701c8ff934d..445066f08995a 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -50,7 +50,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu @@ -68,7 +68,7 @@ stages: msbuildPlatform: x64 CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --parallel + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu From 2401b80d0b512dcb776757f46c70e7fd3b0d262e Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 22 Oct 2024 14:15:33 -0400 Subject: [PATCH 10/65] Merge with main --- 
.../azure-pipelines/c-api-noopenmp-packaging-pipelines.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 4d76f4c610def..b12360d2710d0 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -118,7 +118,7 @@ stages: SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} -- template: stages/nuget-combine-gpu-stage.yml +- template: stages/nuget-combine-cuda-stage.yml parameters: DoCompliance: ${{ parameters.DoCompliance }} CudaVersion: 11.8 From a9e47fb2fedfc4fb4b484cbc900b63b15948b458 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 22 Oct 2024 14:21:18 -0400 Subject: [PATCH 11/65] parallel --- .../azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index 445066f08995a..1adebe434a64c 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -50,7 +50,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --parallel runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} 
java_artifact_id: onnxruntime_gpu From 4d0ce6dcc9670cb348825d77ed6f8d3b1cbe9d38 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 22 Oct 2024 14:25:14 -0400 Subject: [PATCH 12/65] --use_dml --build_csharp --parallel --- .../azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml | 2 +- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index 1adebe434a64c..4ce7a13ca2cc6 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -50,7 +50,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --parallel + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index cc8048e2e84ce..259fe79de243a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -25,7 +25,7 @@ parameters: - name: runTests type: boolean - default: true + default: false - name: buildJava type: boolean From 24750acfe47e5a9b383404c7c9c3842182a62416 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 22 Oct 2024 14:26:00 -0400 Subject: [PATCH 13/65] --use_dml --build_csharp 
--parallel --- .../azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index 4ce7a13ca2cc6..be9abc5640cf4 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -68,7 +68,7 @@ stages: msbuildPlatform: x64 CudaVersion: ${{ parameters.CudaVersion }} packageName: x64-tensorrt - buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --parallel runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu From 3ddf44be50ac2b25ae205acefc02966e46411422 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 24 Oct 2024 13:33:14 +0800 Subject: [PATCH 14/65] verify image --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 259fe79de243a..cb2d95171979f 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -319,7 +319,10 @@ stages: - job: Windows_Packaging_${{ parameters.stage_name_suffix }}_Testing workspace: clean: all - pool: ${{ parameters.ort_build_pool_name }} + pool: + name: ${{ 
parameters.ort_build_pool_name }} + demainds: + - ImageVersionOverride: 233.0.0 timeoutInMinutes: 180 steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 From c16ded2bccc59db6f41cee10dbe348ae5752eca8 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 24 Oct 2024 21:45:23 +0800 Subject: [PATCH 15/65] update --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index cb2d95171979f..06bf5421e1b16 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -214,7 +214,7 @@ stages: condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --test --skip_submodule_sync --build_shared_lib --cmake_generator "$(VSGenerator)" --enable_onnx_tests $(TelemetryOption) ${{ parameters.buildparameter }}' workingDirectory: '$(Build.BinariesDirectory)' - ${{ else }}: - powershell: | @@ -392,7 +392,7 @@ stages: condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --enable_onnx_tests $(TelemetryOption) ' + 
arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' workingDirectory: '$(Build.BinariesDirectory)' # Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine - ${{ if eq(parameters.buildJava, 'true') }}: From 5a13349f6b7abc7e81b260ddeebcc569eb693e49 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 24 Oct 2024 22:01:06 +0800 Subject: [PATCH 16/65] typo --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 06bf5421e1b16..7a319e6c8d27e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -321,7 +321,7 @@ stages: clean: all pool: name: ${{ parameters.ort_build_pool_name }} - demainds: + demands: - ImageVersionOverride: 233.0.0 timeoutInMinutes: 180 steps: From e21c3875411117e5cc65781d1cce83ef509e3f3d Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 24 Oct 2024 22:05:39 +0800 Subject: [PATCH 17/65] update1 --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 7a319e6c8d27e..0fac73831f527 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -321,8 +321,7 @@ stages: clean: all pool: name: ${{ parameters.ort_build_pool_name }} - demands: - - ImageVersionOverride: 233.0.0 + demands: ImageVersionOverride -equals 233.0.0 timeoutInMinutes: 180 steps: - task: 
mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 From 1e45aafdf9a9e0634834927d2622cc02669d9056 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 24 Oct 2024 22:09:14 +0800 Subject: [PATCH 18/65] update 2 --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 0fac73831f527..136dadd9ae52a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -321,7 +321,7 @@ stages: clean: all pool: name: ${{ parameters.ort_build_pool_name }} - demands: ImageVersionOverride -equals 233.0.0 + demands: ImageVersionOverride -equals 248.0.0 timeoutInMinutes: 180 steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 From 73ecd6012655fdfbe44fec992a8b3d19b0818ee9 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 24 Oct 2024 22:22:36 +0800 Subject: [PATCH 19/65] print log --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 136dadd9ae52a..dd3ee17184f15 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -328,6 +328,10 @@ stages: displayName: 'Clean Agent Directories' condition: always() + - script: + echo ${{ parameters.SpecificArtifact }} + displayName: 'Print Specific Artifact' + - checkout: self clean: true submodules: none From 9fd99e4a447d26f34481308946534f7daa915bc9 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 24 Oct 2024 22:35:04 +0800 Subject: [PATCH 20/65] update 3 --- .../github/azure-pipelines/stages/nuget-combine-cuda-stage.yml | 2 ++ 1 file changed, 2 insertions(+) diff 
--git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml index 9c7fbc24ab1b6..0b3eac0110abc 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml @@ -50,6 +50,8 @@ stages: win_trt_home: ${{ parameters.win_trt_home }} win_cuda_home: ${{ parameters.win_cuda_home }} buildJava: ${{ parameters.buildJava }} + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} - template: nuget-cuda-packaging-stage.yml parameters: From ffa9c2bb60f83ee1e9318b9ce70df822b1135f73 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 24 Oct 2024 23:38:14 +0800 Subject: [PATCH 21/65] test filter --- tools/ci_build/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 9624f9112c49f..2677e759d0ef3 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2072,7 +2072,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): executables.append("onnxruntime_global_thread_pools_test") executables.append("onnxruntime_customopregistration_test") for exe in executables: - test_output = f"--gtest_output=xml:{cwd}/{exe}.{config}.results.xml" + test_output = f"--gtest_filter==*FusedMatMulOpTest* --gtest_output=xml:{cwd}/{exe}.{config}.results.xml" run_subprocess([os.path.join(cwd, exe), test_output], cwd=cwd, dll_path=dll_path) else: ctest_cmd = [ctest_path, "--build-config", config, "--verbose", "--timeout", args.test_all_timeout] From 7490e44586c50944abb730ee7e7c2829096688ce Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 24 Oct 2024 23:54:09 +0800 Subject: [PATCH 22/65] test filter1 --- tools/ci_build/build.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 
2677e759d0ef3..2bc7be6b0115c 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2072,7 +2072,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): executables.append("onnxruntime_global_thread_pools_test") executables.append("onnxruntime_customopregistration_test") for exe in executables: - test_output = f"--gtest_filter==*FusedMatMulOpTest* --gtest_output=xml:{cwd}/{exe}.{config}.results.xml" + test_output = f"--gtest_filter=*FusedMatMulOpTest* --gtest_output=xml:{cwd}/{exe}.{config}.results.xml" run_subprocess([os.path.join(cwd, exe), test_output], cwd=cwd, dll_path=dll_path) else: ctest_cmd = [ctest_path, "--build-config", config, "--verbose", "--timeout", args.test_all_timeout] From 09fc7ec1c4f64cc40f893228688607f7058acaaa Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 25 Oct 2024 00:07:52 +0800 Subject: [PATCH 23/65] complete A10 --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index dd3ee17184f15..aa315002bb0b3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -319,9 +319,9 @@ stages: - job: Windows_Packaging_${{ parameters.stage_name_suffix }}_Testing workspace: clean: all - pool: - name: ${{ parameters.ort_build_pool_name }} - demands: ImageVersionOverride -equals 248.0.0 + pool: zhanyi_test_A100_pool + # name: ${{ parameters.ort_build_pool_name }} + # demands: ImageVersionOverride -equals 248.0.0 timeoutInMinutes: 180 steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 From 652950e586b06602cf6fee51c95e661ed78b4847 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 25 Oct 2024 07:34:12 +0800 Subject: [PATCH 24/65] add --use-winml --- .../stages/nuget-win-cuda-packaging-stage.yml | 4 
++-- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index be9abc5640cf4..99b972fac2455 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -34,7 +34,7 @@ parameters: displayName: Specific Artifact's BuildId type: string default: '0' - + - name: buildJava type: boolean @@ -50,7 +50,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --use_winml --build_csharp --parallel runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index aa315002bb0b3..dd3ee17184f15 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -319,9 +319,9 @@ stages: - job: Windows_Packaging_${{ parameters.stage_name_suffix }}_Testing workspace: clean: all - pool: zhanyi_test_A100_pool - # name: ${{ parameters.ort_build_pool_name }} - # demands: ImageVersionOverride -equals 248.0.0 + pool: + name: ${{ parameters.ort_build_pool_name }} + demands: ImageVersionOverride -equals 248.0.0 timeoutInMinutes: 180 steps: - task: 
mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 From 982a67411a17489c7e2427c0aa96fafb55731b0a Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 25 Oct 2024 15:17:19 +0800 Subject: [PATCH 25/65] split cuda and dml test --- onnxruntime/test/util/default_providers.cc | 18 +++++++-- .../stages/nuget-win-cuda-packaging-stage.yml | 1 + .../azure-pipelines/templates/win-ci.yml | 39 +++++++++++++++---- 3 files changed, 48 insertions(+), 10 deletions(-) diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index d57a22f024d5f..3039c6cae0398 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -147,6 +147,12 @@ std::unique_ptr DefaultCudaNHWCExecutionProvider() { std::unique_ptr CudaExecutionProviderWithOptions(const OrtCUDAProviderOptionsV2* provider_options) { #ifdef USE_CUDA +#ifdef USE_CUDA + const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); + if (no_dml_ep_test == "1") { + return nullptr; + } +#endif if (auto factory = CudaProviderFactoryCreator::Create(provider_options)) return factory->CreateProvider(); #else @@ -324,10 +330,16 @@ std::unique_ptr DefaultCannExecutionProvider() { std::unique_ptr DefaultDmlExecutionProvider() { #ifdef USE_DML - ConfigOptions config_options{}; - if (auto factory = DMLProviderFactoryCreator::CreateFromDeviceOptions(config_options, nullptr, false, false)) { - return factory->CreateProvider(); +#ifdef USE_CUDA + const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); + if (no_cuda_ep_test == "1") { + return nullptr; } +#endif + ConfigOptions config_options{}; + if (auto factory = DMLProviderFactoryCreator::CreateFromDeviceOptions(config_options, nullptr, false, false)) { + return factory->CreateProvider(); + } #endif return nullptr; } diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml 
b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index 99b972fac2455..bf64361102df2 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -57,6 +57,7 @@ stages: UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} SpecificArtifact: ${{ parameters.SpecificArtifact }} BuildId: ${{ parameters.BuildId }} + ComboTests: true # Windows CUDA with TensorRT Packaging - template: ../templates/win-ci.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index dd3ee17184f15..fdbb665697eb8 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -71,6 +71,10 @@ parameters: - 11.8 - 12.2 +- name: ComboTests + type: boolean + default: false + - name: SpecificArtifact displayName: Use Specific Artifact type: boolean @@ -390,13 +394,34 @@ stages: displayName: 'Append dotnet x86 Directory to PATH' condition: and(succeeded(), eq('${{ parameters.buildArch}}', 'x86')) - - task: PythonScript@0 - displayName: 'test' - condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' - workingDirectory: '$(Build.BinariesDirectory)' + - ${{ if eq(parameters.ComboTests, 'true') }}: + - task: PythonScript@0 + displayName: 'test' + condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '--config RelWithDebInfo 
--use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' + workingDirectory: '$(Build.BinariesDirectory)' + environment: + NO_CUDA_TEST: 1 + - task: PythonScript@0 + displayName: 'test' + condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' + workingDirectory: '$(Build.BinariesDirectory)' + environment: + NO_DML_TEST: 1 + - ${{ else }}: + - task: PythonScript@0 + displayName: 'test' + condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' + workingDirectory: '$(Build.BinariesDirectory)' + # Previous stage only assembles the java binaries, testing will be done in this stage with GPU machine - ${{ if eq(parameters.buildJava, 'true') }}: - template: make_java_win_binaries.yml From e6c18de492f5138ed1f055330db7f1b5552ecdd5 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 25 Oct 2024 15:18:26 +0800 Subject: [PATCH 26/65] update --- .../azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index bf64361102df2..c2d44f4d249aa 100644 --- 
a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -50,7 +50,7 @@ stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --use_winml --build_csharp --parallel + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu From 6a5118ee0285d7df16b5419d856691838fdd0189 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 25 Oct 2024 15:21:17 +0800 Subject: [PATCH 27/65] update 1 --- .../ci_build/github/azure-pipelines/templates/win-ci.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index fdbb665697eb8..11e3ce04c5c85 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -396,22 +396,22 @@ stages: - ${{ if eq(parameters.ComboTests, 'true') }}: - task: PythonScript@0 - displayName: 'test' + displayName: 'test excludes CUDA' condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' workingDirectory: '$(Build.BinariesDirectory)' - environment: + env: NO_CUDA_TEST: 1 - task: 
PythonScript@0 - displayName: 'test' + displayName: 'test excludes DML' condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) inputs: scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' workingDirectory: '$(Build.BinariesDirectory)' - environment: + env: NO_DML_TEST: 1 - ${{ else }}: - task: PythonScript@0 From 65223279ffd05e8d102ec32cd901f0881b0526b4 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 25 Oct 2024 16:27:21 +0800 Subject: [PATCH 28/65] update 3 --- onnxruntime/test/util/default_providers.cc | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 3039c6cae0398..d79151d79fce3 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -122,6 +122,12 @@ std::unique_ptr DefaultOpenVINOExecutionProvider() { std::unique_ptr DefaultCudaExecutionProvider() { #ifdef USE_CUDA +#ifdef USE_DML + const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); + if (no_dml_ep_test == "1") { + return nullptr; + } +#endif OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; provider_options.use_tf32 = false; @@ -147,12 +153,6 @@ std::unique_ptr DefaultCudaNHWCExecutionProvider() { std::unique_ptr CudaExecutionProviderWithOptions(const OrtCUDAProviderOptionsV2* provider_options) { #ifdef USE_CUDA -#ifdef USE_CUDA - const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); - if (no_dml_ep_test == "1") { - return nullptr; - } -#endif if (auto factory = CudaProviderFactoryCreator::Create(provider_options)) return factory->CreateProvider(); #else @@ -336,10 
+336,10 @@ std::unique_ptr DefaultDmlExecutionProvider() { return nullptr; } #endif - ConfigOptions config_options{}; - if (auto factory = DMLProviderFactoryCreator::CreateFromDeviceOptions(config_options, nullptr, false, false)) { - return factory->CreateProvider(); - } + ConfigOptions config_options{}; + if (auto factory = DMLProviderFactoryCreator::CreateFromDeviceOptions(config_options, nullptr, false, false)) { + return factory->CreateProvider(); + } #endif return nullptr; } From 974ee3a2f65764526769fc302cfb3f906cdf8841 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 25 Oct 2024 22:07:34 +0800 Subject: [PATCH 29/65] update tests --- .../test/contrib_ops/matmul_4bits_test.cc | 23 ++++++++++++++----- .../matmul_integer_to_float_test.cc | 2 +- onnxruntime/test/lora/lora_test.cc | 16 +++++++++++++ onnxruntime/test/providers/cpu/model_tests.cc | 12 ++++++++++ tools/ci_build/build.py | 2 +- 5 files changed, 47 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 8138829b057f2..9fa1e155f0d7a 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -485,13 +485,17 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura std::vector> execution_providers; if (use_float16) { #ifdef USE_CUDA - execution_providers.push_back(DefaultCudaExecutionProvider()); + if (DefaultCudaExecutionProvider() != nullptr) { + execution_providers.push_back(DefaultCudaExecutionProvider()); + } #endif #ifdef USE_ROCM execution_providers.push_back(DefaultRocmExecutionProvider()); #endif #ifdef USE_DML - execution_providers.push_back(DefaultDmlExecutionProvider()); + if (DefaultDmlExecutionProvider() != nullptr) { + execution_providers.push_back(DefaultDmlExecutionProvider()); + } #endif RunTest(opts, std::move(execution_providers)); @@ -506,8 +510,11 @@ void RunTest(int64_t M, int64_t N, int64_t K, 
int64_t block_size, int64_t accura } // namespace TEST(MatMulNBits, Float16Cuda) { -#if defined(USE_CUDA) || defined(USE_ROCM) +#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) auto has_gidx_options = {true, false}; + if (DefaultDmlExecutionProvider() != nullptr) { + has_gidx_options = {false}; + } #else auto has_gidx_options = {false}; #endif @@ -518,7 +525,9 @@ TEST(MatMulNBits, Float16Cuda) { for (auto block_size : {16, 32, 64, 128}) { for (auto has_gidx : has_gidx_options) { #ifdef USE_DML - RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); + if (DefaultDmlExecutionProvider() != nullptr) { + RunTest(M, N, K, block_size, 0, false, true, has_gidx, true, 0.04f); + } #else RunTest(M, N, K, block_size, 0, false, true, has_gidx); RunTest(M, N, K, block_size, 0, true, true, has_gidx, false); @@ -531,12 +540,15 @@ TEST(MatMulNBits, Float16Cuda) { } TEST(MatMulNBits, Float16Large) { -#ifdef USE_DML +#if defined(USE_CUDA) || defined(USE_DML) // For some reason, the A10 machine that runs these tests during CI has a much bigger error than all retail // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances. 
float abs_error = 0.3f; + if (DefaultDmlExecutionProvider() != nullptr) { + abs_error = 0.05f; + } #else float abs_error = 0.05f; #endif @@ -549,7 +561,6 @@ TEST(MatMulNBits, Float16Large) { } } } - #endif // defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 8d7629b5fda1c..d88c3131a4ca5 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -227,7 +227,7 @@ TEST(MatMulIntegerToFloat, HasZeroPoint_HasBias_test_U8S8) { } // DML EP supports Float16 output type and Signed A Matrix and Unsigned B Matric for Float32 output -#if defined(USE_DML) +#if defined(USE_DML) && !defined(USE_CUDA) TEST(MatMulIntegerToFloat, HasZeroPoint_NoBias_test_S8U8) { RunMatMulIntegerToFloatTest(); diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc index fde603858f9a9..4155cb7abc279 100644 --- a/onnxruntime/test/lora/lora_test.cc +++ b/onnxruntime/test/lora/lora_test.cc @@ -201,6 +201,14 @@ TEST(LoraAdapterTest, Load) { #ifdef USE_CUDA TEST(LoraAdapterTest, VerifyCudaDeviceCopy) { + if (DefaultCudaExecutionProvider() == nullptr) { + GTEST_SKIP() << "Skip This Test Due to this EP is null"; + } +#ifdef USE_DML + if (DefaultDmlExecutionProvider() == nullptr) { + GTEST_SKIP() << "It should not run with DML EP"; + } +#endif auto cpu_ep = DefaultCpuExecutionProvider(); auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0]; auto cuda_allocator = DefaultCudaExecutionProvider()->CreatePreferredAllocators()[0]; @@ -234,6 +242,14 @@ TEST(LoraAdapterTest, VerifyCudaDeviceCopy) { #ifdef USE_DML TEST(LoraAdapterTest, VerifyDmlDeviceCopy) { + if (DefaultDmlExecutionProvider() == nullptr) { + GTEST_SKIP() << "Skip This Test Due to this EP is null"; + } +#ifdef USE_CUDA + if 
(DefaultCudaExecutionProvider() == nullptr) { + GTEST_SKIP() << "It should not run with CUDA EP"; + } +#endif auto cpu_ep = DefaultCpuExecutionProvider(); auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0]; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index e3c86a137484f..b46c253fb8ed9 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -491,6 +491,18 @@ ::std::vector<::std::basic_string> GetParameterStrings() { // the number of times these are run to reduce the CI time. provider_names.erase(provider_name_cpu); #endif + +#if defined(USE_CUDA) && defined(USE_DML) + const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); + if (no_cuda_ep_test == "1") { + provider_names.erase(provider_name_cuda); + } + const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); + if (no_dml_ep_test == "1") { + provider_names.erase(provider_name_dml); + } +#endif + std::vector> v; // Permanently exclude following tests because ORT support only opset starting from 7, // Please make no more changes to the list diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 2bc7be6b0115c..9624f9112c49f 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2072,7 +2072,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): executables.append("onnxruntime_global_thread_pools_test") executables.append("onnxruntime_customopregistration_test") for exe in executables: - test_output = f"--gtest_filter=*FusedMatMulOpTest* --gtest_output=xml:{cwd}/{exe}.{config}.results.xml" + test_output = f"--gtest_output=xml:{cwd}/{exe}.{config}.results.xml" run_subprocess([os.path.join(cwd, exe), test_output], cwd=cwd, dll_path=dll_path) else: ctest_cmd = [ctest_path, "--build-config", config, "--verbose", "--timeout", args.test_all_timeout] From 
9494656ed2eac36225e096f87d159ae842bc73c5 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 10:37:22 +0800 Subject: [PATCH 30/65] onnxruntime4j_test --- java/src/test/java/ai/onnxruntime/InferenceTest.java | 9 ++++++++- onnxruntime/test/contrib_ops/matmul_4bits_test.cc | 4 ++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index e11537492d3a7..e16a0526efd8e 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -740,7 +740,14 @@ public void testCoreML() throws OrtException { @Test @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { - runProvider(OrtProvider.DIRECT_ML); + if (System.getProperty("USE_CUDA") != "1") { + runProvider(OrtProvider.DIRECT_ML); + } else if(System.getProperty("USE_CUDA") == "1" && System.getenv("NO_CUDA_TEST") == "1" ) { + runProvider(OrtProvider.DIRECT_ML); + } else { + System.out.println("Skipping DirectML test because CUDA EP test is enabled."); + return; + } } @Test diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 9fa1e155f0d7a..f3ad5618f267f 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -511,9 +511,9 @@ void RunTest(int64_t M, int64_t N, int64_t K, int64_t block_size, int64_t accura TEST(MatMulNBits, Float16Cuda) { #if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - auto has_gidx_options = {true, false}; + std::vector has_gidx_options = {true, false}; if (DefaultDmlExecutionProvider() != nullptr) { - has_gidx_options = {false}; + has_gidx_options.assign(1, false); } #else auto has_gidx_options = {false}; From 1b213cba17d40b21d73f29a1e1a21d1faeb69140 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 
13:58:56 +0800 Subject: [PATCH 31/65] typo --- java/src/test/java/ai/onnxruntime/InferenceTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index e16a0526efd8e..b2611a322fa16 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -740,7 +740,7 @@ public void testCoreML() throws OrtException { @Test @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { - if (System.getProperty("USE_CUDA") != "1") { + if (System.getenv("USE_CUDA") != "1") { runProvider(OrtProvider.DIRECT_ML); } else if(System.getProperty("USE_CUDA") == "1" && System.getenv("NO_CUDA_TEST") == "1" ) { runProvider(OrtProvider.DIRECT_ML); From 4019016885a87b1c3dcbbd84aebcc1eedf950ee4 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 14:12:33 +0800 Subject: [PATCH 32/65] update --- .../src/test/java/ai/onnxruntime/InferenceTest.java | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index b2611a322fa16..724ccbcf4756e 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -740,13 +740,14 @@ public void testCoreML() throws OrtException { @Test @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { - if (System.getenv("USE_CUDA") != "1") { - runProvider(OrtProvider.DIRECT_ML); - } else if(System.getProperty("USE_CUDA") == "1" && System.getenv("NO_CUDA_TEST") == "1" ) { - runProvider(OrtProvider.DIRECT_ML); + String no_cuda_test = System.getenv("NO_CUDA_TEST"); + if (no_cuda_test == null || no_cuda_test.isEmpty() || no_cuda_test != "1") { + if(System.getProperty("USE_CUDA") == "1") 
{ + System.out.println("Skipping DirectML test because CUDA EP test is enabled."); + return; + } } else { - System.out.println("Skipping DirectML test because CUDA EP test is enabled."); - return; + runProvider(OrtProvider.CORE_ML); } } From abe4326c2c796393efa8bcbd1b0e7d3c9939a539 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 14:33:16 +0800 Subject: [PATCH 33/65] update --- java/src/test/java/ai/onnxruntime/InferenceTest.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index 724ccbcf4756e..54ef492ac016d 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -741,8 +741,8 @@ public void testCoreML() throws OrtException { @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { String no_cuda_test = System.getenv("NO_CUDA_TEST"); - if (no_cuda_test == null || no_cuda_test.isEmpty() || no_cuda_test != "1") { - if(System.getProperty("USE_CUDA") == "1") { + if (no_cuda_test == null || no_cuda_test.isEmpty() || ! 
no_cuda_test.equals("1")) { + if (System.getProperty("USE_CUDA").equals("1")) { System.out.println("Skipping DirectML test because CUDA EP test is enabled."); return; } From f24e621a6915bd0f83f1cdb26b1b1b30112bab45 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 15:23:29 +0800 Subject: [PATCH 34/65] update --- onnxruntime/test/lora/lora_test.cc | 4 ++-- onnxruntime/test/util/default_providers.cc | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc index 4155cb7abc279..bfb6dfcc606fe 100644 --- a/onnxruntime/test/lora/lora_test.cc +++ b/onnxruntime/test/lora/lora_test.cc @@ -205,7 +205,7 @@ TEST(LoraAdapterTest, VerifyCudaDeviceCopy) { GTEST_SKIP() << "Skip This Test Due to this EP is null"; } #ifdef USE_DML - if (DefaultDmlExecutionProvider() == nullptr) { + if (DefaultDmlExecutionProvider() != nullptr) { GTEST_SKIP() << "It should not run with DML EP"; } #endif @@ -246,7 +246,7 @@ TEST(LoraAdapterTest, VerifyDmlDeviceCopy) { GTEST_SKIP() << "Skip This Test Due to this EP is null"; } #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { + if (DefaultCudaExecutionProvider() != nullptr) { GTEST_SKIP() << "It should not run with CUDA EP"; } #endif diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index d79151d79fce3..ec391c397abac 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -124,7 +124,7 @@ std::unique_ptr DefaultCudaExecutionProvider() { #ifdef USE_CUDA #ifdef USE_DML const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); - if (no_dml_ep_test == "1") { + if (no_dml_ep_test != "1") { return nullptr; } #endif @@ -332,7 +332,7 @@ std::unique_ptr DefaultDmlExecutionProvider() { #ifdef USE_DML #ifdef USE_CUDA const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test 
== "1") { + if (no_cuda_ep_test != "1") { return nullptr; } #endif From 1598875e896552fc0aedbbb1c5ad517815055fb4 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 15:26:53 +0800 Subject: [PATCH 35/65] update java code --- java/src/test/java/ai/onnxruntime/InferenceTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index 54ef492ac016d..4b391987cc826 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -741,7 +741,7 @@ public void testCoreML() throws OrtException { @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { String no_cuda_test = System.getenv("NO_CUDA_TEST"); - if (no_cuda_test == null || no_cuda_test.isEmpty() || ! no_cuda_test.equals("1")) { + if (no_cuda_test == null || no_cuda_test.isEmpty() || !no_cuda_test.equals("1")) { if (System.getProperty("USE_CUDA").equals("1")) { System.out.println("Skipping DirectML test because CUDA EP test is enabled."); return; From 0a28ba564ecdb37869f59424eccd0804f328423e Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 18:05:59 +0800 Subject: [PATCH 36/65] update pool image --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 11e3ce04c5c85..8e38604f90fce 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -323,9 +323,7 @@ stages: - job: Windows_Packaging_${{ parameters.stage_name_suffix }}_Testing workspace: clean: all - pool: - name: ${{ parameters.ort_build_pool_name }} - demands: ImageVersionOverride -equals 248.0.0 + pool: ${{ 
parameters.ort_build_pool_name }} timeoutInMinutes: 180 steps: - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 From e81aea8971bab1cb08e8c0f941ff64342e996735 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 18:45:52 +0800 Subject: [PATCH 37/65] fail condition --- onnxruntime/test/lora/lora_test.cc | 7 +++++-- onnxruntime/test/util/default_providers.cc | 2 +- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc index bfb6dfcc606fe..8338c7d547a09 100644 --- a/onnxruntime/test/lora/lora_test.cc +++ b/onnxruntime/test/lora/lora_test.cc @@ -206,7 +206,7 @@ TEST(LoraAdapterTest, VerifyCudaDeviceCopy) { } #ifdef USE_DML if (DefaultDmlExecutionProvider() != nullptr) { - GTEST_SKIP() << "It should not run with DML EP"; + GTEST_FAIL() << "It should not run with DML EP"; } #endif auto cpu_ep = DefaultCpuExecutionProvider(); @@ -242,14 +242,17 @@ TEST(LoraAdapterTest, VerifyCudaDeviceCopy) { #ifdef USE_DML TEST(LoraAdapterTest, VerifyDmlDeviceCopy) { + // NO_DML_TEST is set, DML test is skipped if (DefaultDmlExecutionProvider() == nullptr) { GTEST_SKIP() << "Skip This Test Due to this EP is null"; } + #ifdef USE_CUDA if (DefaultCudaExecutionProvider() != nullptr) { - GTEST_SKIP() << "It should not run with CUDA EP"; + GTEST_FAIL() << "It should not run with CUDA EP"; } #endif + auto cpu_ep = DefaultCpuExecutionProvider(); auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0]; diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index ec391c397abac..1acaf40eab79b 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -125,7 +125,7 @@ std::unique_ptr DefaultCudaExecutionProvider() { #ifdef USE_DML const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); if (no_dml_ep_test != "1") { - return nullptr; + return nullptr; } #endif 
OrtCUDAProviderOptionsV2 provider_options{}; From 5e976d381849b0e41489834fd9d49a9e9d02761e Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 21:24:18 +0800 Subject: [PATCH 38/65] Float16Larget test --- onnxruntime/test/contrib_ops/matmul_4bits_test.cc | 6 ++++-- onnxruntime/test/util/default_providers.cc | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index f3ad5618f267f..638f85c06c30b 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -545,9 +545,11 @@ TEST(MatMulNBits, Float16Large) { // machines we tested on. All consumer-grade machines from Nvidia/AMD/Intel seem to pass these tests with an // absolute error of 0.08, but the A10 has errors going as high as 0.22. Ultimately, given the large number // of elements in this test, ULPs should probably be used instead of absolute/relative tolerances. 
- float abs_error = 0.3f; + + float abs_error = 0.05f; if (DefaultDmlExecutionProvider() != nullptr) { - abs_error = 0.05f; + // it means the ep is dml in runtime, the abs_error is changed to 0.3f + abs_error = 0.3f; } #else float abs_error = 0.05f; diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 1acaf40eab79b..ec391c397abac 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -125,7 +125,7 @@ std::unique_ptr DefaultCudaExecutionProvider() { #ifdef USE_DML const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); if (no_dml_ep_test != "1") { - return nullptr; + return nullptr; } #endif OrtCUDAProviderOptionsV2 provider_options{}; From d5cf61f412ae89e5a9736c3789ea9de5a69ff79e Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 22:27:09 +0800 Subject: [PATCH 39/65] remove nullptr in eps --- onnxruntime/test/common/cuda_op_test_utils.h | 4 ++++ onnxruntime/test/framework/inference_session_test.cc | 3 +++ onnxruntime/test/providers/base_tester.cc | 7 +++++++ 3 files changed, 14 insertions(+) diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h index 6f3e460628566..6833c3785466d 100644 --- a/onnxruntime/test/common/cuda_op_test_utils.h +++ b/onnxruntime/test/common/cuda_op_test_utils.h @@ -13,6 +13,10 @@ namespace test { int GetCudaArchitecture(); inline bool HasCudaEnvironment(int min_cuda_architecture) { + if (DefaultCudaExecutionProvider() == nullptr) { + return false; + } + if (DefaultCudaExecutionProvider().get() == nullptr) { return false; } diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 61a8f7e23fe87..0e1bf3f8c6965 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -996,6 +996,9 @@ static void 
TestBindHelper(const std::string& log_str, if (bind_provider_type == kCudaExecutionProvider || bind_provider_type == kRocmExecutionProvider) { #ifdef USE_CUDA auto provider = DefaultCudaExecutionProvider(); + if (provider == nullptr) { + return; + } gpu_provider = provider.get(); ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(std::move(provider))); #endif diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index dea39bc99d3e9..ba6546b791baf 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -529,6 +529,13 @@ void BaseTester::Run(ExpectResult expect_result, const std::string& expected_fai so.use_deterministic_compute = use_determinism_; so.graph_optimization_level = TransformerLevel::Default; // 'Default' == off + // remove nullptr in execution_providers. + // it's a little ugly but we need to do this because DefaultXXXExecutionProvider() can return nullptr in Runtime. + // And there're many places adding DefaultXXXExecutionProvider() to execution_providers directly. 
+ if (execution_providers != nullptr || execution_providers->empty()) { + execution_providers->erase(std::remove(execution_providers->begin(), execution_providers->end(), nullptr), execution_providers->end()); + } + Run(so, expect_result, expected_failure_string, excluded_provider_types, run_options, execution_providers, options); } From e3b25cfc2490c12b382ba4d2339f2c69bdab38a8 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 28 Oct 2024 23:53:51 +0800 Subject: [PATCH 40/65] skip cuda tests 1 --- .../test/contrib_ops/bias_dropout_op_test.cc | 3 +++ .../contrib_ops/bitmask_dropout_op_test.cc | 3 +++ .../test/contrib_ops/layer_norm_test.cc | 13 ++++++++++--- .../test/framework/allocation_planner_test.cc | 18 ++++++++++++++++++ .../test/framework/cuda/fence_cuda_test.cc | 9 +++++++++ .../test/framework/inference_session_test.cc | 15 +++++++++++++++ .../test/framework/sparse_kernels_test.cc | 6 ++++++ onnxruntime/test/providers/base_tester.cc | 6 +++++- .../providers/cpu/tensor/grid_sample_test.cc | 8 +++++--- 9 files changed, 74 insertions(+), 7 deletions(-) diff --git a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc index 027d4b3fff1b0..297629b015796 100644 --- a/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bias_dropout_op_test.cc @@ -181,6 +181,9 @@ void RunBiasDropoutTest(const bool use_mask, const std::vector& input_s t.SetCustomOutputVerifier(output_verifier); std::vector> t_eps; #ifdef USE_CUDA + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } t_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM t_eps.emplace_back(DefaultRocmExecutionProvider()); diff --git a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc index 7ca4e1004066c..ce474cc75431b 100644 --- a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc +++ 
b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc @@ -61,6 +61,9 @@ void RunTestForInference(const std::vector& input_dims, bool has_ratio std::vector> test_eps; #ifdef USE_CUDA + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } test_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM test_eps.emplace_back(DefaultRocmExecutionProvider()); diff --git a/onnxruntime/test/contrib_ops/layer_norm_test.cc b/onnxruntime/test/contrib_ops/layer_norm_test.cc index 438a1100ca95c..4055b1449e70a 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_test.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "test/providers/compare_provider_test_utils.h" +#include "test/util/include/default_providers.h" namespace onnxruntime { namespace test { @@ -79,11 +80,17 @@ static void TestLayerNorm(const std::vector& x_dims, #endif #ifdef USE_CUDA - test.CompareWithCPU(kCudaExecutionProvider); + if (DefaultCudaExecutionProvider() != nullptr) { + test.CompareWithCPU(kCudaExecutionProvider); + } #elif USE_ROCM test.CompareWithCPU(kRocmExecutionProvider); -#elif USE_DML - test.CompareWithCPU(kDmlExecutionProvider); +#endif + +#ifdef USE_DML + if (DefaultDmlExecutionProvider() != nullptr) { + test.CompareWithCPU(kDmlExecutionProvider); + } #endif } diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 0105e90b5a24a..407f61b8eb1ef 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -894,6 +894,9 @@ TEST_F(PlannerTest, LocationPlanningForPassThroughExplicitAndImplicitSubgraphInp SessionOptions so; InferenceSession sess{so, GetEnvironment()}; + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1036,6 +1039,9 @@ TEST_F(PlannerTest, 
LocationPlanningForInitializersOnlyUsedInANestedSubgraph) { SessionOptions so; InferenceSession sess{so, GetEnvironment()}; + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1143,6 +1149,9 @@ TEST_F(PlannerTest, LocationPlanningForInitializersUsedOnDifferentDevicesInMainG SessionOptions so; InferenceSession sess{so, GetEnvironment()}; + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -1235,6 +1244,9 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM SessionOptions so; InferenceSession sess{so, GetEnvironment()}; + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); ASSERT_TRUE(status.IsOK()); @@ -2007,6 +2019,9 @@ TEST_F(PlannerTest, TestCpuIf) { sess_opt.graph_optimization_level = TransformerLevel::Default; InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/cpu_if.onnx")); + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } ASSERT_STATUS_OK(sess.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(sess.Load()); ASSERT_STATUS_OK(sess.Initialize()); @@ -2071,6 +2086,9 @@ TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) { sess_opt.graph_optimization_level = TransformerLevel::Default; InferenceSession sess(sess_opt, GetEnvironment(), ORT_TSTR("./testdata/multi_stream_models/issue_19480.onnx")); + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } auto status = sess.RegisterExecutionProvider(DefaultCudaExecutionProvider()); status = sess.Load(); status = sess.Initialize(); diff --git a/onnxruntime/test/framework/cuda/fence_cuda_test.cc b/onnxruntime/test/framework/cuda/fence_cuda_test.cc index 
e28327941dda4..3e5ef30e7ebef 100644 --- a/onnxruntime/test/framework/cuda/fence_cuda_test.cc +++ b/onnxruntime/test/framework/cuda/fence_cuda_test.cc @@ -115,6 +115,9 @@ TEST(CUDAFenceTests, DISABLED_PartOnCPU) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_TRUE(session.Initialize().IsOK()); ASSERT_TRUE(1 == CountCopyNodes(graph)); @@ -164,6 +167,9 @@ TEST(CUDAFenceTests, TileWithInitializer) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(session.Initialize()); @@ -224,6 +230,9 @@ TEST(CUDAFenceTests, TileWithComputedInput) { SessionOptions so; FenceCudaTestInferenceSession session(so, GetEnvironment()); ASSERT_STATUS_OK(LoadInferenceSessionFromModel(session, *model)); + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } ASSERT_STATUS_OK(session.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_TRUE(session.Initialize().IsOK()); diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 0e1bf3f8c6965..ac5a5933cc107 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -635,6 +635,9 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions) { InferenceSession session_object(so, GetEnvironment()); #ifdef USE_CUDA + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #endif 
#ifdef USE_ROCM @@ -689,6 +692,9 @@ TEST(InferenceSessionTests, CheckRunProfilerWithSessionOptions2) { InferenceSession session_object(so, GetEnvironment()); #ifdef USE_CUDA + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #endif #ifdef USE_ROCM @@ -1594,6 +1600,9 @@ TEST(InferenceSessionTests, Test3LayerNestedSubgraph) { #if USE_TENSORRT ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider())); #elif USE_CUDA + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #elif USE_ROCM ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider())); @@ -1746,6 +1755,9 @@ TEST(InferenceSessionTests, Test2LayerNestedSubgraph) { #if USE_TENSORRT ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultTensorrtExecutionProvider())); #elif USE_CUDA + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); #elif USE_ROCM ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultRocmExecutionProvider())); @@ -2108,6 +2120,9 @@ TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { so.session_logid = "InferenceSessionTests.TestParallelExecutionWithCudaProvider"; InferenceSession session_object{so, GetEnvironment()}; + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(session_object.Load(model_uri)); diff --git a/onnxruntime/test/framework/sparse_kernels_test.cc b/onnxruntime/test/framework/sparse_kernels_test.cc index 7bd6b47f52b7d..db9592c293fd0 100644 --- a/onnxruntime/test/framework/sparse_kernels_test.cc +++ 
b/onnxruntime/test/framework/sparse_kernels_test.cc @@ -1457,6 +1457,9 @@ TEST(SparseTensorConversionTests, CsrConversion) { #ifdef USE_CUDA auto cuda_provider = DefaultCudaExecutionProvider(); + if (cuda_provider == nullptr) { + return; + } auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0]; { auto cuda_transfer = cuda_provider->GetDataTransfer(); @@ -1684,6 +1687,9 @@ TEST(SparseTensorConversionTests, CooConversion) { #ifdef USE_CUDA auto cuda_provider = DefaultCudaExecutionProvider(); + if (cuda_provider == nullptr) { + return; + } auto cuda_allocator = cuda_provider->CreatePreferredAllocators()[0]; { auto cuda_transfer = cuda_provider->GetDataTransfer(); diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index ba6546b791baf..9d83c789c5124 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -532,8 +532,12 @@ void BaseTester::Run(ExpectResult expect_result, const std::string& expected_fai // remove nullptr in execution_providers. // it's a little ugly but we need to do this because DefaultXXXExecutionProvider() can return nullptr in Runtime. // And there're many places adding DefaultXXXExecutionProvider() to execution_providers directly. 
- if (execution_providers != nullptr || execution_providers->empty()) { + if (execution_providers != nullptr) { execution_providers->erase(std::remove(execution_providers->begin(), execution_providers->end(), nullptr), execution_providers->end()); + if (execution_providers->size() == 0) { + // In fact, no ep is needed to run + return; + } } Run(so, expect_result, expected_failure_string, excluded_provider_types, run_options, execution_providers, options); diff --git a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc index 05cfb5c13d689..7e1a2384d7fc6 100644 --- a/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/grid_sample_test.cc @@ -15,11 +15,13 @@ std::vector> GetExecutionProviders(int opset execution_providers.emplace_back(DefaultCpuExecutionProvider()); #ifdef USE_CUDA - if (opset_version < 20) { - execution_providers.emplace_back(DefaultCudaExecutionProvider()); + if (DefaultCudaExecutionProvider() != nullptr) { + if (opset_version < 20) { + execution_providers.emplace_back(DefaultCudaExecutionProvider()); #ifdef ENABLE_CUDA_NHWC_OPS - execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); + execution_providers.push_back(DefaultCudaNHWCExecutionProvider()); #endif + } } #endif From 839dcbf4332546aae37d997800842a99c33d3c86 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 00:06:03 +0800 Subject: [PATCH 41/65] check cudaep 2 --- onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc index ce474cc75431b..4f5967c23cb37 100644 --- a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc @@ -61,10 +61,9 @@ void RunTestForInference(const std::vector& input_dims, bool 
has_ratio std::vector> test_eps; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() == nullptr) { - return; + if (DefaultCudaExecutionProvider() != nullptr) { + test_eps.emplace_back(DefaultCudaExecutionProvider()); } - test_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM test_eps.emplace_back(DefaultRocmExecutionProvider()); #endif @@ -125,7 +124,9 @@ void RunTestForTraining(const std::vector& input_dims) { std::vector> dropout_eps; #ifdef USE_CUDA - dropout_eps.emplace_back(DefaultCudaExecutionProvider()); + if (DefaultCudaExecutionProvider() != nullptr) { + dropout_eps.emplace_back(DefaultCudaExecutionProvider()); + } #elif USE_ROCM dropout_eps.emplace_back(DefaultRocmExecutionProvider()); #endif From 08064f2e280c4d5bb797af77ff4900dc4882e578 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 09:45:52 +0800 Subject: [PATCH 42/65] update tests --- .../contrib_ops/bitmask_dropout_op_test.cc | 5 ++- .../test/contrib_ops/tensor_op_test.cc | 20 +++++++++- .../test/framework/memcpy_transformer_test.cc | 37 +++++++++++++++++++ .../providers/compare_provider_test_utils.cc | 5 +++ .../providers/cpu/tensor/gather_op_test.cc | 13 ++++--- 5 files changed, 72 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc index 4f5967c23cb37..26b0e3a4dd7a9 100644 --- a/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc +++ b/onnxruntime/test/contrib_ops/bitmask_dropout_op_test.cc @@ -124,9 +124,10 @@ void RunTestForTraining(const std::vector& input_dims) { std::vector> dropout_eps; #ifdef USE_CUDA - if (DefaultCudaExecutionProvider() != nullptr) { - dropout_eps.emplace_back(DefaultCudaExecutionProvider()); + if (DefaultCudaExecutionProvider() == nullptr) { + return; } + dropout_eps.emplace_back(DefaultCudaExecutionProvider()); #elif USE_ROCM dropout_eps.emplace_back(DefaultRocmExecutionProvider()); #endif diff --git 
a/onnxruntime/test/contrib_ops/tensor_op_test.cc b/onnxruntime/test/contrib_ops/tensor_op_test.cc index bc2ff5f4f724d..a86860ceffa41 100644 --- a/onnxruntime/test/contrib_ops/tensor_op_test.cc +++ b/onnxruntime/test/contrib_ops/tensor_op_test.cc @@ -121,7 +121,15 @@ void MeanVarianceNormalizationAcrossChannels(bool across_channels, bool normaliz test.AddAttribute("normalize_variance", normalize_variance ? one : zero); test.AddInput("input", {N, C, H, W}, X); test.AddOutput("output", {N, C, H, W}, result); +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider}); + } else if (DefaultDmlExecutionProvider() == nullptr) { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider}); + } +#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator. +#endif } void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_variance) { @@ -188,7 +196,15 @@ void MeanVarianceNormalizationPerChannel(bool across_channels, bool normalize_va test.AddAttribute("normalize_variance", normalize_variance ? 
one : zero); test.AddInput("input", {N, C, H, W}, X); test.AddOutput("output", {N, C, H, W}, result); +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kCudaExecutionProvider, kTensorrtExecutionProvider}); + } else if (DefaultDmlExecutionProvider() == nullptr) { + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kDmlExecutionProvider, kTensorrtExecutionProvider}); + } +#else test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider, kTensorrtExecutionProvider}); // OpenVINO doesn't support MVN operator below opset 9. TensorRT doesn't support opset 8 of MVN operator. +#endif } TEST(MVNContribOpTest, MeanVarianceNormalizationCPUTest_Version1_TO_8) { @@ -230,7 +246,9 @@ TEST(UnfoldTensorOpTest, LastDim) { std::vector> execution_providers; #ifdef USE_CUDA - execution_providers.push_back(DefaultCudaExecutionProvider()); + if (DefaultCudaExecutionProvider() != nullptr) { + execution_providers.push_back(DefaultCudaExecutionProvider()); + } #endif execution_providers.push_back(DefaultCpuExecutionProvider()); tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc index 6e86e5b58aead..3bb6bb2ffd097 100644 --- a/onnxruntime/test/framework/memcpy_transformer_test.cc +++ b/onnxruntime/test/framework/memcpy_transformer_test.cc @@ -106,12 +106,24 @@ TEST(TransformerTest, MemcpyTransformerTest) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() != nullptr) { + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); + } +#else 
ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); +#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; ASSERT_STATUS_OK(test_registry_manager.RegisterKernels(execution_providers)); +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif + MemcpyTransformer transformer({onnxruntime::kCudaExecutionProvider}, test_registry_manager); bool modified = false; @@ -161,7 +173,14 @@ TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -281,7 +300,13 @@ TEST(TransformerTest, TestInitializerDuplicationInSubgraph) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -323,7 +348,13 @@ TEST(TransformerTest, MemcpyTransformerTestGraphInputConsumedOnMultipleDevices) KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; +#if defined(USE_CUDA) && defined(USE_DML) + if 
(DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; @@ -425,7 +456,13 @@ TEST(TransformerTest, MemcpyTransformerTestImplicitInputConsumedOnMultipleDevice KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); + ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, std::make_unique(CPUExecutionProviderInfo()))); KernelRegistryManager test_registry_manager; diff --git a/onnxruntime/test/providers/compare_provider_test_utils.cc b/onnxruntime/test/providers/compare_provider_test_utils.cc index 386a5656d8a01..9acb37c24ddd0 100644 --- a/onnxruntime/test/providers/compare_provider_test_utils.cc +++ b/onnxruntime/test/providers/compare_provider_test_utils.cc @@ -53,6 +53,11 @@ void CompareOpTester::CompareWithCPU(const std::string& target_provider_type, SetTestFunctionCalled(); std::unique_ptr target_execution_provider = GetExecutionProvider(target_provider_type); +#if defined(USE_CUDA) && defined(USE_DML) + if (target_execution_provider == nullptr) { + return; + } +#endif ASSERT_TRUE(target_execution_provider != nullptr) << "provider_type " << target_provider_type << " is not supported."; diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc index be79a6d29d539..ef5ab61e2eb01 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc @@ -123,13 +123,16 @@ 
TEST(GatherOpTest, Gather_invalid_index_gpu) { 0.0f, 0.0f, 0.0f, 0.0f}); // On GPU, just set the value to 0 instead of report error. exclude all other providers - test #if defined(USE_CUDA) - .ConfigEp(DefaultCudaExecutionProvider()) -#else - .ConfigEp(DefaultRocmExecutionProvider()) + if (DefaultCudaExecutionProvider() != nullptr) { + test.ConfigEp(DefaultCudaExecutionProvider()) + .ConfigEp(DefaultRocmExecutionProvider()) + .RunWithConfig(); + } else { + test.ConfigEp(DefaultRocmExecutionProvider()) + .RunWithConfig(); + } #endif - .RunWithConfig(); } #endif From be93bd966a594bf713019c2d31e9c115e21071b2 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 09:46:51 +0800 Subject: [PATCH 43/65] lint --- onnxruntime/test/contrib_ops/tensor_op_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/contrib_ops/tensor_op_test.cc b/onnxruntime/test/contrib_ops/tensor_op_test.cc index a86860ceffa41..d5e2ddebfe67f 100644 --- a/onnxruntime/test/contrib_ops/tensor_op_test.cc +++ b/onnxruntime/test/contrib_ops/tensor_op_test.cc @@ -248,7 +248,7 @@ TEST(UnfoldTensorOpTest, LastDim) { #ifdef USE_CUDA if (DefaultCudaExecutionProvider() != nullptr) { execution_providers.push_back(DefaultCudaExecutionProvider()); - } + } #endif execution_providers.push_back(DefaultCpuExecutionProvider()); tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); From 8e77a6c051cadb7f98395e305e0551d6e81b8e76 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 10:51:24 +0800 Subject: [PATCH 44/65] update tests --- .../test/contrib_ops/beam_search_test.cc | 10 ++++ .../test/framework/allocation_planner_test.cc | 55 +++++++++++++++++++ .../test/framework/inference_session_test.cc | 14 ++++- 3 files changed, 76 insertions(+), 3 deletions(-) diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc index 5f94d30112f0e..ee02c5f1078fe 100644 --- 
a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -73,6 +73,11 @@ TEST(BeamSearchTest, GptBeamSearchFp32) { const char* const output_names[] = {"sequences"}; Ort::SessionOptions session_options; +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() != nullptr) { + return; + } +#endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; cuda_options.use_tf32 = false; @@ -166,6 +171,11 @@ TEST(BeamSearchTest, GptBeamSearchFp16) { bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() != nullptr) { + return; + } +#endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; cuda_options.use_tf32 = false; diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 407f61b8eb1ef..9d9e3c825b05f 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -1279,6 +1279,12 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM // Test MultiStream scenario for the graph: // node1(CPU ep)->node2(CPU ep)->node3(CUDA ep)->node4(CPU ep) TEST_F(PlannerTest, MultiStream) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif + ONNX_NAMESPACE::TensorProto tensor; tensor.add_dims(1); tensor.add_float_data(1.0f); @@ -1297,6 +1303,7 @@ TEST_F(PlannerTest, MultiStream) { onnxruntime::ProviderInfo_CUDA& ep = onnxruntime::GetProviderInfo_CUDA(); auto epFactory = ep.CreateExecutionProviderFactory(epi); std::unique_ptr execution_provider = epFactory->CreateProvider(); + ORT_THROW_IF_ERROR(GetExecutionProviders().Add("CUDAExecutionProvider", std::move(execution_provider))); CreatePlan({}, false); @@ -1324,6 +1331,11 
@@ TEST_F(PlannerTest, MultiStream) { // node3 // All 3 nodes are CUDA EP, node1 is in stream0, node2 is in stream1, node3 is in stream2 TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::unique_ptr<::onnxruntime::KernelDef> cudaKernelAdd = KernelDefBuilder().SetName("Add").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::string Graph_input("Graph_input"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); @@ -1365,6 +1377,11 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { // stream 1: node2 (CPU EP) // node1's output, which is consumed by both node2 and node3, is in CPU. TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json"); EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams"; EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan[0]->steps_.size(), 5) << "stream 0 has 5 steps"; @@ -1386,6 +1403,11 @@ TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { // TODO(leca): there is a bug in the corresponding graph that node2 will be visited twice when traversing node1's output nodes // (see: for (auto it = node->OutputNodesBegin(); it != node->OutputNodesEnd(); ++it) in BuildExecutionPlan()). 
We can just break the loop and don't need the extra variables once it is fixed TEST_F(PlannerTest, MultiStreamMultiOutput) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("RNN").Provider(kCudaExecutionProvider).SinceVersion(7).Build(); std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"); std::vector input1{Arg(Graph_input1), Arg(Graph_input2), Arg(Graph_input3)}, output1{Arg(Arg1), Arg(Arg2)}, input2{Arg(Arg1), Arg(Arg2)}, output2{Arg(Arg3)}; @@ -1423,6 +1445,11 @@ TEST_F(PlannerTest, MultiStreamMultiOutput) { // TODO(leca): the ideal case is there is only 1 wait step before launching node3, // as there is a specific order between node1 and node2 if they are in the same stream, thus node3 will only need to wait the latter one TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); std::vector input1{Arg(Graph_input1)}, input2{Arg(Graph_input2)}, output1{Arg(Arg1)}, output2{Arg(Arg2)}, input3{Arg(Arg1), Arg(Arg2)}, output3{Arg(Arg3)}; @@ -1460,6 +1487,11 @@ TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) #if !defined(__wasm__) && defined(ORT_ENABLE_STREAM) TEST_F(PlannerTest, ParaPlanCreation) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == 
nullptr) { + return; + } +#endif TypeProto graph_in_type; graph_in_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); auto* graph_in_shape = graph_in_type.mutable_tensor_type()->mutable_shape(); @@ -1901,6 +1933,12 @@ TEST_F(PlannerTest, ParaPlanCreation) { } TEST_F(PlannerTest, TestMultiStreamConfig) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif + const char* type = "DeviceBasedPartitioner"; constexpr size_t type_len = 22; @@ -1974,6 +2012,12 @@ TEST_F(PlannerTest, TestMultiStreamSaveConfig) { // Load with partition config where a node is missing, session load expected to fail. TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif + const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_missing_node.json"; SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; @@ -1994,6 +2038,11 @@ TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { // Load with partition config where streams and devices has mismatch TEST_F(PlannerTest, TestMultiStreamMismatchDevice) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_mismatch_device.json"; SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; @@ -2082,6 +2131,12 @@ TEST_F(PlannerTest, TestCpuIf) { // onnx.save(model, 'issue_19480.onnx') // TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif + SessionOptions sess_opt; sess_opt.graph_optimization_level = TransformerLevel::Default; diff --git 
a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index ac5a5933cc107..06ab4e3ece099 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -2113,6 +2113,11 @@ TEST(InferenceSessionTests, TestStrictShapeInference) { #ifdef USE_CUDA // disable it, since we are going to enable parallel execution with cuda ep TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; SessionOptions so; @@ -2120,9 +2125,6 @@ TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { so.session_logid = "InferenceSessionTests.TestParallelExecutionWithCudaProvider"; InferenceSession session_object{so, GetEnvironment()}; - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } ASSERT_STATUS_OK(session_object.RegisterExecutionProvider(DefaultCudaExecutionProvider())); ASSERT_STATUS_OK(session_object.Load(model_uri)); @@ -2139,6 +2141,12 @@ TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { } TEST(InferenceSessionTests, TestArenaShrinkageAfterRun) { +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif + OrtArenaCfg arena_cfg; arena_cfg.arena_extend_strategy = 1; // kSameAsRequested From ff784463a6d4aa11d0a2cf461c55f8c0975b29ad Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 11:05:21 +0800 Subject: [PATCH 45/65] typo --- onnxruntime/test/contrib_ops/beam_search_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc index ee02c5f1078fe..09d4fd470affd 100644 --- 
a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -74,7 +74,7 @@ TEST(BeamSearchTest, GptBeamSearchFp32) { Ort::SessionOptions session_options; #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() != nullptr) { + if (DefaultCudaExecutionProvider() == nullptr) { return; } #endif @@ -172,7 +172,7 @@ TEST(BeamSearchTest, GptBeamSearchFp16) { if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() != nullptr) { + if (DefaultCudaExecutionProvider() == nullptr) { return; } #endif From 9e1bafc8ff17b031cb019ae884f316875cc2b05a Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 15:38:13 +0800 Subject: [PATCH 46/65] cuda log --- onnxruntime/core/providers/cuda/cuda_allocator.cc | 3 +++ onnxruntime/core/providers/cuda/cuda_execution_provider.cc | 7 +++++++ onnxruntime/core/providers/cuda/cuda_provider_factory.cc | 7 ++++++- onnxruntime/core/providers/cuda/cuda_stream_handle.cc | 3 +++ 4 files changed, 19 insertions(+), 1 deletion(-) diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.cc b/onnxruntime/core/providers/cuda/cuda_allocator.cc index 8c96d8f57a0ba..557f449799052 100644 --- a/onnxruntime/core/providers/cuda/cuda_allocator.cc +++ b/onnxruntime/core/providers/cuda/cuda_allocator.cc @@ -24,6 +24,9 @@ void CUDAAllocator::CheckDevice(bool throw_when_fail) const { } void CUDAAllocator::SetDevice(bool throw_when_fail) const { +#if defined(USE_CUDA) && defined(USE_DML) + LOGS_DEFAULT(WARNING) << "CUDA SetDevice is called"; +#endif int current_device; auto cuda_err = cudaGetDevice(¤t_device); if (cuda_err == cudaSuccess) { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index d3f01c1f7adc1..fef5dfe93097b 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ 
b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -173,6 +173,9 @@ AllocatorPtr CUDAExecutionProvider::CreateCudaAllocator(OrtDevice::DeviceId devi CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t /*gpu_mem_limit*/, ArenaExtendStrategy /*arena_extend_strategy*/, CUDAExecutionProviderExternalAllocatorInfo /*external_allocator_info*/, OrtArenaCfg* /*default_memory_arena_cfg*/) { +#if defined(USE_CUDA) && defined(USE_DML) + LOGS_DEFAULT(WARNING) << "CUDA PerThreadContext is called"; +#endif CUDA_CALL_THROW(cudaSetDevice(device_id)); #ifndef USE_CUDA_MINIMAL CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); @@ -279,6 +282,10 @@ CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in ORT_ENFORCE(info_.prefer_nhwc == 0, "This build does not support NHWC layout"); #endif +#if defined(USE_CUDA) && defined(USE_DML) + LOGS_DEFAULT(WARNING) << "CUDA ctor is called"; +#endif + CUDA_CALL_THROW(cudaSetDevice(info_.device_id)); // must wait GPU idle, otherwise cudaGetDeviceProperties might fail diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 83a5d02d16c6c..24d4daa336817 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -52,6 +52,9 @@ std::unique_ptr CUDAProviderFactory::CreateProvider() { struct ProviderInfo_CUDA_Impl final : ProviderInfo_CUDA { OrtStatus* SetCurrentGpuDeviceId(_In_ int device_id) override { +#if defined(USE_CUDA) && defined(USE_DML) + LOGS_DEFAULT(WARNING) << "CUDA SetCurrentGpuDeviceId is called"; +#endif int num_devices; auto cuda_err = ::cudaGetDeviceCount(&num_devices); if (cuda_err != cudaSuccess) { @@ -112,7 +115,9 @@ struct ProviderInfo_CUDA_Impl final : ProviderInfo_CUDA { void CopyGpuToCpu(void* dst_ptr, const void* src_ptr, const size_t size, const OrtMemoryInfo& dst_location, 
const OrtMemoryInfo& src_location) override { ORT_ENFORCE(dst_location.device.Type() == OrtDevice::CPU); - +#if defined(USE_CUDA) && defined(USE_DML) + LOGS_DEFAULT(WARNING) << "CUDA CopyGpuToCpu is called"; +#endif // Current CUDA device. int device; CUDA_CALL_THROW(cudaGetDevice(&device)); diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index e9b159516dad9..9be2f9ab3199c 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -245,6 +245,9 @@ void RegisterCudaStreamHandles(IStreamCommandHandleRegistry& stream_handle_regis cudnnHandle_t external_cudnn_handle, cublasHandle_t external_cublas_handle, const CUDAExecutionProviderInfo& ep_info) { +#if defined(USE_CUDA) && defined(USE_DML) + LOGS_DEFAULT(WARNING) << "CUDA RegisterCudaStreamHandles is called"; +#endif // wait cuda notification on cuda ep stream_handle_registry.RegisterWaitFn(device_type, device_type, WaitCudaNotificationOnDevice); // wait cuda notification on cpu ep From 64ade9ba0efd862fc9a87f8ca1229ec7c2b26e05 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 17:35:45 +0800 Subject: [PATCH 47/65] update 1 test case --- .../providers/cpu/tensor/gather_op_test.cc | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc index ef5ab61e2eb01..ae838f10b4153 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc @@ -122,17 +122,19 @@ TEST(GatherOpTest, Gather_invalid_index_gpu) { 4.0f, 5.0f, 6.0f, 7.0f, 0.0f, 0.0f, 0.0f, 0.0f}); +#if defined(USE_CUDA) && defined(USE_DML) + if (DefaultCudaExecutionProvider() == nullptr) { + return; + } +#endif // On GPU, just set the value to 0 instead of report error. 
exclude all other providers + test #if defined(USE_CUDA) - if (DefaultCudaExecutionProvider() != nullptr) { - test.ConfigEp(DefaultCudaExecutionProvider()) - .ConfigEp(DefaultRocmExecutionProvider()) - .RunWithConfig(); - } else { - test.ConfigEp(DefaultRocmExecutionProvider()) - .RunWithConfig(); - } + .ConfigEp(DefaultCudaExecutionProvider()) +#else + .ConfigEp(DefaultRocmExecutionProvider()) #endif + .RunWithConfig(); } #endif From aee739253012c152afffce5d2af2e468c74e4624 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 17:39:10 +0800 Subject: [PATCH 48/65] update --- tools/ci_build/github/azure-pipelines/templates/win-ci.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 8e38604f90fce..88bcdf3927248 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -401,7 +401,7 @@ stages: arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' workingDirectory: '$(Build.BinariesDirectory)' env: - NO_CUDA_TEST: 1 + NO_CUDA_TEST: '1' - task: PythonScript@0 displayName: 'test excludes DML' condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) @@ -410,7 +410,7 @@ stages: arguments: '--config RelWithDebInfo --use_binskim_compliant_compile_flags --enable_lto --disable_rtti --build_dir $(Build.BinariesDirectory) --skip_submodule_sync --build_shared_lib --test --enable_onnx_tests $(TelemetryOption) ' workingDirectory: '$(Build.BinariesDirectory)' env: - NO_DML_TEST: 1 + NO_DML_TEST: '1' - ${{ else }}: - task: PythonScript@0 displayName: 'test' From a8c6e929cb9da44fa78e6196344f739fa8041fb7 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 
20:17:50 +0800 Subject: [PATCH 49/65] Revert "cuda log" This reverts commit 9e1bafc8ff17b031cb019ae884f316875cc2b05a. --- onnxruntime/core/providers/cuda/cuda_allocator.cc | 3 --- onnxruntime/core/providers/cuda/cuda_execution_provider.cc | 7 ------- onnxruntime/core/providers/cuda/cuda_provider_factory.cc | 7 +------ onnxruntime/core/providers/cuda/cuda_stream_handle.cc | 3 --- 4 files changed, 1 insertion(+), 19 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_allocator.cc b/onnxruntime/core/providers/cuda/cuda_allocator.cc index 557f449799052..8c96d8f57a0ba 100644 --- a/onnxruntime/core/providers/cuda/cuda_allocator.cc +++ b/onnxruntime/core/providers/cuda/cuda_allocator.cc @@ -24,9 +24,6 @@ void CUDAAllocator::CheckDevice(bool throw_when_fail) const { } void CUDAAllocator::SetDevice(bool throw_when_fail) const { -#if defined(USE_CUDA) && defined(USE_DML) - LOGS_DEFAULT(WARNING) << "CUDA SetDevice is called"; -#endif int current_device; auto cuda_err = cudaGetDevice(¤t_device); if (cuda_err == cudaSuccess) { diff --git a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc index fef5dfe93097b..d3f01c1f7adc1 100644 --- a/onnxruntime/core/providers/cuda/cuda_execution_provider.cc +++ b/onnxruntime/core/providers/cuda/cuda_execution_provider.cc @@ -173,9 +173,6 @@ AllocatorPtr CUDAExecutionProvider::CreateCudaAllocator(OrtDevice::DeviceId devi CUDAExecutionProvider::PerThreadContext::PerThreadContext(OrtDevice::DeviceId device_id, cudaStream_t stream, size_t /*gpu_mem_limit*/, ArenaExtendStrategy /*arena_extend_strategy*/, CUDAExecutionProviderExternalAllocatorInfo /*external_allocator_info*/, OrtArenaCfg* /*default_memory_arena_cfg*/) { -#if defined(USE_CUDA) && defined(USE_DML) - LOGS_DEFAULT(WARNING) << "CUDA PerThreadContext is called"; -#endif CUDA_CALL_THROW(cudaSetDevice(device_id)); #ifndef USE_CUDA_MINIMAL CUBLAS_CALL_THROW(cublasCreate(&cublas_handle_)); @@ -282,10 +279,6 @@ 
CUDAExecutionProvider::CUDAExecutionProvider(const CUDAExecutionProviderInfo& in ORT_ENFORCE(info_.prefer_nhwc == 0, "This build does not support NHWC layout"); #endif -#if defined(USE_CUDA) && defined(USE_DML) - LOGS_DEFAULT(WARNING) << "CUDA ctor is called"; -#endif - CUDA_CALL_THROW(cudaSetDevice(info_.device_id)); // must wait GPU idle, otherwise cudaGetDeviceProperties might fail diff --git a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc index 24d4daa336817..83a5d02d16c6c 100644 --- a/onnxruntime/core/providers/cuda/cuda_provider_factory.cc +++ b/onnxruntime/core/providers/cuda/cuda_provider_factory.cc @@ -52,9 +52,6 @@ std::unique_ptr CUDAProviderFactory::CreateProvider() { struct ProviderInfo_CUDA_Impl final : ProviderInfo_CUDA { OrtStatus* SetCurrentGpuDeviceId(_In_ int device_id) override { -#if defined(USE_CUDA) && defined(USE_DML) - LOGS_DEFAULT(WARNING) << "CUDA SetCurrentGpuDeviceId is called"; -#endif int num_devices; auto cuda_err = ::cudaGetDeviceCount(&num_devices); if (cuda_err != cudaSuccess) { @@ -115,9 +112,7 @@ struct ProviderInfo_CUDA_Impl final : ProviderInfo_CUDA { void CopyGpuToCpu(void* dst_ptr, const void* src_ptr, const size_t size, const OrtMemoryInfo& dst_location, const OrtMemoryInfo& src_location) override { ORT_ENFORCE(dst_location.device.Type() == OrtDevice::CPU); -#if defined(USE_CUDA) && defined(USE_DML) - LOGS_DEFAULT(WARNING) << "CUDA CopyGpuToCpu is called"; -#endif + // Current CUDA device. 
int device; CUDA_CALL_THROW(cudaGetDevice(&device)); diff --git a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc index 9be2f9ab3199c..e9b159516dad9 100644 --- a/onnxruntime/core/providers/cuda/cuda_stream_handle.cc +++ b/onnxruntime/core/providers/cuda/cuda_stream_handle.cc @@ -245,9 +245,6 @@ void RegisterCudaStreamHandles(IStreamCommandHandleRegistry& stream_handle_regis cudnnHandle_t external_cudnn_handle, cublasHandle_t external_cublas_handle, const CUDAExecutionProviderInfo& ep_info) { -#if defined(USE_CUDA) && defined(USE_DML) - LOGS_DEFAULT(WARNING) << "CUDA RegisterCudaStreamHandles is called"; -#endif // wait cuda notification on cuda ep stream_handle_registry.RegisterWaitFn(device_type, device_type, WaitCudaNotificationOnDevice); // wait cuda notification on cpu ep From 603e0c20893daf7d3e49725e563aedbddb00a27d Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 20:35:40 +0800 Subject: [PATCH 50/65] update java test --- .../java/ai/onnxruntime/InferenceTest.java | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index 4b391987cc826..81f483fdb865f 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -690,7 +690,12 @@ public void testSymbolicDimensionAssignment() throws OrtException { @Test @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1") public void testCUDA() throws OrtException { - runProvider(OrtProvider.CUDA); + String no_cuda_test = Optional.ofNullable(System.getenv("NO_CUDA_TEST")).orElse("0"); + if (!no_cuda_test.equals("1")) { + runProvider(OrtProvider.CUDA); + } else { + System.out.println("Skipping CUDA test because NO_CUDA_TEST is set."); + } } @Test @@ -740,14 +745,11 @@ public void testCoreML() throws OrtException { @Test 
@EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { - String no_cuda_test = System.getenv("NO_CUDA_TEST"); - if (no_cuda_test == null || no_cuda_test.isEmpty() || !no_cuda_test.equals("1")) { - if (System.getProperty("USE_CUDA").equals("1")) { - System.out.println("Skipping DirectML test because CUDA EP test is enabled."); - return; - } + String no_dml_test = Optional.ofNullable(System.getenv("NO_DML_TEST")).orElse("0");; + if (!no_dml_test.equals("1")) { + runProvider(OrtProvider.DIRECT_ML); } else { - runProvider(OrtProvider.CORE_ML); + System.out.println("Skipping DML test because NO_DML_TEST is set."); } } From 31fb04ba5536751f774e42a681a3de415db6b206 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 20:45:24 +0800 Subject: [PATCH 51/65] typo --- java/src/test/java/ai/onnxruntime/InferenceTest.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index 81f483fdb865f..23b1dda1f73d6 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -745,7 +745,7 @@ public void testCoreML() throws OrtException { @Test @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { - String no_dml_test = Optional.ofNullable(System.getenv("NO_DML_TEST")).orElse("0");; + String no_dml_test = Optional.ofNullable(System.getenv("NO_DML_TEST")).orElse("0"); if (!no_dml_test.equals("1")) { runProvider(OrtProvider.DIRECT_ML); } else { From 659131f57c2f5afb8bf2bc57d4f1c1b44fad1b02 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 22:27:14 +0800 Subject: [PATCH 52/65] java test --- java/src/test/java/ai/onnxruntime/InferenceTest.java | 9 ++++++++- java/src/test/java/ai/onnxruntime/OnnxTensorTest.java | 1 + .../ai/onnxruntime/providers/ProviderOptionsTest.java | 6 
++++++ 3 files changed, 15 insertions(+), 1 deletion(-) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index 23b1dda1f73d6..5fe1f69c8a0da 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -650,7 +650,14 @@ public void testProviders() { int providersSize = providers.size(); assertTrue(providersSize > 0); assertTrue(providers.contains(OrtProvider.CPU)); - + String no_cuda_test = Optional.ofNullable(System.getenv("NO_CUDA_TEST")).orElse("0"); + if (no_cuda_test.equals("1") && providers.contains(OrtProvider.CUDA)) { + providers.remove(OrtProvider.CUDA); + } + String no_dml_test = Optional.ofNullable(System.getenv("NO_DML_TEST")).orElse("0"); + if (no_dml_test.equals("1") && providers.contains(OrtProvider.DIRECT_ML)) { + providers.remove(OrtProvider.DIRECT_ML); + } // Check that the providers are a copy of the original, note this does not enable the DNNL // provider providers.add(OrtProvider.DNNL); diff --git a/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java index 11af2726bd904..a2d9532ce93b0 100644 --- a/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java +++ b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java @@ -17,6 +17,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; +import java.util.Optional; import java.util.SplittableRandom; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java index 57c4eb3577fd0..d58bc87247d91 100644 --- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java +++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java @@ -26,6 +26,8 @@ import java.util.EnumSet; import java.util.HashMap; 
import java.util.Map; +import java.util.Optional; + import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIfSystemProperty; @@ -35,6 +37,10 @@ public class ProviderOptionsTest { @Test @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1") public void testCUDAOptions() throws OrtException { + String no_cuda_test = Optional.ofNullable(System.getenv("NO_CUDA_TEST")).orElse("0"); + if (no_cuda_test.equals("1")) { + return; + } // Test standard options OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0); cudaOpts.add("gpu_mem_limit", "" + (512 * 1024 * 1024)); From f8f3ac1d43fb9d662a004df52c51f3e0aff14bcd Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 22:27:48 +0800 Subject: [PATCH 53/65] java lint --- java/src/test/java/ai/onnxruntime/OnnxTensorTest.java | 1 - .../test/java/ai/onnxruntime/providers/ProviderOptionsTest.java | 1 - 2 files changed, 2 deletions(-) diff --git a/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java index a2d9532ce93b0..11af2726bd904 100644 --- a/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java +++ b/java/src/test/java/ai/onnxruntime/OnnxTensorTest.java @@ -17,7 +17,6 @@ import java.util.ArrayList; import java.util.Collections; import java.util.List; -import java.util.Optional; import java.util.SplittableRandom; import org.junit.jupiter.api.Assertions; import org.junit.jupiter.api.Test; diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java index d58bc87247d91..e6baaa9cb0c60 100644 --- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java +++ b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java @@ -27,7 +27,6 @@ import java.util.HashMap; import java.util.Map; import java.util.Optional; - import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIfSystemProperty; From 
fe2f0a5928c556700508e2c5e92bcacd55c821a7 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Tue, 29 Oct 2024 23:42:57 +0800 Subject: [PATCH 54/65] split java test --- cmake/onnxruntime_java_unittests.cmake | 46 +++++++++++++++++--------- 1 file changed, 31 insertions(+), 15 deletions(-) diff --git a/cmake/onnxruntime_java_unittests.cmake b/cmake/onnxruntime_java_unittests.cmake index 7b57cf71e5ef0..0c7e2c3b1330e 100644 --- a/cmake/onnxruntime_java_unittests.cmake +++ b/cmake/onnxruntime_java_unittests.cmake @@ -6,26 +6,42 @@ FILE(TO_NATIVE_PATH ${GRADLE_EXECUTABLE} GRADLE_NATIVE_PATH) FILE(TO_NATIVE_PATH ${BIN_DIR} BINDIR_NATIVE_PATH) +function(run_java_unit_test SYSTEM_PROPERTY_DEFINITION) + set(GRADLE_TEST_ARGS + ${GRADLE_NATIVE_PATH} + test --rerun + cmakeCheck + --console=plain + -DcmakeBuildDir=${BINDIR_NATIVE_PATH} + -Dorg.gradle.daemon=false + ${SYSTEM_PROPERTY_DEFINITIONS}) + + if(WIN32) + list(PREPEND GRADLE_TEST_ARGS cmd /C) + endif() + + message(STATUS "gradle test command args: ${GRADLE_TEST_ARGS}") + + execute_process(COMMAND ${GRADLE_TEST_ARGS} + WORKING_DIRECTORY ${REPO_ROOT}/java + RESULT_VARIABLE HAD_ERROR) +endfunction() + message(STATUS "gradle additional system property definitions: ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}") -set(GRADLE_TEST_ARGS - ${GRADLE_NATIVE_PATH} - test --rerun - cmakeCheck - --console=plain - -DcmakeBuildDir=${BINDIR_NATIVE_PATH} - -Dorg.gradle.daemon=false - ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) +string(FIND "${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}" "-DUSE_CUDA=1" INDEX_CUDA) +string(FIND "${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}" "-DUSE_DML=1" INDEX_DML) -if(WIN32) - list(PREPEND GRADLE_TEST_ARGS cmd /C) -endif() -message(STATUS "gradle test command args: ${GRADLE_TEST_ARGS}") +if((INDEX_CUDA GREATER -1) AND (INDEX_DML GREATER -1)) + run_java_unit_test(${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) +else() + string(REPLACE "-DUSE_CUDA=1" "" GRADLE_DML_YSTEM_PROPERTY_DEFINITIONS ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) + 
run_java_unit_test(${GRADLE_DML_YSTEM_PROPERTY_DEFINITIONS}) -execute_process(COMMAND ${GRADLE_TEST_ARGS} - WORKING_DIRECTORY ${REPO_ROOT}/java - RESULT_VARIABLE HAD_ERROR) + string(REPLACE "-DUSE_DML=1" "" GRADLE_CUDA_YSTEM_PROPERTY_DEFINITIONS ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) + run_java_unit_test(${GRADLE_CUDA_YSTEM_PROPERTY_DEFINITIONS}) +endif() if(HAD_ERROR) message(FATAL_ERROR "Java Unitests failed") From b84eba7c06c4d84a9e28cda38a4deef6ec8466bc Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 30 Oct 2024 00:11:12 +0800 Subject: [PATCH 55/65] update --- cmake/onnxruntime_java_unittests.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/onnxruntime_java_unittests.cmake b/cmake/onnxruntime_java_unittests.cmake index 0c7e2c3b1330e..e3ff4322ddb00 100644 --- a/cmake/onnxruntime_java_unittests.cmake +++ b/cmake/onnxruntime_java_unittests.cmake @@ -34,13 +34,13 @@ string(FIND "${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}" "-DUSE_DML=1" INDEX_DML) if((INDEX_CUDA GREATER -1) AND (INDEX_DML GREATER -1)) - run_java_unit_test(${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) -else() string(REPLACE "-DUSE_CUDA=1" "" GRADLE_DML_YSTEM_PROPERTY_DEFINITIONS ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) run_java_unit_test(${GRADLE_DML_YSTEM_PROPERTY_DEFINITIONS}) string(REPLACE "-DUSE_DML=1" "" GRADLE_CUDA_YSTEM_PROPERTY_DEFINITIONS ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) run_java_unit_test(${GRADLE_CUDA_YSTEM_PROPERTY_DEFINITIONS}) +else() + run_java_unit_test(${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) endif() if(HAD_ERROR) From 02a981330435030c091c526cfe06cd26362411de Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 30 Oct 2024 10:13:15 +0800 Subject: [PATCH 56/65] fix onnxruntime4j --- cmake/onnxruntime_java_unittests.cmake | 6 +++--- onnxruntime/test/util/default_providers.cc | 8 ++++---- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/cmake/onnxruntime_java_unittests.cmake b/cmake/onnxruntime_java_unittests.cmake index e3ff4322ddb00..dfd6ad91c99fa 
100644 --- a/cmake/onnxruntime_java_unittests.cmake +++ b/cmake/onnxruntime_java_unittests.cmake @@ -35,12 +35,12 @@ string(FIND "${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}" "-DUSE_DML=1" INDEX_DML) if((INDEX_CUDA GREATER -1) AND (INDEX_DML GREATER -1)) string(REPLACE "-DUSE_CUDA=1" "" GRADLE_DML_YSTEM_PROPERTY_DEFINITIONS ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) - run_java_unit_test(${GRADLE_DML_YSTEM_PROPERTY_DEFINITIONS}) + run_java_unit_test("${GRADLE_DML_YSTEM_PROPERTY_DEFINITIONS}") string(REPLACE "-DUSE_DML=1" "" GRADLE_CUDA_YSTEM_PROPERTY_DEFINITIONS ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) - run_java_unit_test(${GRADLE_CUDA_YSTEM_PROPERTY_DEFINITIONS}) + run_java_unit_test("${GRADLE_CUDA_YSTEM_PROPERTY_DEFINITIONS}") else() - run_java_unit_test(${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) + run_java_unit_test("${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}") endif() if(HAD_ERROR) diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index ec391c397abac..4aeaf670f3484 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -123,8 +123,8 @@ std::unique_ptr DefaultOpenVINOExecutionProvider() { std::unique_ptr DefaultCudaExecutionProvider() { #ifdef USE_CUDA #ifdef USE_DML - const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); - if (no_dml_ep_test != "1") { + const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); + if (no_cuda_ep_test == "1") { return nullptr; } #endif @@ -331,8 +331,8 @@ std::unique_ptr DefaultCannExecutionProvider() { std::unique_ptr DefaultDmlExecutionProvider() { #ifdef USE_DML #ifdef USE_CUDA - const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); - if (no_cuda_ep_test != "1") { + const std::string no_dml_ep_test = Env::Default().GetEnvironmentVar("NO_DML_TEST"); + if (no_dml_ep_test == "1") { return nullptr; } #endif From 171c36fca2ea4c589ef21e640b48bdc07244a13d Mon Sep 
17 00:00:00 2001 From: Yi Zhang Date: Wed, 30 Oct 2024 11:12:19 +0800 Subject: [PATCH 57/65] not using predefined marco for EP --- onnxruntime/core/session/lora_adapters.cc | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/onnxruntime/core/session/lora_adapters.cc b/onnxruntime/core/session/lora_adapters.cc index a095027a1d417..ef368d94b31c8 100644 --- a/onnxruntime/core/session/lora_adapters.cc +++ b/onnxruntime/core/session/lora_adapters.cc @@ -62,15 +62,16 @@ namespace { struct DataTransfer { std::unique_ptr ep; std::unique_ptr data_transfer; + bool is_dml = false; Status CopyTensor(const Tensor& src, Tensor& dst) const { return data_transfer->CopyTensor(src, dst); } Status Sync() const { -#if USE_DML - return ep->Sync(); -#else - return Status::OK(); -#endif + if (is_dml) { + return ep->Sync(); + } else { + return Status::OK(); + } } }; } // namespace @@ -94,6 +95,7 @@ static Status GetDataTransfer(const OrtMemoryInfo& mem_info, [[maybe_unused]] Da #ifdef USE_DML auto ep_factory = onnxruntime::DMLProviderFactoryCreator::Create(ConfigOptions{}, 0, false, false, false); dt.ep = ep_factory->CreateProvider(); + dt.is_dml = true; dt.data_transfer = dt.ep->GetDataTransfer(); #else status = ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "DML provider is not enabled in this build"); From c1e01441f7d3ccef88661d69637e0b30bff3215d Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 30 Oct 2024 11:12:58 +0800 Subject: [PATCH 58/65] update --- .../azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml index c2d44f4d249aa..d6b25c98936f0 100644 --- a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -50,7 +50,7 @@ 
stages: msbuildPlatform: x64 packageName: x64-cuda CudaVersion: ${{ parameters.CudaVersion }} - buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" --use_dml --build_csharp --parallel runTests: ${{ parameters.RunOnnxRuntimeTests }} buildJava: ${{ parameters.buildJava }} java_artifact_id: onnxruntime_gpu From 19c4a054a3e88120e97d63ab13b46bbab4eafd1f Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 31 Oct 2024 08:55:45 +0800 Subject: [PATCH 59/65] revert onnxruntime_java_unittests.cmake --- cmake/onnxruntime_java_unittests.cmake | 46 +++++++++----------------- 1 file changed, 15 insertions(+), 31 deletions(-) diff --git a/cmake/onnxruntime_java_unittests.cmake b/cmake/onnxruntime_java_unittests.cmake index dfd6ad91c99fa..7b57cf71e5ef0 100644 --- a/cmake/onnxruntime_java_unittests.cmake +++ b/cmake/onnxruntime_java_unittests.cmake @@ -6,42 +6,26 @@ FILE(TO_NATIVE_PATH ${GRADLE_EXECUTABLE} GRADLE_NATIVE_PATH) FILE(TO_NATIVE_PATH ${BIN_DIR} BINDIR_NATIVE_PATH) -function(run_java_unit_test SYSTEM_PROPERTY_DEFINITION) - set(GRADLE_TEST_ARGS - ${GRADLE_NATIVE_PATH} - test --rerun - cmakeCheck - --console=plain - -DcmakeBuildDir=${BINDIR_NATIVE_PATH} - -Dorg.gradle.daemon=false - ${SYSTEM_PROPERTY_DEFINITIONS}) - - if(WIN32) - list(PREPEND GRADLE_TEST_ARGS cmd /C) - endif() - - message(STATUS "gradle test command args: ${GRADLE_TEST_ARGS}") - - execute_process(COMMAND ${GRADLE_TEST_ARGS} - WORKING_DIRECTORY ${REPO_ROOT}/java - RESULT_VARIABLE HAD_ERROR) -endfunction() - message(STATUS "gradle additional system property definitions: ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}") -string(FIND "${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}" "-DUSE_CUDA=1" INDEX_CUDA) -string(FIND 
"${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}" "-DUSE_DML=1" INDEX_DML) +set(GRADLE_TEST_ARGS + ${GRADLE_NATIVE_PATH} + test --rerun + cmakeCheck + --console=plain + -DcmakeBuildDir=${BINDIR_NATIVE_PATH} + -Dorg.gradle.daemon=false + ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) +if(WIN32) + list(PREPEND GRADLE_TEST_ARGS cmd /C) +endif() -if((INDEX_CUDA GREATER -1) AND (INDEX_DML GREATER -1)) - string(REPLACE "-DUSE_CUDA=1" "" GRADLE_DML_YSTEM_PROPERTY_DEFINITIONS ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) - run_java_unit_test("${GRADLE_DML_YSTEM_PROPERTY_DEFINITIONS}") +message(STATUS "gradle test command args: ${GRADLE_TEST_ARGS}") - string(REPLACE "-DUSE_DML=1" "" GRADLE_CUDA_YSTEM_PROPERTY_DEFINITIONS ${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}) - run_java_unit_test("${GRADLE_CUDA_YSTEM_PROPERTY_DEFINITIONS}") -else() - run_java_unit_test("${GRADLE_SYSTEM_PROPERTY_DEFINITIONS}") -endif() +execute_process(COMMAND ${GRADLE_TEST_ARGS} + WORKING_DIRECTORY ${REPO_ROOT}/java + RESULT_VARIABLE HAD_ERROR) if(HAD_ERROR) message(FATAL_ERROR "Java Unitests failed") From 29a9a60c835e7e9bef515056033ef5b8742db167 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 31 Oct 2024 09:01:52 +0800 Subject: [PATCH 60/65] restore java test and disable testDML --- .../java/ai/onnxruntime/InferenceTest.java | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/java/src/test/java/ai/onnxruntime/InferenceTest.java b/java/src/test/java/ai/onnxruntime/InferenceTest.java index 5fe1f69c8a0da..15d89b536b39a 100644 --- a/java/src/test/java/ai/onnxruntime/InferenceTest.java +++ b/java/src/test/java/ai/onnxruntime/InferenceTest.java @@ -650,14 +650,7 @@ public void testProviders() { int providersSize = providers.size(); assertTrue(providersSize > 0); assertTrue(providers.contains(OrtProvider.CPU)); - String no_cuda_test = Optional.ofNullable(System.getenv("NO_CUDA_TEST")).orElse("0"); - if (no_cuda_test.equals("1") && providers.contains(OrtProvider.CUDA)) { - 
providers.remove(OrtProvider.CUDA); - } - String no_dml_test = Optional.ofNullable(System.getenv("NO_DML_TEST")).orElse("0"); - if (no_dml_test.equals("1") && providers.contains(OrtProvider.DIRECT_ML)) { - providers.remove(OrtProvider.DIRECT_ML); - } + // Check that the providers are a copy of the original, note this does not enable the DNNL // provider providers.add(OrtProvider.DNNL); @@ -697,12 +690,7 @@ public void testSymbolicDimensionAssignment() throws OrtException { @Test @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1") public void testCUDA() throws OrtException { - String no_cuda_test = Optional.ofNullable(System.getenv("NO_CUDA_TEST")).orElse("0"); - if (!no_cuda_test.equals("1")) { - runProvider(OrtProvider.CUDA); - } else { - System.out.println("Skipping CUDA test because NO_CUDA_TEST is set."); - } + runProvider(OrtProvider.CUDA); } @Test @@ -749,15 +737,11 @@ public void testCoreML() throws OrtException { runProvider(OrtProvider.CORE_ML); } + @Disabled("DirectML Java API hasn't been supported yet") @Test @EnabledIfSystemProperty(named = "USE_DML", matches = "1") public void testDirectML() throws OrtException { - String no_dml_test = Optional.ofNullable(System.getenv("NO_DML_TEST")).orElse("0"); - if (!no_dml_test.equals("1")) { - runProvider(OrtProvider.DIRECT_ML); - } else { - System.out.println("Skipping DML test because NO_DML_TEST is set."); - } + runProvider(OrtProvider.DIRECT_ML); } @Test From c5b1fc40d641592db823deac350623770249abca Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 31 Oct 2024 10:16:54 +0800 Subject: [PATCH 61/65] revert one change --- .../java/ai/onnxruntime/providers/ProviderOptionsTest.java | 5 ----- 1 file changed, 5 deletions(-) diff --git a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java index e6baaa9cb0c60..57c4eb3577fd0 100644 --- a/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java +++ 
b/java/src/test/java/ai/onnxruntime/providers/ProviderOptionsTest.java @@ -26,7 +26,6 @@ import java.util.EnumSet; import java.util.HashMap; import java.util.Map; -import java.util.Optional; import org.junit.jupiter.api.Test; import org.junit.jupiter.api.condition.EnabledIfSystemProperty; @@ -36,10 +35,6 @@ public class ProviderOptionsTest { @Test @EnabledIfSystemProperty(named = "USE_CUDA", matches = "1") public void testCUDAOptions() throws OrtException { - String no_cuda_test = Optional.ofNullable(System.getenv("NO_CUDA_TEST")).orElse("0"); - if (no_cuda_test.equals("1")) { - return; - } // Test standard options OrtCUDAProviderOptions cudaOpts = new OrtCUDAProviderOptions(0); cudaOpts.add("gpu_mem_limit", "" + (512 * 1024 * 1024)); From c8b24ce4fe483577d731235cd893fb9c7c513735 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 31 Oct 2024 15:40:56 +0800 Subject: [PATCH 62/65] code learn --- onnxruntime/test/common/cuda_op_test_utils.h | 6 +++ .../test/contrib_ops/beam_search_test.cc | 8 +--- .../test/framework/allocation_planner_test.cc | 37 +++++------------- .../test/framework/inference_session_test.cc | 9 ++--- .../test/framework/memcpy_transformer_test.cc | 38 +++++++------------ .../providers/cpu/tensor/gather_op_test.cc | 7 ++-- 6 files changed, 38 insertions(+), 67 deletions(-) diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h index 6833c3785466d..5700b936dca50 100644 --- a/onnxruntime/test/common/cuda_op_test_utils.h +++ b/onnxruntime/test/common/cuda_op_test_utils.h @@ -5,6 +5,12 @@ #include "test/util/include/default_providers.h" +#define SKIP_CUDA_TEST_WITH_DML \ + if (DefaultCudaExecutionProvider() == nullptr) { \ + std::cout << "Skip cuda ep test in " << ::testing::UnitTest::GetInstance()->current_test_info()->name() << std::endl; \ + return; \ + } + namespace onnxruntime { namespace test { diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc 
b/onnxruntime/test/contrib_ops/beam_search_test.cc index 09d4fd470affd..f6fc9ea7662cb 100644 --- a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -74,9 +74,7 @@ TEST(BeamSearchTest, GptBeamSearchFp32) { Ort::SessionOptions session_options; #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; @@ -172,9 +170,7 @@ TEST(BeamSearchTest, GptBeamSearchFp16) { if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif #ifdef USE_CUDA OrtCUDAProviderOptionsV2 cuda_options; diff --git a/onnxruntime/test/framework/allocation_planner_test.cc b/onnxruntime/test/framework/allocation_planner_test.cc index 9d9e3c825b05f..a7f8a6424aa50 100644 --- a/onnxruntime/test/framework/allocation_planner_test.cc +++ b/onnxruntime/test/framework/allocation_planner_test.cc @@ -28,6 +28,7 @@ using json = nlohmann::json; #ifdef USE_CUDA #include "core/providers/cuda/cuda_execution_provider.h" #include "core/providers/cuda/cuda_provider_factory.h" +#include "test/common/cuda_op_test_utils.h" #endif // USE_CUDA #include "core/session/onnxruntime_session_options_config_keys.h" using namespace ONNX_NAMESPACE; @@ -1280,9 +1281,7 @@ TEST_F(PlannerTest, LocationPlanningForImplicitInputsWithoutExplicitConsumersInM // node1(CPU ep)->node2(CPU ep)->node3(CUDA ep)->node4(CPU ep) TEST_F(PlannerTest, MultiStream) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif ONNX_NAMESPACE::TensorProto tensor; @@ -1332,9 +1331,7 @@ TEST_F(PlannerTest, MultiStream) { // All 3 nodes are CUDA EP, node1 is in stream0, node2 is in stream1, node3 is in stream2 TEST_F(PlannerTest, 
MultiStream1StreamWaitFor2Streams) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::unique_ptr<::onnxruntime::KernelDef> cudaKernelAdd = KernelDefBuilder().SetName("Add").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); @@ -1378,9 +1375,7 @@ TEST_F(PlannerTest, MultiStream1StreamWaitFor2Streams) { // node1's output, which is consumed by both node2 and node3, is in CPU. TEST_F(PlannerTest, MultiStreamCudaEPNodeCPUOutput) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif MemcpyToHostInCuda_TransposeInCudaAndCpu("./testdata/multi_stream_models/memcpyToHost_same_stream_with_transpose.json"); EXPECT_EQ(GetState().GetExecutionPlan()->execution_plan.size(), 2) << "2 logic streams"; @@ -1446,9 +1441,7 @@ TEST_F(PlannerTest, MultiStreamMultiOutput) { // as there is a specific order between node1 and node2 if they are in the same stream, thus node3 will only need to wait the latter one TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif std::unique_ptr<::onnxruntime::KernelDef> cudaKernel = KernelDefBuilder().SetName("Transpose").Provider(kCudaExecutionProvider).SinceVersion(1, 10).Build(); std::string Graph_input1("Graph_input1"), Graph_input2("Graph_input2"), Graph_input3("Graph_input3"), Arg1("Arg1"), Arg2("Arg2"), Arg3("Arg3"), node1("node1"), node2("node2"), node3("node3"); @@ -1488,9 +1481,7 @@ TEST_F(PlannerTest, MultiStream2NodesSameStreamConsumedBy1NodeInDifferentStream) #if !defined(__wasm__) && defined(ORT_ENABLE_STREAM) TEST_F(PlannerTest, 
ParaPlanCreation) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif TypeProto graph_in_type; graph_in_type.mutable_tensor_type()->set_elem_type(TensorProto_DataType_FLOAT); @@ -1934,9 +1925,7 @@ TEST_F(PlannerTest, ParaPlanCreation) { TEST_F(PlannerTest, TestMultiStreamConfig) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif const char* type = "DeviceBasedPartitioner"; @@ -2013,9 +2002,7 @@ TEST_F(PlannerTest, TestMultiStreamSaveConfig) { // Load with partition config where a node is missing, session load expected to fail. TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_missing_node.json"; @@ -2039,9 +2026,7 @@ TEST_F(PlannerTest, TestMultiStreamMissingNodeConfig) { // Load with partition config where streams and devices has mismatch TEST_F(PlannerTest, TestMultiStreamMismatchDevice) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif const char* config_file_path = "./testdata/multi_stream_models/conv_add_relu_single_stream_mismatch_device.json"; SessionOptions sess_opt; @@ -2132,9 +2117,7 @@ TEST_F(PlannerTest, TestCpuIf) { // TEST(AllocationPlannerTest, ReusedInputCrossDifferentStreams) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif SessionOptions sess_opt; diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index e4f9697d5242e..9c7e6e9761728 100644 --- 
a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -34,6 +34,7 @@ #ifdef USE_CUDA #include "core/providers/cuda/cuda_provider_factory.h" #include "core/providers/cuda/gpu_data_transfer.h" +#include "test/common/cuda_op_test_utils.h" #endif #ifdef USE_TENSORRT #include "core/providers/tensorrt/tensorrt_provider_options.h" #endif @@ -2173,9 +2174,7 @@ TEST(InferenceSessionTests, TestStrictShapeInference) { // disable it, since we are going to enable parallel execution with cuda ep TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif string model_uri = "testdata/transform/fusion/fuse-conv-bn-mul-add-unsqueeze.onnx"; @@ -2201,9 +2200,7 @@ TEST(InferenceSessionTests, DISABLED_TestParallelExecutionWithCudaProvider) { TEST(InferenceSessionTests, TestArenaShrinkageAfterRun) { #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif OrtArenaCfg arena_cfg; diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc index 3bb6bb2ffd097..aae6a3f8ab15f 100644 --- a/onnxruntime/test/framework/memcpy_transformer_test.cc +++ b/onnxruntime/test/framework/memcpy_transformer_test.cc @@ -9,6 +9,9 @@ #include "default_providers.h" #include "gtest/gtest.h" #include "test_utils.h" +#ifdef USE_CUDA +#include "test/common/cuda_op_test_utils.h" +#endif #include "test/test_environment.h" #include "asserts.h" @@ -74,6 +77,9 @@ void ExpectCopy(const onnxruntime::Node& source, const std::string copy_op, #ifdef USE_CUDA TEST(TransformerTest, MemcpyTransformerTest) { +#if defined(USE_CUDA) && defined(USE_DML) + SKIP_CUDA_TEST_WITH_DMLCUDA; +#endif std::unordered_map<std::string, int> domain_to_version; domain_to_version[kOnnxDomain] = 7; auto model 
= std::make_shared<onnxruntime::Model>("test", false, ModelMetaData(), PathString(), @@ -106,11 +112,7 @@ TEST(TransformerTest, MemcpyTransformerTest) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() != nullptr) { - ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); - } -#else +#if defined(USE_CUDA) ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); #endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCpuExecutionProvider, @@ -118,12 +120,6 @@ TEST(TransformerTest, MemcpyTransformerTest) { KernelRegistryManager test_registry_manager; ASSERT_STATUS_OK(test_registry_manager.RegisterKernels(execution_providers)); -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } -#endif - MemcpyTransformer transformer({onnxruntime::kCudaExecutionProvider}, test_registry_manager); bool modified = false; @@ -141,6 +137,9 @@ TEST(TransformerTest, MemcpyTransformerTest) { } TEST(TransformerTest, MemcpyTransformerTestCudaFirst) { +#if defined(USE_CUDA) && defined(USE_DML) + SKIP_CUDA_TEST_WITH_DML; +#endif std::unordered_map<std::string, int> domain_to_version; domain_to_version[kOnnxDomain] = 7; auto model = std::make_shared<onnxruntime::Model>("test", false, ModelMetaData(), PathString(), @@ -173,11 +172,6 @@ KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; -#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } -#endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); @@ -301,9 +295,7 @@ TEST(TransformerTest, TestInitializerDuplicationInSubgraph) { KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; 
#if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); @@ -349,9 +341,7 @@ TEST(TransformerTest, MemcpyTransformerTestGraphInputConsumedOnMultipleDevices) KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); @@ -457,9 +447,7 @@ TEST(TransformerTest, MemcpyTransformerTestImplicitInputConsumedOnMultipleDevice KernelRegistryManager kernel_registry_manager; ExecutionProviders execution_providers; #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif ASSERT_STATUS_OK(execution_providers.Add(onnxruntime::kCudaExecutionProvider, DefaultCudaExecutionProvider())); diff --git a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc index ae838f10b4153..0f23e4c39d7e2 100644 --- a/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/gather_op_test.cc @@ -3,6 +3,9 @@ #include "core/session/onnxruntime_session_options_config_keys.h" #include "gtest/gtest.h" +#if USE_CUDA +#include "test/common/cuda_op_test_utils.h" +#endif #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" @@ -123,9 +126,7 @@ TEST(GatherOpTest, Gather_invalid_index_gpu) { 0.0f, 0.0f, 0.0f, 0.0f}); #if defined(USE_CUDA) && defined(USE_DML) - if (DefaultCudaExecutionProvider() == nullptr) { - return; - } + SKIP_CUDA_TEST_WITH_DML; #endif // On GPU, just set the value to 0 instead of report error. 
exclude all other providers test From 04856f4a0e61bc2ea33645ccd67841ef80978d4e Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 31 Oct 2024 18:24:46 +0800 Subject: [PATCH 63/65] gtest_skip --- onnxruntime/test/common/cuda_op_test_utils.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h index 5700b936dca50..773369f35850b 100644 --- a/onnxruntime/test/common/cuda_op_test_utils.h +++ b/onnxruntime/test/common/cuda_op_test_utils.h @@ -7,8 +7,7 @@ #define SKIP_CUDA_TEST_WITH_DML \ if (DefaultCudaExecutionProvider() == nullptr) { \ - std::cout << "Skip cuda ep test in " << ::testing::UnitTest::GetInstance()->current_test_info()->name() << std::endl; \ - return; \ + GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled"; \ } namespace onnxruntime { From 9630aebfe63c7f796522bfc4c993717115a4f353 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 31 Oct 2024 18:40:02 +0800 Subject: [PATCH 64/65] lint --- onnxruntime/test/common/cuda_op_test_utils.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h index 773369f35850b..d3e069237217e 100644 --- a/onnxruntime/test/common/cuda_op_test_utils.h +++ b/onnxruntime/test/common/cuda_op_test_utils.h @@ -5,9 +5,9 @@ #include "test/util/include/default_providers.h" -#define SKIP_CUDA_TEST_WITH_DML \ - if (DefaultCudaExecutionProvider() == nullptr) { \ - GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled"; \ +#define SKIP_CUDA_TEST_WITH_DML \ + if (DefaultCudaExecutionProvider() == nullptr) { \ + GTEST_SKIP() << "CUDA Tests are not supported while DML is enabled"; \ } namespace onnxruntime { From 526133ad9b82af208ee164351ed2473cd23b548a Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 31 Oct 2024 20:16:16 +0800 Subject: [PATCH 65/65] typo --- 
onnxruntime/test/framework/memcpy_transformer_test.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/test/framework/memcpy_transformer_test.cc b/onnxruntime/test/framework/memcpy_transformer_test.cc index aae6a3f8ab15f..2313f00e4d123 100644 --- a/onnxruntime/test/framework/memcpy_transformer_test.cc +++ b/onnxruntime/test/framework/memcpy_transformer_test.cc @@ -78,7 +78,7 @@ void ExpectCopy(const onnxruntime::Node& source, const std::string copy_op, TEST(TransformerTest, MemcpyTransformerTest) { #if defined(USE_CUDA) && defined(USE_DML) - SKIP_CUDA_TEST_WITH_DMLCUDA; + SKIP_CUDA_TEST_WITH_DML; #endif std::unordered_map<std::string, int> domain_to_version; domain_to_version[kOnnxDomain] = 7;