From fc8631e2f11d85c84ab9cc711aacb9c589b6f71a Mon Sep 17 00:00:00 2001
From: Jiajia Qin
Date: Tue, 28 Nov 2023 13:21:47 +0800
Subject: [PATCH 1/6] [js/web] Fix conv2dMatmul errors due to #18452 (#18562)

### Description
Currently, all conv2dMatmul with inChannels = 3 and outChannels % 4 = 0 will report compilation errors. Models that include this kind of shape, such as mobilenetv2-12 and resnet50, are impacted. The error was introduced by #18452 https://github.com/microsoft/onnxruntime/pull/18452/files#diff-8b24ea43aa11b1346c0c9e327f9bce6b37a93bd8f2bf8a6392b2b263972b7ea2R200, which accidentally passes `components` to `x`. However, `x`'s component count is `innerElementSize`, not `components`. And when `innerElementSize` is 3, we should use `1` in the current design.
---
 .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts |  5 +--
 js/web/test/data/ops/conv.jsonc              | 32 ++++++++++++++++++-
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
index 22f942a0d9ab4..3638938df7dbe 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
@@ -180,7 +180,7 @@ export const createConv2DMatMulProgramInfo =
       LOG_DEBUG('verbose', () => `[conv2d_mm_webgpu] dispatch = ${dispatch}`);
-      const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : elementsPerThread[0];
+      const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : 1;
       const tileAOuter = workGroupSize[1] * elementsPerThread[1];
       const tileBOuter = workGroupSize[0] * elementsPerThread[0];
@@ -197,7 +197,8 @@ export const createConv2DMatMulProgramInfo =
       const components = isVec4 ? 4 : 1;
       const programUniforms: ProgramUniform[] =
          [{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}];
-      const x = inputVariable('x', inputs[0].dataType, inputs[0].dims.length, components);
+      const x =
+          inputVariable('x', inputs[0].dataType, inputs[0].dims.length, innerElementSize === 3 ? 
1 : innerElementSize); const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, components); const inputVariables = [x, w]; diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index 219e15eb4648f..2e8eaaba191d0 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -126,7 +126,7 @@ ] }, { - "name": "conv with bias addition C", + "name": "conv with bias addition C - NHWC", "operator": "Conv", "inputShapeDefinitions": "rankOnly", "opset": { "domain": "", "version": 17 }, @@ -158,6 +158,36 @@ "type": "float32" } ] + }, + { + "name": "inChannel = 3, outChannel = 4", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [ + 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 1, 2, 3, 4, 5, 6, 7, 8 + ], + "dims": [4, 3, 2, 2], + "type": "float32" + }, + { + "data": [5, 6, 7, 8], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [360, 334, 271, 323, 909, 963, 1024, 1028, 683, 655, 576, 650, 473, 508, 570, 677], + "dims": [1, 4, 2, 2], + "type": "float32" + } + ] } ] }, From 3f42fbad2e42cf03c01eb0428b06e24f4ad2d427 Mon Sep 17 00:00:00 2001 From: Ran Gal <79867742+galran@users.noreply.github.com> Date: Mon, 27 Nov 2023 23:54:38 -0800 Subject: [PATCH 2/6] deleted the unused random_device variables because they caused a warning that was treated like an error. (#18543) deleted the unused random_device variables because they caused a warning that was treated like an error. **_Please check if the declaration is required for the random number generation. if so, there need to be a dummy reference to the variable or turning off the warning as error behavior._** ### Description ### Motivation and Context --- orttraining/orttraining/test/gradient/optimizer_ops_test.cc | 2 -- .../test/training_ops/cpu/reduction/reduction_ops_test.cc | 1 - 2 files changed, 3 deletions(-) diff --git a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc index c100730aacc44..bfb59f1525e47 100644 --- a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc +++ b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc @@ -1542,7 +1542,6 @@ TEST(OptimizerTest, LambOptimizerTestLarge) { std::vector m(size); std::vector v(size); - std::random_device random_device; std::mt19937 random_engine(0); std::uniform_real_distribution dist(0.1f, 1.0f); for (int i = 0; i < size; ++i) { @@ -1581,7 +1580,6 @@ TEST(OptimizerTest, LambOptimizerTestLarge) { TEST(OptimizerTest, LambOptimizerMultiTensorRatio) { constexpr int group_count = 127; - std::random_device random_device; std::mt19937 random_engine(0); std::uniform_real_distribution dist(0.1f, 1.0f); std::uniform_int_distribution dist_int(1, 1228); diff --git a/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc b/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc index be8b0aaa0bce1..60c3ecbcce8ce 100644 --- a/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc @@ -275,7 +275,6 @@ void TestMultiTensorReduce( test.SetDeterminism(use_determinism); // Set up random number generator. 
-  std::random_device random_device;
   std::mt19937 random_engine(0);
   std::uniform_real_distribution dist(min, max);
   std::uniform_int_distribution dist_int(min_tensor_size, max_tensor_size);

From 94a6020a7f59f22101653988a36bca02593eb816 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?=
Date: Tue, 28 Nov 2023 03:56:00 -0800
Subject: [PATCH 3/6] Improve parallelization of TfIdfVectorizer, Reduce memory consumption (#18539)

### Description
TfIdfVectorizer has two steps: first, search for n-grams in the input; second, weight the results. The second step was not parallelized. The PR addresses that issue.

Previously, two vectors of the size of the output were allocated to compute the results. The first one, frequencies, was used as an intermediate vector between the two steps. This vector is now broken into multiple small vectors, one per thread. The memory consumption is therefore reduced for batches with a number of rows greater than the number of threads.

### Motivation and Context
Performance and memory consumption. For one model, the improvement is +15% (4 cores, model size is ~6 MB, batch size is 100). Here is another benchmark on a machine with 32 cores, with different vocabulary sizes and batch sizes. The tested TfIdfVectorizer only deals with unigrams and processes sequences of 10 tokens (integers).

![image](https://github.com/microsoft/onnxruntime/assets/22452781/0bb9abe9-ed81-44da-b5c4-ad2a12f129bd)
---
 .../core/providers/cpu/nn/tfidfvectorizer.cc | 154 ++++++++----------
 .../core/providers/cpu/nn/tfidfvectorizer.h  |   7 +-
 2 files changed, 71 insertions(+), 90 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
index f36b75c508da0..eb245a4c9ba0c 100644
--- a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
+++ b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
@@ -141,14 +141,11 @@ struct TfIdfVectorizer::Impl {
   Impl(const Impl&) = delete;
   Impl& operator=(const Impl&) = delete;
-  void IncrementCount(size_t ngram_id, size_t row_num,
-                      std::vector& frequencies) const {
+  inline size_t OutputIdToIncrement(size_t ngram_id) const {
     assert(ngram_id != 0);
     --ngram_id;
     assert(ngram_id < ngram_indexes_.size());
-    size_t output_idx = row_num * output_size_ + SafeInt(ngram_indexes_[ngram_id]);
-    assert(output_idx < frequencies.size());
-    ++frequencies[output_idx];
+    return SafeInt(ngram_indexes_[ngram_id]);
   }
 };
@@ -252,77 +249,17 @@ TfIdfVectorizer::TfIdfVectorizer(const OpKernelInfo& info) : OpKernel(info), imp
 TfIdfVectorizer::~TfIdfVectorizer() = default;
-void TfIdfVectorizer::OutputResult(OpKernelContext* ctx, size_t B, const std::vector& frequences) const {
-  const Impl& impl = *impl_;
-  std::vector output_dims;
-  if (B == 0) {
-    output_dims.push_back(impl.output_size_);
-    B = 1;  // For use in the loops below
-  } else {
-    output_dims.push_back(B);
-    output_dims.push_back(impl.output_size_);
-  }
-
-  const auto row_size = impl.output_size_;
-
-  TensorShape output_shape(output_dims);
-  assert(frequences.size() == static_cast(output_shape.Size()));
-
-  auto Y = ctx->Output(0, output_shape);
-  auto output_data = Y->MutableData();
-  const auto& w = impl.weights_;
-  switch (impl.weighting_criteria_) {
-    case kTF: {
-      for (auto f : frequences) {
-        *output_data++ = static_cast(f);
-      }
-    } break;
-    case kIDF: {
-      if (!w.empty()) {
-        const auto* freqs = frequences.data();
-        for (size_t batch = 0; batch < B; ++batch) {
-          for (size_t i = 0; i < row_size; ++i) {
-            *output_data++ = (*freqs++ > 0) ? 
w[i] : 0; - } - } - } else { - for (auto f : frequences) { - *output_data++ = (f > 0) ? 1.0f : 0; - } - } - } break; - case kTFIDF: { - if (!w.empty()) { - const auto* freqs = frequences.data(); - for (size_t batch = 0; batch < B; ++batch) { - for (size_t i = 0; i < row_size; ++i) { - *output_data++ = *freqs++ * w[i]; - } - } - } else { - for (auto f : frequences) { - *output_data++ = static_cast(f); - } - } - } break; - case kNone: // fall-through - default: - assert(false); - } -} - -void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_t row_size, - std::vector& frequencies) const { - auto X = ctx->Input(0); - const auto elem_size = X->DataType()->Size(); - - const void* const row_begin = AdvanceElementPtr(X->DataRaw(), row_num * row_size, elem_size); +void TfIdfVectorizer::ComputeImpl(const void* x_data_raw, size_t elem_size, ptrdiff_t row_num, size_t row_size, + bool is_input_string, gsl::span output_data, + std::function&)>& fn_weight) const { + const void* const row_begin = AdvanceElementPtr(x_data_raw, row_num * row_size, elem_size); const void* const row_end = AdvanceElementPtr(row_begin, row_size, elem_size); const auto& impl = *impl_; const auto max_gram_length = impl.max_gram_length_; const auto max_skip_distance = impl.max_skip_count_ + 1; // Convert to distance auto start_ngram_size = impl.min_gram_length_; + size_t output_idx; for (auto skip_distance = 1; skip_distance <= max_skip_distance; ++skip_distance) { auto ngram_start = row_begin; @@ -336,7 +273,7 @@ void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_ } auto ngram_item = ngram_start; - if (X->IsDataTypeString()) { + if (is_input_string) { const std::string* str_item = reinterpret_cast(ngram_item); const StrMap* str_map = &impl.str_map_; for (auto ngram_size = 1; @@ -349,7 +286,8 @@ void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_ break; } if (ngram_size >= start_ngram_size && hit->second->id_ != 0) { - impl.IncrementCount(hit->second->id_, row_num, frequencies); + output_idx = impl.OutputIdToIncrement(hit->second->id_); + fn_weight(output_idx, output_data); } str_map = &hit->second->leafs_; } @@ -360,13 +298,14 @@ void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_ ngram_size <= max_gram_length && ngram_item < ngram_row_end; ++ngram_size, ngram_item = AdvanceElementPtr(ngram_item, skip_distance, elem_size)) { - int64_t val = (X->IsDataType()) ? int64_t{*reinterpret_cast(ngram_item)} : *reinterpret_cast(ngram_item); + int64_t val = (elem_size == 4) ? 
int64_t{*reinterpret_cast(ngram_item)} : *reinterpret_cast(ngram_item); auto hit = int_map->find(val); if (hit == int_map->end()) { break; } if (ngram_size >= start_ngram_size && hit->second->id_ != 0) { - impl.IncrementCount(hit->second->id_, row_num, frequencies); + output_idx = impl.OutputIdToIncrement(hit->second->id_); + fn_weight(output_idx, output_data); } int_map = &hit->second->leafs_; } @@ -412,31 +351,76 @@ Status TfIdfVectorizer::Compute(OpKernelContext* ctx) const { } assert((num_rows * C) == total_items); - // Frequency holder allocate [B..output_size_] - // and init all to zero - std::vector frequencies; - frequencies.resize(num_rows * impl_->output_size_, 0); + const Impl& impl = *impl_; + TensorShapeVector output_dims; + if (B == 0) { + output_dims.push_back(impl.output_size_); + B = 1; // For use in the loops below + } else { + output_dims.push_back(B); + output_dims.push_back(impl.output_size_); + } + TensorShape output_shape(output_dims); + + auto Y = ctx->Output(0, output_shape); + auto output_data = Y->MutableData(); + const bool is_input_string = X->IsDataTypeString(); if (total_items == 0 || - (X->IsDataTypeString() && impl_->str_map_.empty()) || + (is_input_string && impl_->str_map_.empty()) || ((X->IsDataType() || X->IsDataType()) && impl_->int64_map_.empty())) { // TfidfVectorizer may receive an empty input when it follows a Tokenizer // (for example for a string containing only stopwords). // TfidfVectorizer returns a zero tensor of shape // {b_dim, output_size} when b_dim is the number of received observations // and output_size the is the maximum value in ngram_indexes attribute plus 1. - OutputResult(ctx, B, frequencies); + memset(output_data, 0, static_cast(output_shape.Size() * sizeof(float))); return Status::OK(); } - std::function fn = [this, ctx, C, &frequencies](ptrdiff_t row_num) { - ComputeImpl(ctx, row_num, C, frequencies); - }; + auto x_data_raw = ctx->Input(0)->DataRaw(); + const auto elem_size = X->DataType()->Size(); + int32_t num_batches = std::min(concurrency::ThreadPool::DegreeOfParallelism(ctx->GetOperatorThreadPool()) * 2, num_rows); - concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), num_rows, std::move(fn), 0); + const auto& w = impl.weights_; + std::function&)> fn_weight; - OutputResult(ctx, B, frequencies); + switch (impl.weighting_criteria_) { + case kTF: + fn_weight = [](size_t i, gsl::span& out) { out[i] += 1.0f; }; + break; + case kIDF: + if (!w.empty()) { + fn_weight = [&w](size_t i, gsl::span& out) { out[i] = w[i]; }; + } else { + fn_weight = [](size_t i, gsl::span& out) { out[i] = 1.0f; }; + } + break; + case kTFIDF: + if (!w.empty()) { + fn_weight = [&w](size_t i, gsl::span& out) { out[i] += w[i]; }; + } else { + fn_weight = [](size_t i, gsl::span& out) { out[i] += 1.0f; }; + } + break; + case kNone: // fall-through + default: + assert(false); + } + + std::function fn = [this, C, output_data, x_data_raw, elem_size, + is_input_string, num_batches, num_rows, &fn_weight](ptrdiff_t batch_num) { + // Frequency holder allocate [B..output_size_] and init all to zero. 
+ auto work = concurrency::ThreadPool::PartitionWork(batch_num, num_batches, static_cast(num_rows)); + std::vector frequencies(this->impl_->output_size_); + for (auto row_num = work.start; row_num < work.end; ++row_num) { + auto out = gsl::span(output_data + row_num * this->impl_->output_size_, this->impl_->output_size_); + std::fill(out.begin(), out.end(), 0.0f); + ComputeImpl(x_data_raw, elem_size, row_num, C, is_input_string, out, fn_weight); + } + }; + concurrency::ThreadPool::TrySimpleParallelFor(ctx->GetOperatorThreadPool(), num_batches, std::move(fn)); return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.h b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.h index 45db40d893231..14488d91c23e9 100644 --- a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.h +++ b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.h @@ -19,11 +19,8 @@ class TfIdfVectorizer final : public OpKernel { Status Compute(OpKernelContext* ctx) const override; private: - void ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_t row_size, - std::vector& frequencies) const; - - // Apply weighing criteria and output - void OutputResult(OpKernelContext* ctx, size_t b_dim, const std::vector& frequences) const; + void ComputeImpl(const void* x_data_raw, size_t elem_size, ptrdiff_t row_num, size_t row_size, bool is_input_string, + gsl::span output_data, std::function&)>& fn_weight) const; struct Impl; std::unique_ptr impl_; From 3ea27c29253aad7c02015e2af6d37dedafe2c9c3 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 28 Nov 2023 09:03:46 -0800 Subject: [PATCH 4/6] Create a new Nuget Package pipeline for CUDA 12 (#18135) --- .../c-api-noopenmp-packaging-pipelines.yml | 18 +- .../cuda-packaging-pipeline.yml | 175 ++++++++++++++ .../azure-pipelines/linux-gpu-ci-pipeline.yml | 29 ++- .../linux-gpu-tensorrt-ci-pipeline.yml | 28 ++- .../nuget/templates/test_linux.yml | 15 +- .../nuget/templates/test_win.yml | 18 +- .../py-cuda-packaging-pipeline.yml | 2 +- .../stages/nuget-combine-cuda-stage.yml | 228 ++++++++++++++++++ .../nuget-linux-cuda-packaging-stage.yml | 161 +++++++++++++ .../stages/nuget-win-cuda-packaging-stage.yml | 147 +++++++++++ .../jobs/download_win_gpu_library.yml | 1 - .../linux-gpu-tensorrt-packaging-pipeline.yml | 35 ++- .../azure-pipelines/templates/win-ci.yml | 49 +++- .../github/linux/build_cuda_c_api_package.sh | 2 +- .../linux/build_tensorrt_c_api_package.sh | 2 +- .../docker/Dockerfile.manylinux2_28_cuda | 1 + ...ckerfile.package_ubi8_cuda11_8_tensorrt8_6 | 9 +- ...8_6 => Dockerfile.package_ubuntu_2004_gpu} | 18 +- .../inference/x64/default/gpu/Dockerfile | 4 +- 19 files changed, 889 insertions(+), 53 deletions(-) create mode 100644 tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml create mode 100644 tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml create mode 100644 tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml rename tools/ci_build/github/linux/docker/{Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 => Dockerfile.package_ubuntu_2004_gpu} (50%) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 0eccd71e47f46..67fa78da003a3 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ 
b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -60,6 +60,14 @@ parameters: type: string default: '--use_azure' +- name: CudaVersion + displayName: CUDA version + type: string + default: '11.8' + values: + - 11.8 + - 12.2 + resources: repositories: - repository: onnxruntime-inference-examples # The name used to reference this repository in the checkout step @@ -146,7 +154,13 @@ stages: timeoutInMinutes: 120 pool: 'Onnxruntime-Linux-GPU' variables: - CUDA_VERSION: '11.8' + - name: CUDA_VERSION_MAJOR + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: '11' + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: '12' + - name: CUDA_VERSION + value: ${{ parameters.CudaVersion }} steps: - template: templates/set-version-number-variables-step.yml - template: templates/get-docker-image-steps.yml @@ -154,7 +168,7 @@ stages: Dockerfile: tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile Context: tools/ci_build/github/linux/docker/inference/x64/default/gpu DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimecuda11centosbuild + Repository: onnxruntimecuda$(CUDA_VERSION_MAJOR)build - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml new file mode 100644 index 0000000000000..8a9592282cd46 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -0,0 +1,175 @@ +parameters: + - name: RunOnnxRuntimeTests + displayName: Run Tests? + type: boolean + default: true + + - name: UseIncreasedTimeoutForTests + displayName: Increase timeout for tests? Set it to false if you are doing an Onnx Runtime release. + type: boolean + default: false + + - name: DoCompliance + displayName: Run Compliance Tasks? + type: boolean + default: true + + - name: DoEsrp + displayName: Run code sign tasks? Must be true if you are doing an ONNX Runtime release + type: boolean + default: true + + - name: IsReleaseBuild + displayName: Is a release build? Set it to true if you are doing an ONNX Runtime release. + type: boolean + default: false + + - name: PreReleaseVersionSuffixString + displayName: Suffix added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the type of pre-release package. + type: string + values: + - alpha + - beta + - rc + - none + default: none + + - name: PreReleaseVersionSuffixNumber + displayName: Number added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the sequence of a pre-release package. + type: number + default: 0 + + # these 2 parameters are used for debugging. 
+ - name: SpecificArtifact + displayName: Use Specific Artifact (Debugging only) + type: boolean + default: false + + - name: BuildId + displayName: Pipeline BuildId, you could find it in the URL + type: string + default: '0' + + - name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - name: ReleaseVersionSuffix + value: '' + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 + - name: win_trt_home + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0 + - name: win_cuda_home + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: $(Agent.TempDirectory)\v11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: $(Agent.TempDirectory)\v12.2 +resources: + repositories: + - repository: onnxruntime-inference-examples # The name used to reference this repository in the checkout step + type: github + endpoint: ort-examples + name: microsoft/onnxruntime-inference-examples + - repository: manylinux + type: Github + endpoint: Microsoft + name: pypa/manylinux + ref: 5eda9aded5462201e6310105728d33016e637ea7 + +stages: +# Set ReleaseVersionSuffix + - stage: Set_ReleaseVersionSuffix + jobs: + - job: Set_Variables + pool: + vmImage: ubuntu-latest + steps: + - checkout: none + - bash: | + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
+ set +x + if [[ "${{ parameters.IsReleaseBuild }}" = True && "${{ parameters.PreReleaseVersionSuffixString }}" != "none" ]]; then + if [[ "${{ parameters.PreReleaseVersionSuffixNumber }}" -eq 0 ]]; then + echo "##vso[task.setvariable variable=ReleaseVersionSuffix;isOutput=true]-${{ parameters.PreReleaseVersionSuffixString }}" + else + echo "##vso[task.setvariable variable=ReleaseVersionSuffix;isOutput=true]-${{ parameters.PreReleaseVersionSuffixString }}.${{ parameters.PreReleaseVersionSuffixNumber }}" + fi + else + echo "##vso[task.setvariable variable=ReleaseVersionSuffix;isOutput=true]" + fi + name: Set_Release_Version_Suffix + - bash: echo $(ReleaseVersionSuffix) + name: Debug_Release_Version_Suffix + # this is needed for certain artifacts to be published + - stage: Linux_C_API_Packaging_CPU_x64 + dependsOn: [ ] + jobs: + - template: templates/c-api-linux-cpu.yml + parameters: + BaseImage: 'registry.access.redhat.com/ubi8/ubi' + OnnxruntimeArch: 'x64' + OnnxruntimeCFlags: '-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all' + OnnxruntimeCXXFlags: '-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all' + OnnxruntimeNodejsBindingArch: 'x64' + PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PackageJava: false + PackageNodeJS: false + # Nuget Packaging + + - template: stages/nuget-linux-cuda-packaging-stage.yml + parameters: + CudaVersion: ${{ parameters.CudaVersion }} + docker_base_image: ${{ variables.docker_base_image }} + linux_trt_version: ${{ variables.linux_trt_version }} + - template: stages/nuget-win-cuda-packaging-stage.yml + parameters: + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} + CudaVersion: ${{ parameters.CudaVersion }} + win_trt_home: ${{ variables.win_trt_home }} + win_cuda_home: ${{ variables.win_cuda_home }} + - template: stages/nuget-combine-cuda-stage.yml + parameters: + DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + # Testing + ## Windows GPU Testing + - template: nuget/templates/test_win.yml + parameters: + AgentPool: 'onnxruntime-Win2022-GPU-T4' + NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' + ArtifactSuffix: 'GPU' + StageSuffix: 'GPU' + Skipx86Tests: 'true' + CudaVersion: ${{ parameters.CudaVersion }} + ## Linux GPU Testing + - template: nuget/templates/test_linux.yml + parameters: + AgentPool: Onnxruntime-Linux-GPU + ArtifactSuffix: 'GPU' + StageSuffix: 'GPU' + NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' + SpecificArtifact: ${{ parameters.specificArtifact }} + CudaVersion: ${{ parameters.CudaVersion }} + BuildId: ${{ parameters.BuildId }} + +## Win/Linux GPU Combined Publishing +#- template: templates/publish-nuget.yml diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 9e1fae343c84e..0993a81a02249 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -26,7 +26,14 @@ pr: - 'js/web' - 'onnxruntime/core/providers/js' #### end trigger #### - +parameters: + - name: CudaVersion + displayName: CUDA version + type: string + default: '11.8' + values: + - 11.8 + - 12.2 resources: repositories: - repository: manylinux @@ -37,6 +44,17 @@ 
resources: variables: - template: templates/common-variables.yml + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 + + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 jobs: - job: Linux_Build @@ -55,15 +73,14 @@ jobs: - checkout: self clean: true submodules: none - - template: templates/get-docker-image-steps.yml parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda Context: tools/ci_build/github/linux/docker DockerBuildArgs: " --network=host - --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg BASEIMAGE=$(docker_base_image) + --build-arg TRT_VERSION=$(linux_trt_version) --build-arg BUILD_UID=$( id -u ) " Repository: onnxruntimecuda11build @@ -163,8 +180,8 @@ jobs: Context: tools/ci_build/github/linux/docker DockerBuildArgs: " --network=host - --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg BASEIMAGE=$(docker_base_image) + --build-arg TRT_VERSION=$(linux_trt_version) --build-arg BUILD_UID=$( id -u ) " Repository: onnxruntimecuda11build diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 517c8d638c935..4ca11a4d1565b 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -26,7 +26,14 @@ pr: - 'js/web' - 'onnxruntime/core/providers/js' #### end trigger #### - +parameters: + - name: CudaVersion + displayName: CUDA version + type: string + default: '11.8' + values: + - 11.8 + - 12.2 resources: repositories: - repository: manylinux @@ -34,7 +41,17 @@ resources: endpoint: Microsoft name: pypa/manylinux ref: 5eda9aded5462201e6310105728d33016e637ea7 - +variables: + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 jobs: - job: Linux_Build timeoutInMinutes: 180 @@ -61,8 +78,8 @@ jobs: Context: tools/ci_build/github/linux/docker DockerBuildArgs: " --network=host - --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg BASEIMAGE=${{ variables.docker_base_image }} + --build-arg TRT_VERSION=${{ variables.linux_trt_version }} --build-arg BUILD_UID=$( id -u ) " Repository: onnxruntimetensorrt86gpubuild @@ -99,7 +116,8 @@ jobs: --build_shared_lib \ --parallel \ --build_wheel \ - --enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \ + --enable_onnx_tests \ + --use_cuda --cuda_home=/usr/local/cuda-${{ parameters.CudaVersion }} --cudnn_home=/usr/local/cuda-${{ parameters.CudaVersion }} \ --enable_pybind --build_java \ --use_tensorrt --tensorrt_home /usr \ --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 \ diff --git 
a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index 64fa29f06553e..1e609b052b8d3 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -7,7 +7,7 @@ parameters: SpecificArtifact: false CustomOpArtifactName: 'onnxruntime-linux-x64' BuildId: '0' - + CudaVersion: '11.8' stages: - stage: NuGet_Test_Linux_${{ parameters.StageSuffix }} dependsOn: @@ -54,9 +54,18 @@ stages: - ${{if contains(parameters.StageSuffix , 'GPU') }}: - template: ../../templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu Context: tools/ci_build/github/linux/docker/ - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + ${{ if eq(parameters.CudaVersion, '12.2') }}: + DockerBuildArgs: " + --build-arg BASEIMAGE=nvidia/cuda:12.2.2-cudnn8-devel-ubuntu20.04 + --build-arg TRT_VERSION=8.6.1.6-1+cuda12.0 + --build-arg BUILD_UID=$( id -u ) + " + ${{ else }}: + DockerBuildArgs: " + --build-arg BUILD_UID=$( id -u ) + " Repository: onnxruntimepackagestest - bash: | docker run --rm \ diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml index 0b9ded10ddd3e..4f693d45cb76f 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml @@ -8,6 +8,7 @@ parameters: # the parent pipeline. TestDataArtifactSuffix: '' Skipx86Tests: 'false' + CudaVersion: '' stages: - stage: NuGet_Test_Win_${{ parameters.StageSuffix }} @@ -27,6 +28,10 @@ stages: value: 'ON' - name: runCodesignValidationInjection value: false + - name: CUDA_MODULE_LOADINGL + value: 'LAZY' + - name: GRADLE_OPTS + value: '-Dorg.gradle.daemon=false' steps: - task: UsePythonVersion@0 @@ -39,13 +44,12 @@ stages: displayName: Use Nuget 5.7.0 inputs: versionSpec: 5.7.0 - - - task: BatchScript@1 - displayName: 'setup env' - inputs: - filename: '$(Build.SourcesDirectory)\tools\ci_build\github\windows\setup_env_gpu.bat' - modifyEnvironment: true - workingFolder: '$(Build.BinariesDirectory)' + - ${{ if ne( parameters.CudaVersion, '') }}: + - template: ../../templates/jobs/download_win_gpu_library.yml + parameters: + DownloadCUDA: true + DownloadTRT: true + CudaVersion: ${{ parameters.CudaVersion }} - task: BatchScript@1 displayName: 'Setup Visual Studio env vars' diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml index aee42d3675087..91179d141498b 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml @@ -31,7 +31,7 @@ resources: ref: 5eda9aded5462201e6310105728d33016e637ea7 stages: - - template: stages/py-cuda-packaging-stage.yml + - template: stages/py-nuget-combine-cuda-stage.yml parameters: enable_linux_gpu: ${{ parameters.enable_linux_gpu }} enable_windows_gpu: ${{ parameters.enable_windows_gpu }} diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml new file mode 100644 index 0000000000000..b69e75856c39f --- /dev/null +++ 
b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml @@ -0,0 +1,228 @@ +parameters: +- name: DoCompliance + type: boolean + default: true + +- name: DoEsrp + type: boolean + default: true + +- name: IsReleaseBuild + type: boolean + default: false + +stages: +######## Nuget ######## +# Win/Linux CUDA Combined packaging +- stage: NuGet_Packaging_GPU + dependsOn: + - Set_ReleaseVersionSuffix + - Windows_Packaging_gpu + - Windows_Packaging_tensorrt + - Linux_C_API_Packaging_CPU_x64 + - Linux_C_API_Packaging_GPU_x64 + - Linux_C_API_Packaging_GPU_TensorRT_x64 + condition: succeeded() + jobs: + - job: + workspace: + clean: all + # we need to use the 2022 pool to create the nuget package with both pre-net6+Xamarin and net6 targets. + # VS2019 has no support for net6 and we need to use msbuild (from the VS install) to do the packing + pool: 'Azure-Pipelines-EO-Windows2022-aiinfra' + variables: + breakCodesignValidationInjection: ${{ parameters.DoEsrp }} + ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] + + steps: + - checkout: self + submodules: true + # Download the all artifacts + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact from Linux_C_API_Packaging_GPU_x64 Stage' + inputs: + artifactName: 'onnxruntime-win-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact from Linux_C_API_Packaging_GPU_TensorRT_x64 Stage' + inputs: + artifactName: 'onnxruntime-win-x64-tensorrt' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact from Windows_Packaging_gpu Stage' + inputs: + artifactName: 'onnxruntime-linux-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact from Windows_Packaging_tensorrt Stage' + inputs: + artifactName: 'onnxruntime-linux-x64-tensorrt' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - protoc from Windows_Packaging_(cpu|gpu) Stage' + inputs: + artifactName: 'drop-extra' + targetPath: '$(Build.BinariesDirectory)/extra-artifact' + + # Reconstruct the build dir + - task: PowerShell@2 + displayName: 'PS: Extract nuget files gpu' + inputs: + targetType: filePath + filePath: $(Build.SourcesDirectory)\tools\ci_build\github\windows\extract_nuget_files_gpu.ps1 + + - script: | + dir + workingDirectory: '$(Build.BinariesDirectory)/nuget-artifact' + displayName: 'List artifacts' + + - script: | + mklink /D /J models C:\local\models + workingDirectory: '$(Build.BinariesDirectory)' + displayName: 'Create models link' + + - task: NuGetToolInstaller@0 + displayName: Use Nuget 6.2.1 + inputs: + versionSpec: 6.2.1 + + - task: PowerShell@2 + displayName: Install .NET 6 workloads + inputs: + targetType: 'inline' + script: | + dotnet workload install android ios macos + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: PowerShell@2 + displayName: Build .NET 6 targets using dotnet + inputs: + targetType: 'inline' + # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path + # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ + # which is inconsistent with the msbuild output path for the pre-.net6 targets + # e.g. 
csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 + # and makes it harder to do the packing + # + # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. + script: | + dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj -p:SelectedTargets=Net6 -p:Configuration=RelWithDebInfo -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + platform: 'Any CPU' + configuration: RelWithDebInfo + msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Build C# for pre-.net6 targets' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + configuration: RelWithDebInfo + platform: 'Any CPU' + msbuildArguments: '-p:SelectedTargets=PreNet6 -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - template: ../templates/win-esrp-dll.yml + parameters: + FolderPath: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' + DisplayName: 'ESRP - Sign C# dlls' + DoEsrp: ${{ parameters.DoEsrp }} + + - task: MSBuild@1 + displayName: Update projects.assets.json with combined list of all target frameworks + inputs: + solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' + platform: 'Any CPU' + configuration: RelWithDebInfo + msbuildArguments: '-t:restore -p:SelectedTargets=All -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Build Nuget Packages' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj' + configuration: RelWithDebInfo + platform: 'Any CPU' + msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: BatchScript@1 + displayName: 'Add TensorRT header file to the native nuGet package' + inputs: + filename: $(Build.SourcesDirectory)\tools\ci_build\github\windows\bundle_nuget_with_native_headers.bat + workingFolder: $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo + + - task: CopyFiles@2 + displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' + Contents: '*.snupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: CopyFiles@2 + displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' + Contents: '*.nupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: CopyFiles@2 + displayName: 'Copy 
nuget packages to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' + Contents: '*.nupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - template: ../templates/esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)' + DoEsrp: ${{ parameters.DoEsrp }} + + - template: ../templates/validate-package.yml + parameters: + PackageType: 'nuget' + PackagePath: '$(Build.ArtifactStagingDirectory)' + PackageName: 'Microsoft.ML.OnnxRuntime.*nupkg' + PlatformsSupported: 'win-x64,linux-x64' + VerifyNugetSigning: false + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline NuGet Artifact' + inputs: + artifactName: 'drop-signed-nuget-GPU' + targetPath: '$(Build.ArtifactStagingDirectory)' + + + - task: MSBuild@1 + displayName: 'Clean C#' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + platform: 'Any CPU' + configuration: RelWithDebInfo + msbuildArguments: '-t:Clean -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + + - task: RoslynAnalyzers@2 + displayName: 'Run Roslyn Analyzers' + inputs: + userProvideBuildInfo: msBuildInfo + msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\msbuild.exe" $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln -p:configuration="RelWithDebInfo" -p:Platform="Any CPU" -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu' + condition: and(succeeded(), eq('${{ parameters.DoCompliance }}', true)) + + - template: ../templates/component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml new file mode 100644 index 0000000000000..140a377ca72a3 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -0,0 +1,161 @@ +parameters: +- name: CudaVersion + type: string + default: '11.8' +- name: docker_base_image + type: string +- name: linux_trt_version + type: string + +stages: + # Linux CUDA without TensorRT Packaging +- stage: Linux_C_API_Packaging_GPU_x64 + dependsOn: [] + jobs: + - job: + workspace: + clean: all + timeoutInMinutes: 120 + pool: 'Onnxruntime-Linux-GPU' + variables: + - name: CUDA_VERSION_MAJOR + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: '11' + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: '12' + - name: CUDA_VERSION + value: ${{ parameters.CudaVersion }} + steps: + - template: ../templates/set-version-number-variables-step.yml + - template: ../templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile + Context: tools/ci_build/github/linux/docker/inference/x64/default/gpu + DockerBuildArgs: " + --build-arg BUILD_UID=$( id -u ) + --build-arg BASEIMAGE=${{ parameters.docker_base_image }} + " + Repository: onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}build + + - script: 
$(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Build and Test' + + - template: ../templates/c-api-artifacts-package-and-publish-steps-posix.yml + parameters: + buildConfig: 'Release' + artifactName: 'onnxruntime-linux-x64-cuda-$(OnnxRuntimeVersion)' + artifactNameNoVersionString: 'onnxruntime-linux-x64-cuda' + libraryName: 'libonnxruntime.so.$(OnnxRuntimeVersion)' + + - template: ../templates/component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + - template: ../templates/clean-agent-build-directory-step.yml +# Linux CUDA with TensorRT Packaging +- template: ../templates/linux-gpu-tensorrt-packaging-pipeline.yml + parameters: + artifactName: 'onnxruntime-linux-x64-tensorrt-$(OnnxRuntimeVersion)' + artifactNameNoVersionString: 'onnxruntime-linux-x64-tensorrt' + buildJava: false + buildJavaOption: '--build_java' + buildNodejs: false + buildNodejsOption: '--build_nodejs' + CudaVersion: ${{ parameters.CudaVersion }} +# Linux CUDA Combined Testing and Publishing +- stage: Linux_Packaging_combined_GPU + dependsOn: + - Linux_C_API_Packaging_GPU_x64 + - Linux_C_API_Packaging_GPU_TensorRT_x64 + condition: succeeded() + jobs: + - job: + workspace: + clean: all + pool: 'Onnxruntime-Linux-GPU' + + steps: + - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime + submodules: false + - checkout: onnxruntime-inference-examples # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime-inference-examples + submodules: false + - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux + submodules: false + + - script: | + set -e -x + cd $(Build.SourcesDirectory) + mv manylinux onnxruntime + ls + + - template: ../templates/with-container-registry-steps.yml + parameters: + Steps: + - script: | + tools/ci_build/get_docker_image.py \ + --dockerfile tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda \ + --context tools/ci_build/github/linux/docker \ + --docker-build-args "--network=host --build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ parameters.linux_trt_version }} --build-arg BUILD_UID=$( id -u )" \ + --container-registry onnxruntimebuildcache \ + --multiple_repos \ + --repository onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build + displayName: "Get onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build image for tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda" + workingDirectory: $(Build.SourcesDirectory)/onnxruntime + ContainerRegistry: onnxruntimebuildcache + + - template: ../templates/set-version-number-variables-step.yml + parameters: + versionFileDirectory: '$(Build.SourcesDirectory)/onnxruntime' + workingDirectory: '$(Build.SourcesDirectory)/onnxruntime' + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - Combined GPU' + inputs: + artifactName: 'onnxruntime-linux-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/tgz-artifacts' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - Combined GPU' + inputs: + artifactName: 'onnxruntime-linux-x64-tensorrt' + targetPath: '$(Build.BinariesDirectory)/tgz-artifacts' + + - task: ShellScript@2 + displayName: 'Shell Script' + inputs: + scriptPath: 'onnxruntime/tools/ci_build/github/linux/extract_and_bundle_gpu_package.sh' + args: '-a 
$(Build.BinariesDirectory)/tgz-artifacts' + workingDirectory: '$(Build.BinariesDirectory)/tgz-artifacts' + + - task: ArchiveFiles@2 + inputs: + rootFolderOrFile: '$(Build.BinariesDirectory)/tgz-artifacts/onnxruntime-linux-x64-gpu' + includeRootFolder: false + archiveType: 'tar' # Options: zip, 7z, tar, wim + tarCompression: 'gz' + archiveFile: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' + replaceExistingArchive: true + + - template: ../templates/validate-package.yml + parameters: + PackageType: 'tarball' + PackagePath: '$(Build.ArtifactStagingDirectory)' + PackageName: 'onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' + ScriptPath: '$(Build.SourcesDirectory)/onnxruntime/tools/nuget/validate_package.py' + PlatformsSupported: 'linux-x64' + VerifyNugetSigning: false + workingDirectory: '$(Build.ArtifactStagingDirectory)' + + + - task: CmdLine@2 + displayName: 'Test C API application for GPU package' + inputs: + script: | + docker run --gpus all -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume $(Build.SourcesDirectory):/src_dir \ + --volume $(Build.ArtifactStagingDirectory):/artifact_src -e NIGHTLY_BUILD onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build \ + /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet/run_capi_application.sh -o /src_dir/onnxruntime -p /artifact_src/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz -w /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet + workingDirectory: '$(Build.ArtifactStagingDirectory)' + + - task: PublishPipelineArtifact@1 + inputs: + targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' + artifactName: 'onnxruntime-linux-x64-gpu' diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml new file mode 100644 index 0000000000000..3fb653c6b4405 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -0,0 +1,147 @@ +parameters: +- name: RunOnnxRuntimeTests + type: boolean + default: true + +- name: UseIncreasedTimeoutForTests + type: boolean + default: false + +- name: DoCompliance + type: boolean + default: true + +- name: DoEsrp + type: boolean + default: true + +- name: CudaVersion + type: string + default: '11.8' +- name: win_cuda_home + type: string +- name: win_trt_home + type: string + +stages: +# Windows CUDA without TensorRT Packaging +- template: ../templates/win-ci.yml + parameters: + ort_build_pool_name: 'onnxruntime-Win2022-GPU-T4' + DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + stage_name_suffix: gpu + buildArch: x64 + msbuildPlatform: x64 + packageName: x64-cuda + CudaVersion: ${{ parameters.CudaVersion }} + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + runTests: ${{ parameters.RunOnnxRuntimeTests }} + buildJava: false + java_artifact_id: onnxruntime_gpu + PublishProtoc: true +# Windows CUDA with TensorRT Packaging +- template: ../templates/win-ci.yml + parameters: + ort_build_pool_name: 'onnxruntime-Win2022-GPU-T4' 
+ DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + stage_name_suffix: tensorrt + buildArch: x64 + msbuildPlatform: x64 + CudaVersion: ${{ parameters.CudaVersion }} + packageName: x64-tensorrt + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + runTests: ${{ parameters.RunOnnxRuntimeTests }} + buildJava: false + java_artifact_id: onnxruntime_gpu + UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} + +# Windows CUDA Combined Testing and Publishing +- stage: Windows_Packaging_combined_GPU + dependsOn: + - Windows_Packaging_gpu + - Windows_Packaging_tensorrt + condition: succeeded() + + jobs: + - job: + workspace: + clean: all + pool: 'onnxruntime-Win2022-GPU-T4' + variables: + CUDA_MODULE_LOADINGL: 'LAZY' + GRADLE_OPTS: '-Dorg.gradle.daemon=false' + steps: + - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime + - checkout: onnxruntime-inference-examples # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime-inference-examples + submodules: false + - script: dir $(Build.SourcesDirectory) + - template: ../templates/jobs/download_win_gpu_library.yml + parameters: + DownloadCUDA: true + DownloadTRT: true + CudaVersion: ${{ parameters.CudaVersion }} + + - template: ../templates/set-version-number-variables-step.yml + parameters: + versionFileDirectory: '$(Build.SourcesDirectory)\onnxruntime' + workingDirectory: '$(Build.SourcesDirectory)\onnxruntime' + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - onnxruntime-win-x64-cuda' + inputs: + artifactName: 'onnxruntime-win-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/zip-artifacts' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - onnxruntime-win-x64-tensorrt' + inputs: + artifactName: 'onnxruntime-win-x64-tensorrt' + targetPath: '$(Build.BinariesDirectory)/zip-artifacts' + + - task: PowerShell@2 + displayName: 'PowerShell Script' + inputs: + targetType: filePath + filePath: $(Build.SourcesDirectory)\onnxruntime\tools\ci_build\github\windows\extract_zip_files_gpu.ps1 + + - script: | + dir + workingDirectory: '$(Build.BinariesDirectory)/zip-artifacts' + displayName: 'List artifacts' + + - task: BatchScript@1 + displayName: 'Bundle CUDA/TRT EP binaries' + inputs: + filename: $(Build.SourcesDirectory)\onnxruntime\tools\ci_build\github\windows\bundle_dlls_gpu.bat + workingFolder: $(Build.BinariesDirectory)\zip-artifacts + + - task: CopyFiles@2 + displayName: 'Copy zip file to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\zip-artifacts' + Contents: 'onnxruntime-win-x64-gpu-*.zip' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - template: ../templates/validate-package.yml + parameters: + PackageType: 'zip' + PackagePath: '$(Build.ArtifactStagingDirectory)' + PackageName: 'onnxruntime-win-x64-gpu-$(OnnxRuntimeVersion).zip' + ScriptPath: '$(Build.SourcesDirectory)\onnxruntime\tools\nuget\validate_package.py' + PlatformsSupported: 'win-x64' + VerifyNugetSigning: false + workingDirectory: '$(Build.ArtifactStagingDirectory)' + + - task: BatchScript@1 + displayName: 'Test C API application for GPU package' + inputs: + filename: 
$(Build.SourcesDirectory)\onnxruntime-inference-examples\c_cxx\squeezenet\run_capi_application.bat + arguments: $(Build.SourcesDirectory)\onnxruntime $(Build.ArtifactStagingDirectory)\onnxruntime-win-x64-gpu-$(OnnxRuntimeVersion).zip $(Build.SourcesDirectory)\onnxruntime-inference-examples\c_cxx\squeezenet + workingFolder: '$(Build.ArtifactStagingDirectory)' + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline Combined GPU Package Artifact' + inputs: + artifactName: 'onnxruntime-win-x64-gpu' + targetPath: '$(Build.ArtifactStagingDirectory)' \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index ff7f0957e94ba..b7ae9ffa3c219 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,7 +13,6 @@ parameters: - 12.2 steps: - - ${{ if eq(parameters.DownloadCUDA, true) }}: - powershell: | azcopy.exe cp --recursive https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ parameters.CudaVersion }} $(Agent.TempDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml index 85562d7758ab2..7693e8f2cd21c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml @@ -23,12 +23,33 @@ parameters: type: string default: '' +- name: CudaVersion + displayName: CUDA version + type: string + default: '11.8' + values: + - 11.8 + - 12.2 + + + # We only have CUDA/TRT on x64. We do not have a build for CUDA/TRT for ARM64. 
# Therefore this file does not have an `OnnxruntimeNodejsBindingArch` parameter stages: - stage: Linux_C_API_Packaging_GPU_TensorRT_x64 dependsOn: [] + variables: + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 jobs: - job: dependsOn: [] @@ -37,7 +58,13 @@ stages: timeoutInMinutes: 180 pool: 'Onnxruntime-Linux-GPU' variables: - CUDA_VERSION: '11.8' + - name: CUDA_VERSION_MAJOR + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: '11' + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: '12' + - name: CUDA_VERSION + value: ${{ parameters.CudaVersion }} steps: - checkout: self clean: true @@ -48,11 +75,11 @@ stages: Context: tools/ci_build/github/linux/docker DockerBuildArgs: " --network=host - --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg BASEIMAGE=${{ variables.docker_base_image }} + --build-arg TRT_VERSION=${{ variables.linux_trt_version }} --build-arg BUILD_UID=$( id -u ) " - Repository: onnxruntimecuda118xtrt86build + Repository: onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build - template: set-version-number-variables-step.yml - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 8d28b4ce580b4..0fb6966c141db 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -11,6 +11,7 @@ parameters: - name: EnvSetupScript type: string + default: '' - name: buildArch type: string @@ -63,11 +64,24 @@ parameters: type: boolean default: false +- name: PublishProtoc + type: boolean + default: false + +- name: CudaVersion + type: string + default: '11.8' + values: + - 11.8 + - 12.2 + stages: - stage: Windows_Packaging_${{ parameters.stage_name_suffix }} dependsOn: [] variables: + GRADLE_OPTS: '-Dorg.gradle.daemon=false' VSGenerator: 'Visual Studio 17 2022' + CUDA_MODULE_LOADING: 'LAZY' jobs: - job: workspace: @@ -102,12 +116,26 @@ stages: condition: and(succeeded(), eq('${{ parameters.buildNodejs}}', true)) inputs: versionSpec: '18.x' + - ${{ if ne(parameters.EnvSetupScript, '') }}: + - template: jobs/set-winenv.yml + parameters: + EnvSetupScript: ${{ parameters.EnvSetupScript }} + ${{ if contains(parameters.buildparameter, 'use_cuda') }}: + DownloadCUDA: true - - template: jobs/set-winenv.yml - parameters: - EnvSetupScript: ${{ parameters.EnvSetupScript }} - ${{ if contains(parameters.buildparameter, 'use_cuda') }}: - DownloadCUDA: true + - ${{ if eq(parameters.EnvSetupScript, '') }}: + - template: jobs/download_win_gpu_library.yml + parameters: + CudaVersion: ${{ parameters.CudaVersion }} + ${{ if contains(parameters.buildparameter, 'use_cuda') }}: + DownloadCUDA: true + ${{ if contains(parameters.buildparameter, 'use_tensorrt') }}: + DownloadCUDA: true + DownloadTRT: true + - powershell: | + Write-Host "##vso[task.prependpath]C:\Program Files (x86)\dotnet" + displayName: 'Append dotnet x86 Directory to PATH' + condition: and(succeeded(), eq('${{ parameters.buildArch}}', 'x86')) - template: download-deps.yml 
@@ -178,9 +206,11 @@ stages: artifactName: 'drop-onnxruntime-nodejs-win-${{ parameters.packageName }}' DoEsrp: ${{ parameters.DoEsrp }} - #Upload protoc.exe, which will be used in nuget build for generating C# files + # Upload protoc.exe, which will be used in nuget build for generating C# files + # TODO: We need to make this step independent of the packageName, so that it can be used in test_win.yml - task: PublishPipelineArtifact@1 - condition: and(succeeded(), eq('${{ parameters.packageName}}', 'x64')) + displayName: Publish protoc as drop-extra + condition: and(succeeded(), or(eq('${{ parameters.packageName}}', 'x64'), eq('${{ parameters.PublishProtoc}}', true))) inputs: targetPath: '$(Build.BinariesDirectory)\RelWithDebInfo\installed\bin\protoc.exe' artifactName: 'drop-extra${{ parameters.artifact_name_suffix }}' @@ -194,9 +224,10 @@ stages: Contents: 'custom_op_library.dll' TargetFolder: '$(Build.ArtifactStagingDirectory)/testdata' - #To be used in test_win.yml + #To be used in test_win. + # TODO: Do we need to publish protoc twice? - task: PublishPipelineArtifact@1 - condition: and(succeeded(), eq('${{ parameters.packageName}}', 'x64')) + condition: and(succeeded(), or(eq('${{ parameters.packageName}}', 'x64'), eq('${{ parameters.PublishProtoc}}', true))) inputs: targetPath: '$(Build.BinariesDirectory)\RelWithDebInfo\installed\bin\protoc.exe' artifactName: 'drop-nuget${{ parameters.artifact_name_suffix }}' diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index 5cd1c8c243050..2ec8bc82ae048 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -4,7 +4,7 @@ export CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protect export CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" docker run --gpus all -e CFLAGS -e CXXFLAGS -e NVIDIA_VISIBLE_DEVICES=all --rm --volume \ $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ ---volume /data/models:/build/models:ro --volume /data/onnx:/data/onnx:ro -e NIGHTLY_BUILD onnxruntimecuda11centosbuild \ +--volume /data/models:/build/models:ro --volume /data/onnx:/data/onnx:ro -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ /usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --build_java --build_nodejs --build_dir /build --config Release \ --skip_submodule_sync --parallel --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION \ --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION \ diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh index 18a32e3599391..5bf6a69170074 100755 --- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -4,6 +4,6 @@ export CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protect export CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" mkdir -p $HOME/.onnx docker run --gpus all -e CFLAGS -e CXXFLAGS -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ ---volume /data/models:/build/models:ro --volume 
$HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda118xtrt86build \ +--volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \ --skip_submodule_sync --parallel --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda index d4aa9b269095f..8f265b208cd47 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -8,6 +8,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 ARG DEVTOOLSET_ROOTPATH=/usr ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64 ARG PREPEND_PATH=/usr/local/cuda/binet +ARG TRT_VERSION=8.6.1.6-1.cuda11.8 #Build manylinux docker image begin FROM $BASEIMAGE AS runtime_base diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 index bbdb411b790a0..8ef8e05b8ac77 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 @@ -5,8 +5,10 @@ # Dockerfile to Test ONNX Runtime on UBI8 with CUDA 11.8 and TensorRT 8.6 # Build base image with required system packages -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubi8 AS base - +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 +ARG TRT_VERSION=8.6.1.6-1.cuda11.8 +FROM $BASEIMAGE AS base +ARG TRT_VERSION ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} RUN dnf install -y bash wget &&\ @@ -26,8 +28,7 @@ RUN pip3 install setuptools>=68.2.2 # Install TensorRT RUN dnf install -y libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-lean8 libnvinfer-vc-plugin8 libnvinfer-dispatch8 -RUN v="8.6.1.6-1+cuda11.8" &&\ - dnf downgrade -y libnvinfer8-${v} libnvinfer8-${v} libnvonnxparsers8-${v} libnvparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-lean8-${v} libnvinfer-vc-plugin8-${v} libnvinfer-dispatch8-${v} &&\ +RUN dnf downgrade -y libnvinfer8-${TRT_VERSION} libnvinfer8-${TRT_VERSION} libnvonnxparsers8-${TRT_VERSION} libnvparsers8-${TRT_VERSION} libnvinfer-plugin8-${TRT_VERSION} libnvinfer-lean8-${TRT_VERSION} libnvinfer-vc-plugin8-${TRT_VERSION} libnvinfer-dispatch8-${TRT_VERSION} &&\ dnf install -y dnf-plugin-versionlock &&\ dnf versionlock libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-lean8 libnvinfer-vc-plugin8 libnvinfer-dispatch8 RUN dnf clean dbcache diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu similarity index 50% rename from tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 rename to tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 83a974469234f..9b9dc9ecae822 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 +++ 
b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -5,11 +5,16 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base - +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +ARG TRT_VERSION=8.6.1.6-1+cuda11.8 +ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 +FROM $BASEIMAGE AS base +ARG TRT_VERSION ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG}:${LD_LIBRARY_PATH} + RUN apt-get update &&\ apt-get install -y git bash wget @@ -24,12 +29,11 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip # Install TensorRT -RUN v="8.6.1.6-1+cuda11.8" &&\ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ - apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} libnvinfer-lean8=${v} libnvinfer-vc-plugin8=${v} libnvinfer-dispatch8=${v}\ - libnvinfer-headers-dev=${v} libnvinfer-headers-plugin-dev=${v} libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} libnvinfer-lean-dev=${v} libnvinfer-vc-plugin-dev=${v} libnvinfer-dispatch-dev=${v}\ - python3-libnvinfer=${v} libnvinfer-samples=${v} tensorrt-dev=${v} tensorrt-libs=${v} + apt-get install -y libnvinfer8=${TRT_VERSION} libnvonnxparsers8=${TRT_VERSION} libnvparsers8=${TRT_VERSION} libnvinfer-plugin8=${TRT_VERSION} libnvinfer-lean8=${TRT_VERSION} libnvinfer-vc-plugin8=${TRT_VERSION} libnvinfer-dispatch8=${TRT_VERSION}\ + libnvinfer-headers-dev=${TRT_VERSION} libnvinfer-headers-plugin-dev=${TRT_VERSION} libnvinfer-dev=${TRT_VERSION} libnvonnxparsers-dev=${TRT_VERSION} libnvparsers-dev=${TRT_VERSION} libnvinfer-plugin-dev=${TRT_VERSION} libnvinfer-lean-dev=${TRT_VERSION} libnvinfer-vc-plugin-dev=${TRT_VERSION} libnvinfer-dispatch-dev=${TRT_VERSION}\ + python3-libnvinfer=${TRT_VERSION} libnvinfer-samples=${TRT_VERSION} tensorrt-dev=${TRT_VERSION} tensorrt-libs=${TRT_VERSION} ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile index 318791072f46d..b1ff40e8effef 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile @@ -2,8 +2,8 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 +FROM $BASEIMAGE ENV PATH /usr/lib/jvm/msopenjdk-11/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 From a6d872640764ea50ec460f7a717e5b369921f8b4 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 29 Nov 2023 01:04:25 +0800 Subject: [PATCH 5/6] Update ADO windows image to custom image (#18598) ### Description Update Azure-Pipelines-EO-Windows2022-aiinfra to onnxruntime-win-CPU-2022 in Nuget_Package_CPU. 
To make the debugging easier, use flex-downloadPipelineArtifact ### Motivation and Context Azure-Pipelines-EO-Windows2022-aiinfra is using 1ES window-latest image. The pipeline might be failed by unexpected upgrade. Verified: https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=384425&view=results ### P.S. I think we should replace all Azure-Pipelines-EO-Windows2022-aiinfra. --- .../azure-pipelines/templates/c-api-cpu.yml | 126 ++++++++++-------- 1 file changed, 72 insertions(+), 54 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 4ce39ecc35bfb..cfd2931665d17 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -304,9 +304,7 @@ stages: - job: workspace: clean: all - # we need to use the 2022 pool to create the nuget package with both pre-net6+Xamarin and net6 targets. - # VS2019 has no support for net6 and we need to use msbuild (from the VS install) to do the packing - pool: 'Azure-Pipelines-EO-Windows2022-aiinfra' + pool: 'onnxruntime-Win-CPU-2022' variables: OrtPackageId: ${{ parameters.OrtNugetPackageId }} breakCodesignValidationInjection: ${{ parameters.DoEsrp }} @@ -315,66 +313,86 @@ stages: steps: - checkout: self submodules: true - - task: DownloadPipelineArtifact@0 - displayName: 'Download win-x64 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-win-x64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' - - task: DownloadPipelineArtifact@0 - displayName: 'Download win-x86 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-win-x86' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Pipeline Artifact - Win x64' + ArtifactName: 'onnxruntime-win-x64' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download win-arm64 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-win-arm64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download win-x86 Pipeline Artifact' + ArtifactName: 'onnxruntime-win-x86' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download win-arm Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-win-arm' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download win-arm64 Pipeline Artifact' + ArtifactName: 'onnxruntime-win-arm64' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download osx-x64 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-osx' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download win-arm Pipeline Artifact' + ArtifactName: 'onnxruntime-win-arm' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: 
DownloadPipelineArtifact@0 - displayName: 'Download linux-x64 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-linux-x64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download osx-x64 Pipeline Artifact' + ArtifactName: 'onnxruntime-osx' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet' - inputs: - artifactName: 'onnxruntime-linux-aarch64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download linux-x64 Pipeline Artifact' + ArtifactName: 'onnxruntime-linux-x64' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@2 - displayName: 'Download iOS Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-ios-full-xcframework' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download linux-aarch64 Pipeline Artifact' + ArtifactName: 'onnxruntime-linux-aarch64' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@2 - displayName: 'Download android-full-aar Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-android-full-aar' - patterns: '**/*.aar' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download iOS Pipeline Artifact' + ArtifactName: 'onnxruntime-ios-full-xcframework' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download drop-extra Pipeline Artifact' - inputs: - artifactName: 'drop-extra' - targetPath: '$(Build.BinariesDirectory)/extra-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Android-full-aar Pipeline Artifact' + ArtifactName: 'onnxruntime-android-full-aar' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download drop-extra Pipeline Artifact' + ArtifactName: 'drop-extra' + TargetPath: '$(Build.BinariesDirectory)/extra-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - script: | dir From 0b7048e7d621b271b0ab4748e566f57d11b49be5 Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Tue, 28 Nov 2023 09:26:48 -0800 Subject: [PATCH 6/6] Update winml to use #cores - #soc cores by Default as the number of intraopthreads (#18384) Update winml to use #cores - #soc cores by Default as the number of intraopthreads --------- Co-authored-by: Sheil Kumar --- cmake/winml.cmake | 2 + winml/lib/Api/HardwareCoreEnumerator.cpp | 90 +++++++++++++++++++ winml/lib/Api/HardwareCoreEnumerator.h | 11 +++ winml/lib/Api/LearningModelDevice.cpp | 3 +- winml/lib/Api/LearningModelSessionOptions.cpp | 11 ++- winml/lib/Api/LearningModelSessionOptions.h | 4 +- 
 .../test/api/LearningModelSessionAPITest.cpp  |  6 --
 7 files changed, 117 insertions(+), 10 deletions(-)
 create mode 100644 winml/lib/Api/HardwareCoreEnumerator.cpp
 create mode 100644 winml/lib/Api/HardwareCoreEnumerator.h

diff --git a/cmake/winml.cmake b/cmake/winml.cmake
index 395996f0fa4b9..268ee3960e75a 100644
--- a/cmake/winml.cmake
+++ b/cmake/winml.cmake
@@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
   ${winml_lib_api_dir}/impl/TensorKindFrom.h
   ${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
   ${winml_lib_api_dir}/NumericData.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.h
   ${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
   ${winml_lib_api_dir}/ImageFeatureDescriptor.h
   ${winml_lib_api_dir}/ImageFeatureValue.cpp
diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp
new file mode 100644
index 0000000000000..a89ac561f8860
--- /dev/null
+++ b/winml/lib/Api/HardwareCoreEnumerator.cpp
@@ -0,0 +1,90 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "lib/Api/pch/pch.h"
+
+#include "HardwareCoreEnumerator.h"
+
+namespace WINMLP {
+
+struct LogicalProcessorInformation {
+  std::unique_ptr<char[]> Buffer;
+  size_t Length;
+};
+
+struct CoreCounter {
+  uint32_t PhysicalCores = 0;
+  uint32_t SocDieCores = 0;
+};
+
+static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
+  DWORD length = 0;
+  DWORD rc = GetLogicalProcessorInformationEx(relationship, nullptr, &length);
+
+  assert(rc == FALSE);
+
+  auto processorInformationBytes = std::make_unique<char[]>(length);
+
+  rc = GetLogicalProcessorInformationEx(
+    relationship, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(processorInformationBytes.get()), &length
+  );
+
+  assert(rc == TRUE);
+
+  return {std::move(processorInformationBytes), length};
+}
+
+uint32_t CountSetBits(DWORD input) {
+  uint32_t c;
+  for (c = 0; input; c++) {
+    input &= input - 1;
+  }
+  return c;
+}
+
+static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
+  auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);
+
+  CoreCounter cores;
+  DWORD dwLevel2GroupMask = 0;
+  DWORD dwLevel3GroupMask = 0;
+  size_t read = 0;
+  PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX currentProcessorInfo = NULL;
+
+  while ((read + FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor)) < logicalProcessorInformation.Length
+  ) {
+    currentProcessorInfo =
+      reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(logicalProcessorInformation.Buffer.get() + read);
+    if ((read + currentProcessorInfo->Size) > logicalProcessorInformation.Length) {
+      break;
+    }
+
+    switch (currentProcessorInfo->Relationship) {
+      case RelationProcessorCore:
+        cores.PhysicalCores++;
+        break;
+      case RelationCache:
+        if (currentProcessorInfo->Cache.Level == 2) {
+          dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
+        } else if (currentProcessorInfo->Cache.Level == 3) {
+          dwLevel3GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
+        }
+        break;
+    }
+
+    read += currentProcessorInfo->Size;
+  }
+
+  cores.SocDieCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
+  return cores;
+}
+
+uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
+  // # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
+  // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
+  auto cores = GetNumberOPhysicalAndEngineeringCores();
+  // We want to use the number of physical cores, but exclude soc cores
+  return cores.PhysicalCores - cores.SocDieCores;
+}
+
+} // namespace WINMLP
diff --git a/winml/lib/Api/HardwareCoreEnumerator.h b/winml/lib/Api/HardwareCoreEnumerator.h
new file mode 100644
index 0000000000000..6861ba7d46bcf
--- /dev/null
+++ b/winml/lib/Api/HardwareCoreEnumerator.h
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace WINMLP {
+struct HardwareCoreEnumerator {
+  HardwareCoreEnumerator() = delete;
+  static uint32_t DefaultIntraOpNumThreads();
+};
+} // namespace WINMLP
diff --git a/winml/lib/Api/LearningModelDevice.cpp b/winml/lib/Api/LearningModelDevice.cpp
index c9c6f5bc70ee2..9f48ee03886e1 100644
--- a/winml/lib/Api/LearningModelDevice.cpp
+++ b/winml/lib/Api/LearningModelDevice.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include "D3DDeviceCache.h"
+#include "HardwareCoreEnumerator.h"

 #include "ConverterResourceStore.h"
@@ -131,7 +132,7 @@ LearningModelDevice::CacheThreadPool(_winml::IThreading* thread_pool) {

 uint32_t LearningModelDevice::NumberOfIntraOpThreads() {
   if (IsCpuDevice()) {
-    return std::thread::hardware_concurrency();
+    return HardwareCoreEnumerator::DefaultIntraOpNumThreads();
   } else {
     // GPU sessions should not rely on intra op threads.
     // Creating a large thread pool is unnecessary and wasteful, and can cause
diff --git a/winml/lib/Api/LearningModelSessionOptions.cpp b/winml/lib/Api/LearningModelSessionOptions.cpp
index 2ff9c6d1d56d0..374200fb3b9f8 100644
--- a/winml/lib/Api/LearningModelSessionOptions.cpp
+++ b/winml/lib/Api/LearningModelSessionOptions.cpp
@@ -3,11 +3,20 @@
 #include "lib/Api/pch/pch.h"
 #include "LearningModelSessionOptions.h"
+#include "HardwareCoreEnumerator.h"

 namespace WINMLP {
+
+LearningModelSessionOptions::LearningModelSessionOptions() {
+  intra_op_num_threads_override_ = HardwareCoreEnumerator::DefaultIntraOpNumThreads();
+}
+
 LearningModelSessionOptions::LearningModelSessionOptions(const LearningModelSessionOptions& options)
   : batch_size_override_(options.batch_size_override_),
-    close_model_on_session_creation_(options.close_model_on_session_creation_) {
+    close_model_on_session_creation_(options.close_model_on_session_creation_),
+    named_dim_overrides_(options.named_dim_overrides_),
+    intra_op_num_threads_override_(options.intra_op_num_threads_override_),
+    custom_ops_lib_paths_(options.custom_ops_lib_paths_) {
 }

 uint32_t LearningModelSessionOptions::BatchSizeOverride() {
diff --git a/winml/lib/Api/LearningModelSessionOptions.h b/winml/lib/Api/LearningModelSessionOptions.h
index 5fc7e54997403..21d0242735f94 100644
--- a/winml/lib/Api/LearningModelSessionOptions.h
+++ b/winml/lib/Api/LearningModelSessionOptions.h
@@ -11,7 +11,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
     LearningModelSessionOptions,
     ILearningModelSessionOptionsNative,
     ILearningModelSessionOptionsNative1> {
-  LearningModelSessionOptions() = default;
+  LearningModelSessionOptions();

   LearningModelSessionOptions(const LearningModelSessionOptions& options);

@@ -72,7 +72,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
   // The intra operator num threads property is used to control the number of threads used in the threadpool for intra operator calculations.
   // The default value here is the maximum number of logical cores to ensure that the default behavior of WinML always runs the fastest.
   // WARNING: Setting a number higher than the maximum number of logical cores may result in an inefficient threadpool
-  uint32_t intra_op_num_threads_override_ = std::thread::hardware_concurrency();
+  uint32_t intra_op_num_threads_override_;

   bool allow_thread_spinning_ = true;

diff --git a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp
index 4ec79b8a0f4c6..d6e70e35e3a6d 100644
--- a/winml/test/api/LearningModelSessionAPITest.cpp
+++ b/winml/test/api/LearningModelSessionAPITest.cpp
@@ -2195,12 +2195,6 @@ static void SetIntraOpNumThreads() {
   auto binding = LearningModelBinding(session);
   binding.Bind(L"input", tensor_input);
   WINML_EXPECT_NO_THROW(session.Evaluate(binding, L""));
-
-  // Check to verify that the default number of threads in LearningModelSession is equal to the number of logical cores.
-  session = LearningModelSession(model, device);
-  nativeSession = session.as();
-  WINML_EXPECT_NO_THROW(nativeSession->GetIntraOpNumThreads(&numIntraOpThreads));
-  WINML_EXPECT_EQUAL(std::thread::hardware_concurrency(), numIntraOpThreads);
 }

 static void SetIntraOpThreadSpinning() {
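For readers skimming the patch, a minimal standalone sketch of the thread-count policy it introduces follows. The core counts below are hypothetical and only illustrate the arithmetic; they are not taken from the patch or from any particular CPU.

    // Hypothetical hybrid CPU; only the formula mirrors the patch's PhysicalCores - SocDieCores rule.
    #include <cstdint>
    #include <iostream>

    int main() {
      uint32_t p_cores = 8;    // performance cores, counted via RelationProcessorCore
      uint32_t e_cores = 4;    // efficiency cores, also counted via RelationProcessorCore
      uint32_t soc_cores = 2;  // low-power cores visible only behind an L2 mask with no L3 mask
      uint32_t physical_cores = p_cores + e_cores + soc_cores;         // 14 physical cores in total
      uint32_t default_intra_op_threads = physical_cores - soc_cores;  // 12 threads by default
      std::cout << default_intra_op_threads << "\n";
      return 0;
    }

On such a machine the new default would be 12 intra-op threads, even if hyper-threading reports more logical processors; callers who want a different value can still set it explicitly through the session options override.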