From fc8631e2f11d85c84ab9cc711aacb9c589b6f71a Mon Sep 17 00:00:00 2001
From: Jiajia Qin
Date: Tue, 28 Nov 2023 13:21:47 +0800
Subject: [PATCH 1/6] [js/web] Fix conv2dMatmul errors due to #18452 (#18562)

### Description
Currently, all conv2dMatmul with inChannels = 3 and outChannels % 4 = 0 will report compilation errors. Models that include this kind of shape, such as mobilenetv2-12 and resnet50, are impacted. The error was introduced by #18452 https://github.com/microsoft/onnxruntime/pull/18452/files#diff-8b24ea43aa11b1346c0c9e327f9bce6b37a93bd8f2bf8a6392b2b263972b7ea2R200, which accidentally passes `components` to `x`. However, `x`'s component count is `innerElementSize`, not `components`. And when `innerElementSize` is 3, we should use `1` in the current design.
---
 .../webgpu/ops/3rd-party/conv2d_mm_webgpu.ts |  5 +--
 js/web/test/data/ops/conv.jsonc              | 32 ++++++++++++++++++-
 2 files changed, 34 insertions(+), 3 deletions(-)

diff --git a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
index 22f942a0d9ab4..3638938df7dbe 100644
--- a/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
+++ b/js/web/lib/wasm/jsep/webgpu/ops/3rd-party/conv2d_mm_webgpu.ts
@@ -180,7 +180,7 @@ export const createConv2DMatMulProgramInfo =
       LOG_DEBUG('verbose', () => `[conv2d_mm_webgpu] dispatch = ${dispatch}`);
-      const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : elementsPerThread[0];
+      const innerElementSize = isVec4 ? (isChannelsLast && inChannels % 4 !== 0 ? 3 : 4) : 1;
       const tileAOuter = workGroupSize[1] * elementsPerThread[1];
       const tileBOuter = workGroupSize[0] * elementsPerThread[0];
@@ -197,7 +197,8 @@ export const createConv2DMatMulProgramInfo =
       const components = isVec4 ? 4 : 1;
       const programUniforms: ProgramUniform[] =
          [{type: 'int32', data: dimAOuter}, {type: 'int32', data: dimBOuter}, {type: 'int32', data: dimInner}];
-      const x = inputVariable('x', inputs[0].dataType, inputs[0].dims.length, components);
+      const x =
+          inputVariable('x', inputs[0].dataType, inputs[0].dims.length, innerElementSize === 3 ? 
1 : innerElementSize); const w = inputVariable('w', inputs[1].dataType, inputs[1].dims.length, components); const inputVariables = [x, w]; diff --git a/js/web/test/data/ops/conv.jsonc b/js/web/test/data/ops/conv.jsonc index 219e15eb4648f..2e8eaaba191d0 100644 --- a/js/web/test/data/ops/conv.jsonc +++ b/js/web/test/data/ops/conv.jsonc @@ -126,7 +126,7 @@ ] }, { - "name": "conv with bias addition C", + "name": "conv with bias addition C - NHWC", "operator": "Conv", "inputShapeDefinitions": "rankOnly", "opset": { "domain": "", "version": 17 }, @@ -158,6 +158,36 @@ "type": "float32" } ] + }, + { + "name": "inChannel = 3, outChannel = 4", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 10], + "dims": [1, 3, 3, 3], + "type": "float32" + }, + { + "data": [ + 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 1, 1, 1, 1, 2, 3, 4, 5, 6, 7, 8, 9, + 10, 11, 12, 13, 14, 15, 16, 17, 1, 2, 3, 4, 5, 6, 7, 8 + ], + "dims": [4, 3, 2, 2], + "type": "float32" + }, + { + "data": [5, 6, 7, 8], + "dims": [4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [360, 334, 271, 323, 909, 963, 1024, 1028, 683, 655, 576, 650, 473, 508, 570, 677], + "dims": [1, 4, 2, 2], + "type": "float32" + } + ] } ] }, From 3f42fbad2e42cf03c01eb0428b06e24f4ad2d427 Mon Sep 17 00:00:00 2001 From: Ran Gal <79867742+galran@users.noreply.github.com> Date: Mon, 27 Nov 2023 23:54:38 -0800 Subject: [PATCH 2/6] deleted the unused random_device variables because they caused a warning that was treated like an error. (#18543) deleted the unused random_device variables because they caused a warning that was treated like an error. **_Please check if the declaration is required for the random number generation. if so, there need to be a dummy reference to the variable or turning off the warning as error behavior._** ### Description ### Motivation and Context --- orttraining/orttraining/test/gradient/optimizer_ops_test.cc | 2 -- .../test/training_ops/cpu/reduction/reduction_ops_test.cc | 1 - 2 files changed, 3 deletions(-) diff --git a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc index c100730aacc44..bfb59f1525e47 100644 --- a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc +++ b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc @@ -1542,7 +1542,6 @@ TEST(OptimizerTest, LambOptimizerTestLarge) { std::vector m(size); std::vector v(size); - std::random_device random_device; std::mt19937 random_engine(0); std::uniform_real_distribution dist(0.1f, 1.0f); for (int i = 0; i < size; ++i) { @@ -1581,7 +1580,6 @@ TEST(OptimizerTest, LambOptimizerTestLarge) { TEST(OptimizerTest, LambOptimizerMultiTensorRatio) { constexpr int group_count = 127; - std::random_device random_device; std::mt19937 random_engine(0); std::uniform_real_distribution dist(0.1f, 1.0f); std::uniform_int_distribution dist_int(1, 1228); diff --git a/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc b/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc index be8b0aaa0bce1..60c3ecbcce8ce 100644 --- a/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/reduction/reduction_ops_test.cc @@ -275,7 +275,6 @@ void TestMultiTensorReduce( test.SetDeterminism(use_determinism); // Set up random number generator. 
-  std::random_device random_device;
   std::mt19937 random_engine(0);
   std::uniform_real_distribution dist(min, max);
   std::uniform_int_distribution dist_int(min_tensor_size, max_tensor_size);

From 94a6020a7f59f22101653988a36bca02593eb816 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Xavier=20Dupr=C3=A9?=
Date: Tue, 28 Nov 2023 03:56:00 -0800
Subject: [PATCH 3/6] Improve parallelization of TfIdfVectorizer, Reduce memory consumption (#18539)

### Description
TfIdfVectorizer has two steps: first, search for n-grams in the input; second, weight the results. The second step was not parallelized. The PR addresses that issue.

Previously, two vectors of the size of the output were allocated to compute the results. The first one, frequencies, was used as an intermediate vector between the two steps. This vector is now broken into multiple small vectors, one per thread. The memory consumption is therefore reduced for batches with a number of rows greater than the number of threads.

### Motivation and Context
Performance and memory consumption. For one model, the improvement is +15% (4 cores, model size is ~6 MB, batch size is 100). Here is another benchmark on a machine with 32 cores, with different vocabulary sizes and batch sizes. The tested TfIdfVectorizer only deals with unigrams and processes sequences of 10 tokens (integers).

![image](https://github.com/microsoft/onnxruntime/assets/22452781/0bb9abe9-ed81-44da-b5c4-ad2a12f129bd)
---
 .../core/providers/cpu/nn/tfidfvectorizer.cc | 154 ++++++++----------
 .../core/providers/cpu/nn/tfidfvectorizer.h  |   7 +-
 2 files changed, 71 insertions(+), 90 deletions(-)

diff --git a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
index f36b75c508da0..eb245a4c9ba0c 100644
--- a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
+++ b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.cc
@@ -141,14 +141,11 @@ struct TfIdfVectorizer::Impl {
   Impl(const Impl&) = delete;
   Impl& operator=(const Impl&) = delete;
-  void IncrementCount(size_t ngram_id, size_t row_num,
-                      std::vector& frequencies) const {
+  inline size_t OutputIdToIncrement(size_t ngram_id) const {
     assert(ngram_id != 0);
     --ngram_id;
     assert(ngram_id < ngram_indexes_.size());
-    size_t output_idx = row_num * output_size_ + SafeInt(ngram_indexes_[ngram_id]);
-    assert(output_idx < frequencies.size());
-    ++frequencies[output_idx];
+    return SafeInt(ngram_indexes_[ngram_id]);
   }
 };
@@ -252,77 +249,17 @@ TfIdfVectorizer::TfIdfVectorizer(const OpKernelInfo& info) : OpKernel(info), imp
 TfIdfVectorizer::~TfIdfVectorizer() = default;
-void TfIdfVectorizer::OutputResult(OpKernelContext* ctx, size_t B, const std::vector& frequences) const {
-  const Impl& impl = *impl_;
-  std::vector output_dims;
-  if (B == 0) {
-    output_dims.push_back(impl.output_size_);
-    B = 1;  // For use in the loops below
-  } else {
-    output_dims.push_back(B);
-    output_dims.push_back(impl.output_size_);
-  }
-
-  const auto row_size = impl.output_size_;
-
-  TensorShape output_shape(output_dims);
-  assert(frequences.size() == static_cast(output_shape.Size()));
-
-  auto Y = ctx->Output(0, output_shape);
-  auto output_data = Y->MutableData();
-  const auto& w = impl.weights_;
-  switch (impl.weighting_criteria_) {
-    case kTF: {
-      for (auto f : frequences) {
-        *output_data++ = static_cast(f);
-      }
-    } break;
-    case kIDF: {
-      if (!w.empty()) {
-        const auto* freqs = frequences.data();
-        for (size_t batch = 0; batch < B; ++batch) {
-          for (size_t i = 0; i < row_size; ++i) {
-            *output_data++ = (*freqs++ > 0) ? 
w[i] : 0; - } - } - } else { - for (auto f : frequences) { - *output_data++ = (f > 0) ? 1.0f : 0; - } - } - } break; - case kTFIDF: { - if (!w.empty()) { - const auto* freqs = frequences.data(); - for (size_t batch = 0; batch < B; ++batch) { - for (size_t i = 0; i < row_size; ++i) { - *output_data++ = *freqs++ * w[i]; - } - } - } else { - for (auto f : frequences) { - *output_data++ = static_cast(f); - } - } - } break; - case kNone: // fall-through - default: - assert(false); - } -} - -void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_t row_size, - std::vector& frequencies) const { - auto X = ctx->Input(0); - const auto elem_size = X->DataType()->Size(); - - const void* const row_begin = AdvanceElementPtr(X->DataRaw(), row_num * row_size, elem_size); +void TfIdfVectorizer::ComputeImpl(const void* x_data_raw, size_t elem_size, ptrdiff_t row_num, size_t row_size, + bool is_input_string, gsl::span output_data, + std::function&)>& fn_weight) const { + const void* const row_begin = AdvanceElementPtr(x_data_raw, row_num * row_size, elem_size); const void* const row_end = AdvanceElementPtr(row_begin, row_size, elem_size); const auto& impl = *impl_; const auto max_gram_length = impl.max_gram_length_; const auto max_skip_distance = impl.max_skip_count_ + 1; // Convert to distance auto start_ngram_size = impl.min_gram_length_; + size_t output_idx; for (auto skip_distance = 1; skip_distance <= max_skip_distance; ++skip_distance) { auto ngram_start = row_begin; @@ -336,7 +273,7 @@ void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_ } auto ngram_item = ngram_start; - if (X->IsDataTypeString()) { + if (is_input_string) { const std::string* str_item = reinterpret_cast(ngram_item); const StrMap* str_map = &impl.str_map_; for (auto ngram_size = 1; @@ -349,7 +286,8 @@ void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_ break; } if (ngram_size >= start_ngram_size && hit->second->id_ != 0) { - impl.IncrementCount(hit->second->id_, row_num, frequencies); + output_idx = impl.OutputIdToIncrement(hit->second->id_); + fn_weight(output_idx, output_data); } str_map = &hit->second->leafs_; } @@ -360,13 +298,14 @@ void TfIdfVectorizer::ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_ ngram_size <= max_gram_length && ngram_item < ngram_row_end; ++ngram_size, ngram_item = AdvanceElementPtr(ngram_item, skip_distance, elem_size)) { - int64_t val = (X->IsDataType()) ? int64_t{*reinterpret_cast(ngram_item)} : *reinterpret_cast(ngram_item); + int64_t val = (elem_size == 4) ? 
int64_t{*reinterpret_cast(ngram_item)} : *reinterpret_cast(ngram_item); auto hit = int_map->find(val); if (hit == int_map->end()) { break; } if (ngram_size >= start_ngram_size && hit->second->id_ != 0) { - impl.IncrementCount(hit->second->id_, row_num, frequencies); + output_idx = impl.OutputIdToIncrement(hit->second->id_); + fn_weight(output_idx, output_data); } int_map = &hit->second->leafs_; } @@ -412,31 +351,76 @@ Status TfIdfVectorizer::Compute(OpKernelContext* ctx) const { } assert((num_rows * C) == total_items); - // Frequency holder allocate [B..output_size_] - // and init all to zero - std::vector frequencies; - frequencies.resize(num_rows * impl_->output_size_, 0); + const Impl& impl = *impl_; + TensorShapeVector output_dims; + if (B == 0) { + output_dims.push_back(impl.output_size_); + B = 1; // For use in the loops below + } else { + output_dims.push_back(B); + output_dims.push_back(impl.output_size_); + } + TensorShape output_shape(output_dims); + + auto Y = ctx->Output(0, output_shape); + auto output_data = Y->MutableData(); + const bool is_input_string = X->IsDataTypeString(); if (total_items == 0 || - (X->IsDataTypeString() && impl_->str_map_.empty()) || + (is_input_string && impl_->str_map_.empty()) || ((X->IsDataType() || X->IsDataType()) && impl_->int64_map_.empty())) { // TfidfVectorizer may receive an empty input when it follows a Tokenizer // (for example for a string containing only stopwords). // TfidfVectorizer returns a zero tensor of shape // {b_dim, output_size} when b_dim is the number of received observations // and output_size the is the maximum value in ngram_indexes attribute plus 1. - OutputResult(ctx, B, frequencies); + memset(output_data, 0, static_cast(output_shape.Size() * sizeof(float))); return Status::OK(); } - std::function fn = [this, ctx, C, &frequencies](ptrdiff_t row_num) { - ComputeImpl(ctx, row_num, C, frequencies); - }; + auto x_data_raw = ctx->Input(0)->DataRaw(); + const auto elem_size = X->DataType()->Size(); + int32_t num_batches = std::min(concurrency::ThreadPool::DegreeOfParallelism(ctx->GetOperatorThreadPool()) * 2, num_rows); - concurrency::ThreadPool::TryBatchParallelFor(ctx->GetOperatorThreadPool(), num_rows, std::move(fn), 0); + const auto& w = impl.weights_; + std::function&)> fn_weight; - OutputResult(ctx, B, frequencies); + switch (impl.weighting_criteria_) { + case kTF: + fn_weight = [](size_t i, gsl::span& out) { out[i] += 1.0f; }; + break; + case kIDF: + if (!w.empty()) { + fn_weight = [&w](size_t i, gsl::span& out) { out[i] = w[i]; }; + } else { + fn_weight = [](size_t i, gsl::span& out) { out[i] = 1.0f; }; + } + break; + case kTFIDF: + if (!w.empty()) { + fn_weight = [&w](size_t i, gsl::span& out) { out[i] += w[i]; }; + } else { + fn_weight = [](size_t i, gsl::span& out) { out[i] += 1.0f; }; + } + break; + case kNone: // fall-through + default: + assert(false); + } + + std::function fn = [this, C, output_data, x_data_raw, elem_size, + is_input_string, num_batches, num_rows, &fn_weight](ptrdiff_t batch_num) { + // Frequency holder allocate [B..output_size_] and init all to zero. 
+ auto work = concurrency::ThreadPool::PartitionWork(batch_num, num_batches, static_cast(num_rows)); + std::vector frequencies(this->impl_->output_size_); + for (auto row_num = work.start; row_num < work.end; ++row_num) { + auto out = gsl::span(output_data + row_num * this->impl_->output_size_, this->impl_->output_size_); + std::fill(out.begin(), out.end(), 0.0f); + ComputeImpl(x_data_raw, elem_size, row_num, C, is_input_string, out, fn_weight); + } + }; + concurrency::ThreadPool::TrySimpleParallelFor(ctx->GetOperatorThreadPool(), num_batches, std::move(fn)); return Status::OK(); } diff --git a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.h b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.h index 45db40d893231..14488d91c23e9 100644 --- a/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.h +++ b/onnxruntime/core/providers/cpu/nn/tfidfvectorizer.h @@ -19,11 +19,8 @@ class TfIdfVectorizer final : public OpKernel { Status Compute(OpKernelContext* ctx) const override; private: - void ComputeImpl(OpKernelContext* ctx, ptrdiff_t row_num, size_t row_size, - std::vector& frequencies) const; - - // Apply weighing criteria and output - void OutputResult(OpKernelContext* ctx, size_t b_dim, const std::vector& frequences) const; + void ComputeImpl(const void* x_data_raw, size_t elem_size, ptrdiff_t row_num, size_t row_size, bool is_input_string, + gsl::span output_data, std::function&)>& fn_weight) const; struct Impl; std::unique_ptr impl_; From 3ea27c29253aad7c02015e2af6d37dedafe2c9c3 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Tue, 28 Nov 2023 09:03:46 -0800 Subject: [PATCH 4/6] Create a new Nuget Package pipeline for CUDA 12 (#18135) --- .../c-api-noopenmp-packaging-pipelines.yml | 18 +- .../cuda-packaging-pipeline.yml | 175 ++++++++++++++ .../azure-pipelines/linux-gpu-ci-pipeline.yml | 29 ++- .../linux-gpu-tensorrt-ci-pipeline.yml | 28 ++- .../nuget/templates/test_linux.yml | 15 +- .../nuget/templates/test_win.yml | 18 +- .../py-cuda-packaging-pipeline.yml | 2 +- .../stages/nuget-combine-cuda-stage.yml | 228 ++++++++++++++++++ .../nuget-linux-cuda-packaging-stage.yml | 161 +++++++++++++ .../stages/nuget-win-cuda-packaging-stage.yml | 147 +++++++++++ .../jobs/download_win_gpu_library.yml | 1 - .../linux-gpu-tensorrt-packaging-pipeline.yml | 35 ++- .../azure-pipelines/templates/win-ci.yml | 49 +++- .../github/linux/build_cuda_c_api_package.sh | 2 +- .../linux/build_tensorrt_c_api_package.sh | 2 +- .../docker/Dockerfile.manylinux2_28_cuda | 1 + ...ckerfile.package_ubi8_cuda11_8_tensorrt8_6 | 9 +- ...8_6 => Dockerfile.package_ubuntu_2004_gpu} | 18 +- .../inference/x64/default/gpu/Dockerfile | 4 +- 19 files changed, 889 insertions(+), 53 deletions(-) create mode 100644 tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml create mode 100644 tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml create mode 100644 tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml create mode 100644 tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml rename tools/ci_build/github/linux/docker/{Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 => Dockerfile.package_ubuntu_2004_gpu} (50%) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 0eccd71e47f46..67fa78da003a3 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ 
b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -60,6 +60,14 @@ parameters: type: string default: '--use_azure' +- name: CudaVersion + displayName: CUDA version + type: string + default: '11.8' + values: + - 11.8 + - 12.2 + resources: repositories: - repository: onnxruntime-inference-examples # The name used to reference this repository in the checkout step @@ -146,7 +154,13 @@ stages: timeoutInMinutes: 120 pool: 'Onnxruntime-Linux-GPU' variables: - CUDA_VERSION: '11.8' + - name: CUDA_VERSION_MAJOR + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: '11' + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: '12' + - name: CUDA_VERSION + value: ${{ parameters.CudaVersion }} steps: - template: templates/set-version-number-variables-step.yml - template: templates/get-docker-image-steps.yml @@ -154,7 +168,7 @@ stages: Dockerfile: tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile Context: tools/ci_build/github/linux/docker/inference/x64/default/gpu DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" - Repository: onnxruntimecuda11centosbuild + Repository: onnxruntimecuda$(CUDA_VERSION_MAJOR)build - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh workingDirectory: $(Build.SourcesDirectory) diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml new file mode 100644 index 0000000000000..8a9592282cd46 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -0,0 +1,175 @@ +parameters: + - name: RunOnnxRuntimeTests + displayName: Run Tests? + type: boolean + default: true + + - name: UseIncreasedTimeoutForTests + displayName: Increase timeout for tests? Set it to false if you are doing an Onnx Runtime release. + type: boolean + default: false + + - name: DoCompliance + displayName: Run Compliance Tasks? + type: boolean + default: true + + - name: DoEsrp + displayName: Run code sign tasks? Must be true if you are doing an ONNX Runtime release + type: boolean + default: true + + - name: IsReleaseBuild + displayName: Is a release build? Set it to true if you are doing an ONNX Runtime release. + type: boolean + default: false + + - name: PreReleaseVersionSuffixString + displayName: Suffix added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the type of pre-release package. + type: string + values: + - alpha + - beta + - rc + - none + default: none + + - name: PreReleaseVersionSuffixNumber + displayName: Number added to pre-release package version. Only used if IsReleaseBuild is true. Denotes the sequence of a pre-release package. + type: number + default: 0 + + # these 2 parameters are used for debugging. 
+ - name: SpecificArtifact + displayName: Use Specific Artifact (Debugging only) + type: boolean + default: false + + - name: BuildId + displayName: Pipeline BuildId, you could find it in the URL + type: string + default: '0' + + - name: CudaVersion + displayName: CUDA version + type: string + default: '12.2' + values: + - 11.8 + - 12.2 + +variables: + - name: ReleaseVersionSuffix + value: '' + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 + - name: win_trt_home + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0 + - name: win_cuda_home + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: $(Agent.TempDirectory)\v11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: $(Agent.TempDirectory)\v12.2 +resources: + repositories: + - repository: onnxruntime-inference-examples # The name used to reference this repository in the checkout step + type: github + endpoint: ort-examples + name: microsoft/onnxruntime-inference-examples + - repository: manylinux + type: Github + endpoint: Microsoft + name: pypa/manylinux + ref: 5eda9aded5462201e6310105728d33016e637ea7 + +stages: +# Set ReleaseVersionSuffix + - stage: Set_ReleaseVersionSuffix + jobs: + - job: Set_Variables + pool: + vmImage: ubuntu-latest + steps: + - checkout: none + - bash: | + # Do not output ##vso[] commands with `set -x` or they may be parsed again and include a trailing quote. 
+ set +x + if [[ "${{ parameters.IsReleaseBuild }}" = True && "${{ parameters.PreReleaseVersionSuffixString }}" != "none" ]]; then + if [[ "${{ parameters.PreReleaseVersionSuffixNumber }}" -eq 0 ]]; then + echo "##vso[task.setvariable variable=ReleaseVersionSuffix;isOutput=true]-${{ parameters.PreReleaseVersionSuffixString }}" + else + echo "##vso[task.setvariable variable=ReleaseVersionSuffix;isOutput=true]-${{ parameters.PreReleaseVersionSuffixString }}.${{ parameters.PreReleaseVersionSuffixNumber }}" + fi + else + echo "##vso[task.setvariable variable=ReleaseVersionSuffix;isOutput=true]" + fi + name: Set_Release_Version_Suffix + - bash: echo $(ReleaseVersionSuffix) + name: Debug_Release_Version_Suffix + # this is needed for certain artifacts to be published + - stage: Linux_C_API_Packaging_CPU_x64 + dependsOn: [ ] + jobs: + - template: templates/c-api-linux-cpu.yml + parameters: + BaseImage: 'registry.access.redhat.com/ubi8/ubi' + OnnxruntimeArch: 'x64' + OnnxruntimeCFlags: '-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all' + OnnxruntimeCXXFlags: '-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all' + OnnxruntimeNodejsBindingArch: 'x64' + PoolName: 'onnxruntime-Ubuntu2004-AMD-CPU' + PackageJava: false + PackageNodeJS: false + # Nuget Packaging + + - template: stages/nuget-linux-cuda-packaging-stage.yml + parameters: + CudaVersion: ${{ parameters.CudaVersion }} + docker_base_image: ${{ variables.docker_base_image }} + linux_trt_version: ${{ variables.linux_trt_version }} + - template: stages/nuget-win-cuda-packaging-stage.yml + parameters: + RunOnnxRuntimeTests: ${{ parameters.RunOnnxRuntimeTests }} + UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} + CudaVersion: ${{ parameters.CudaVersion }} + win_trt_home: ${{ variables.win_trt_home }} + win_cuda_home: ${{ variables.win_cuda_home }} + - template: stages/nuget-combine-cuda-stage.yml + parameters: + DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + IsReleaseBuild: ${{ parameters.IsReleaseBuild }} + # Testing + ## Windows GPU Testing + - template: nuget/templates/test_win.yml + parameters: + AgentPool: 'onnxruntime-Win2022-GPU-T4' + NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' + ArtifactSuffix: 'GPU' + StageSuffix: 'GPU' + Skipx86Tests: 'true' + CudaVersion: ${{ parameters.CudaVersion }} + ## Linux GPU Testing + - template: nuget/templates/test_linux.yml + parameters: + AgentPool: Onnxruntime-Linux-GPU + ArtifactSuffix: 'GPU' + StageSuffix: 'GPU' + NugetPackageName: 'Microsoft.ML.OnnxRuntime.Gpu' + SpecificArtifact: ${{ parameters.specificArtifact }} + CudaVersion: ${{ parameters.CudaVersion }} + BuildId: ${{ parameters.BuildId }} + +## Win/Linux GPU Combined Publishing +#- template: templates/publish-nuget.yml diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 9e1fae343c84e..0993a81a02249 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -26,7 +26,14 @@ pr: - 'js/web' - 'onnxruntime/core/providers/js' #### end trigger #### - +parameters: + - name: CudaVersion + displayName: CUDA version + type: string + default: '11.8' + values: + - 11.8 + - 12.2 resources: repositories: - repository: manylinux @@ -37,6 +44,17 @@ 
resources: variables: - template: templates/common-variables.yml + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 + + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 jobs: - job: Linux_Build @@ -55,15 +73,14 @@ jobs: - checkout: self clean: true submodules: none - - template: templates/get-docker-image-steps.yml parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda Context: tools/ci_build/github/linux/docker DockerBuildArgs: " --network=host - --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg BASEIMAGE=$(docker_base_image) + --build-arg TRT_VERSION=$(linux_trt_version) --build-arg BUILD_UID=$( id -u ) " Repository: onnxruntimecuda11build @@ -163,8 +180,8 @@ jobs: Context: tools/ci_build/github/linux/docker DockerBuildArgs: " --network=host - --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg BASEIMAGE=$(docker_base_image) + --build-arg TRT_VERSION=$(linux_trt_version) --build-arg BUILD_UID=$( id -u ) " Repository: onnxruntimecuda11build diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 517c8d638c935..4ca11a4d1565b 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -26,7 +26,14 @@ pr: - 'js/web' - 'onnxruntime/core/providers/js' #### end trigger #### - +parameters: + - name: CudaVersion + displayName: CUDA version + type: string + default: '11.8' + values: + - 11.8 + - 12.2 resources: repositories: - repository: manylinux @@ -34,7 +41,17 @@ resources: endpoint: Microsoft name: pypa/manylinux ref: 5eda9aded5462201e6310105728d33016e637ea7 - +variables: + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 jobs: - job: Linux_Build timeoutInMinutes: 180 @@ -61,8 +78,8 @@ jobs: Context: tools/ci_build/github/linux/docker DockerBuildArgs: " --network=host - --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg BASEIMAGE=${{ variables.docker_base_image }} + --build-arg TRT_VERSION=${{ variables.linux_trt_version }} --build-arg BUILD_UID=$( id -u ) " Repository: onnxruntimetensorrt86gpubuild @@ -99,7 +116,8 @@ jobs: --build_shared_lib \ --parallel \ --build_wheel \ - --enable_onnx_tests --use_cuda --cuda_version=11.8 --cuda_home=/usr/local/cuda-11.8 --cudnn_home=/usr/local/cuda-11.8 \ + --enable_onnx_tests \ + --use_cuda --cuda_home=/usr/local/cuda-${{ parameters.CudaVersion }} --cudnn_home=/usr/local/cuda-${{ parameters.CudaVersion }} \ --enable_pybind --build_java \ --use_tensorrt --tensorrt_home /usr \ --cmake_extra_defines CMAKE_CUDA_ARCHITECTURES=75 \ diff --git 
a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml index 64fa29f06553e..1e609b052b8d3 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_linux.yml @@ -7,7 +7,7 @@ parameters: SpecificArtifact: false CustomOpArtifactName: 'onnxruntime-linux-x64' BuildId: '0' - + CudaVersion: '11.8' stages: - stage: NuGet_Test_Linux_${{ parameters.StageSuffix }} dependsOn: @@ -54,9 +54,18 @@ stages: - ${{if contains(parameters.StageSuffix , 'GPU') }}: - template: ../../templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu Context: tools/ci_build/github/linux/docker/ - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" + ${{ if eq(parameters.CudaVersion, '12.2') }}: + DockerBuildArgs: " + --build-arg BASEIMAGE=nvidia/cuda:12.2.2-cudnn8-devel-ubuntu20.04 + --build-arg TRT_VERSION=8.6.1.6-1+cuda12.0 + --build-arg BUILD_UID=$( id -u ) + " + ${{ else }}: + DockerBuildArgs: " + --build-arg BUILD_UID=$( id -u ) + " Repository: onnxruntimepackagestest - bash: | docker run --rm \ diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml index 0b9ded10ddd3e..4f693d45cb76f 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/test_win.yml @@ -8,6 +8,7 @@ parameters: # the parent pipeline. TestDataArtifactSuffix: '' Skipx86Tests: 'false' + CudaVersion: '' stages: - stage: NuGet_Test_Win_${{ parameters.StageSuffix }} @@ -27,6 +28,10 @@ stages: value: 'ON' - name: runCodesignValidationInjection value: false + - name: CUDA_MODULE_LOADINGL + value: 'LAZY' + - name: GRADLE_OPTS + value: '-Dorg.gradle.daemon=false' steps: - task: UsePythonVersion@0 @@ -39,13 +44,12 @@ stages: displayName: Use Nuget 5.7.0 inputs: versionSpec: 5.7.0 - - - task: BatchScript@1 - displayName: 'setup env' - inputs: - filename: '$(Build.SourcesDirectory)\tools\ci_build\github\windows\setup_env_gpu.bat' - modifyEnvironment: true - workingFolder: '$(Build.BinariesDirectory)' + - ${{ if ne( parameters.CudaVersion, '') }}: + - template: ../../templates/jobs/download_win_gpu_library.yml + parameters: + DownloadCUDA: true + DownloadTRT: true + CudaVersion: ${{ parameters.CudaVersion }} - task: BatchScript@1 displayName: 'Setup Visual Studio env vars' diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml index aee42d3675087..91179d141498b 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml @@ -31,7 +31,7 @@ resources: ref: 5eda9aded5462201e6310105728d33016e637ea7 stages: - - template: stages/py-cuda-packaging-stage.yml + - template: stages/py-nuget-combine-cuda-stage.yml parameters: enable_linux_gpu: ${{ parameters.enable_linux_gpu }} enable_windows_gpu: ${{ parameters.enable_windows_gpu }} diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml new file mode 100644 index 0000000000000..b69e75856c39f --- /dev/null +++ 
b/tools/ci_build/github/azure-pipelines/stages/nuget-combine-cuda-stage.yml @@ -0,0 +1,228 @@ +parameters: +- name: DoCompliance + type: boolean + default: true + +- name: DoEsrp + type: boolean + default: true + +- name: IsReleaseBuild + type: boolean + default: false + +stages: +######## Nuget ######## +# Win/Linux CUDA Combined packaging +- stage: NuGet_Packaging_GPU + dependsOn: + - Set_ReleaseVersionSuffix + - Windows_Packaging_gpu + - Windows_Packaging_tensorrt + - Linux_C_API_Packaging_CPU_x64 + - Linux_C_API_Packaging_GPU_x64 + - Linux_C_API_Packaging_GPU_TensorRT_x64 + condition: succeeded() + jobs: + - job: + workspace: + clean: all + # we need to use the 2022 pool to create the nuget package with both pre-net6+Xamarin and net6 targets. + # VS2019 has no support for net6 and we need to use msbuild (from the VS install) to do the packing + pool: 'Azure-Pipelines-EO-Windows2022-aiinfra' + variables: + breakCodesignValidationInjection: ${{ parameters.DoEsrp }} + ReleaseVersionSuffix: $[stageDependencies.Setup.Set_Variables.outputs['Set_Release_Version_Suffix.ReleaseVersionSuffix']] + + steps: + - checkout: self + submodules: true + # Download the all artifacts + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact from Linux_C_API_Packaging_GPU_x64 Stage' + inputs: + artifactName: 'onnxruntime-win-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact from Linux_C_API_Packaging_GPU_TensorRT_x64 Stage' + inputs: + artifactName: 'onnxruntime-win-x64-tensorrt' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact from Windows_Packaging_gpu Stage' + inputs: + artifactName: 'onnxruntime-linux-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact from Windows_Packaging_tensorrt Stage' + inputs: + artifactName: 'onnxruntime-linux-x64-tensorrt' + targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - protoc from Windows_Packaging_(cpu|gpu) Stage' + inputs: + artifactName: 'drop-extra' + targetPath: '$(Build.BinariesDirectory)/extra-artifact' + + # Reconstruct the build dir + - task: PowerShell@2 + displayName: 'PS: Extract nuget files gpu' + inputs: + targetType: filePath + filePath: $(Build.SourcesDirectory)\tools\ci_build\github\windows\extract_nuget_files_gpu.ps1 + + - script: | + dir + workingDirectory: '$(Build.BinariesDirectory)/nuget-artifact' + displayName: 'List artifacts' + + - script: | + mklink /D /J models C:\local\models + workingDirectory: '$(Build.BinariesDirectory)' + displayName: 'Create models link' + + - task: NuGetToolInstaller@0 + displayName: Use Nuget 6.2.1 + inputs: + versionSpec: 6.2.1 + + - task: PowerShell@2 + displayName: Install .NET 6 workloads + inputs: + targetType: 'inline' + script: | + dotnet workload install android ios macos + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: PowerShell@2 + displayName: Build .NET 6 targets using dotnet + inputs: + targetType: 'inline' + # we don't specify 'Any CPU' as the platform here because if we do it gets added to the output path + # e.g. csharp\src\Microsoft.ML.OnnxRuntime\bin\Any CPU\RelWithDebInfo\net6.0-ios\ + # which is inconsistent with the msbuild output path for the pre-.net6 targets + # e.g. 
csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo\monoandroid11.0 + # and makes it harder to do the packing + # + # 'Any CPU' is the default (first 'mixed' platform specified in the csproj) so this should be fine. + script: | + dotnet build .\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj -p:SelectedTargets=Net6 -p:Configuration=RelWithDebInfo -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix) + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Restore NuGet Packages and create project.assets.json for pre-.net6 targets' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + platform: 'Any CPU' + configuration: RelWithDebInfo + msbuildArguments: '-t:restore -p:SelectedTargets=PreNet6 -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu"' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Build C# for pre-.net6 targets' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + configuration: RelWithDebInfo + platform: 'Any CPU' + msbuildArguments: '-p:SelectedTargets=PreNet6 -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId="Microsoft.ML.OnnxRuntime.Gpu" -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - template: ../templates/win-esrp-dll.yml + parameters: + FolderPath: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' + DisplayName: 'ESRP - Sign C# dlls' + DoEsrp: ${{ parameters.DoEsrp }} + + - task: MSBuild@1 + displayName: Update projects.assets.json with combined list of all target frameworks + inputs: + solution: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\Microsoft.ML.OnnxRuntime.csproj' + platform: 'Any CPU' + configuration: RelWithDebInfo + msbuildArguments: '-t:restore -p:SelectedTargets=All -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: MSBuild@1 + displayName: 'Build Nuget Packages' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.proj' + configuration: RelWithDebInfo + platform: 'Any CPU' + msbuildArguments: '-t:CreatePackage -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu -p:IsReleaseBuild=${{ parameters.IsReleaseBuild }} -p:ReleaseVersionSuffix=$(ReleaseVersionSuffix)' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + - task: BatchScript@1 + displayName: 'Add TensorRT header file to the native nuGet package' + inputs: + filename: $(Build.SourcesDirectory)\tools\ci_build\github\windows\bundle_nuget_with_native_headers.bat + workingFolder: $(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo + + - task: CopyFiles@2 + displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' + Contents: '*.snupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: CopyFiles@2 + displayName: 'Copy nuget packages to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' + Contents: '*.nupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: CopyFiles@2 + displayName: 'Copy 
nuget packages to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.SourcesDirectory)\csharp\src\Microsoft.ML.OnnxRuntime\bin\RelWithDebInfo' + Contents: '*.nupkg' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - template: ../templates/esrp_nuget.yml + parameters: + DisplayName: 'ESRP - sign NuGet package' + FolderPath: '$(Build.ArtifactStagingDirectory)' + DoEsrp: ${{ parameters.DoEsrp }} + + - template: ../templates/validate-package.yml + parameters: + PackageType: 'nuget' + PackagePath: '$(Build.ArtifactStagingDirectory)' + PackageName: 'Microsoft.ML.OnnxRuntime.*nupkg' + PlatformsSupported: 'win-x64,linux-x64' + VerifyNugetSigning: false + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline NuGet Artifact' + inputs: + artifactName: 'drop-signed-nuget-GPU' + targetPath: '$(Build.ArtifactStagingDirectory)' + + + - task: MSBuild@1 + displayName: 'Clean C#' + inputs: + solution: '$(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln' + platform: 'Any CPU' + configuration: RelWithDebInfo + msbuildArguments: '-t:Clean -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu' + workingDirectory: '$(Build.SourcesDirectory)\csharp' + + + - task: RoslynAnalyzers@2 + displayName: 'Run Roslyn Analyzers' + inputs: + userProvideBuildInfo: msBuildInfo + msBuildCommandline: '"C:\Program Files\Microsoft Visual Studio\2022\Enterprise\MSBuild\Current\Bin\msbuild.exe" $(Build.SourcesDirectory)\csharp\OnnxRuntime.CSharp.sln -p:configuration="RelWithDebInfo" -p:Platform="Any CPU" -p:OnnxRuntimeBuildDirectory="$(Build.BinariesDirectory)" -p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu' + condition: and(succeeded(), eq('${{ parameters.DoCompliance }}', true)) + + - template: ../templates/component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml new file mode 100644 index 0000000000000..140a377ca72a3 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-linux-cuda-packaging-stage.yml @@ -0,0 +1,161 @@ +parameters: +- name: CudaVersion + type: string + default: '11.8' +- name: docker_base_image + type: string +- name: linux_trt_version + type: string + +stages: + # Linux CUDA without TensorRT Packaging +- stage: Linux_C_API_Packaging_GPU_x64 + dependsOn: [] + jobs: + - job: + workspace: + clean: all + timeoutInMinutes: 120 + pool: 'Onnxruntime-Linux-GPU' + variables: + - name: CUDA_VERSION_MAJOR + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: '11' + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: '12' + - name: CUDA_VERSION + value: ${{ parameters.CudaVersion }} + steps: + - template: ../templates/set-version-number-variables-step.yml + - template: ../templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile + Context: tools/ci_build/github/linux/docker/inference/x64/default/gpu + DockerBuildArgs: " + --build-arg BUILD_UID=$( id -u ) + --build-arg BASEIMAGE=${{ parameters.docker_base_image }} + " + Repository: onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}build + + - script: 
$(Build.SourcesDirectory)/tools/ci_build/github/linux/build_cuda_c_api_package.sh + workingDirectory: $(Build.SourcesDirectory) + displayName: 'Build and Test' + + - template: ../templates/c-api-artifacts-package-and-publish-steps-posix.yml + parameters: + buildConfig: 'Release' + artifactName: 'onnxruntime-linux-x64-cuda-$(OnnxRuntimeVersion)' + artifactNameNoVersionString: 'onnxruntime-linux-x64-cuda' + libraryName: 'libonnxruntime.so.$(OnnxRuntimeVersion)' + + - template: ../templates/component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + - template: ../templates/clean-agent-build-directory-step.yml +# Linux CUDA with TensorRT Packaging +- template: ../templates/linux-gpu-tensorrt-packaging-pipeline.yml + parameters: + artifactName: 'onnxruntime-linux-x64-tensorrt-$(OnnxRuntimeVersion)' + artifactNameNoVersionString: 'onnxruntime-linux-x64-tensorrt' + buildJava: false + buildJavaOption: '--build_java' + buildNodejs: false + buildNodejsOption: '--build_nodejs' + CudaVersion: ${{ parameters.CudaVersion }} +# Linux CUDA Combined Testing and Publishing +- stage: Linux_Packaging_combined_GPU + dependsOn: + - Linux_C_API_Packaging_GPU_x64 + - Linux_C_API_Packaging_GPU_TensorRT_x64 + condition: succeeded() + jobs: + - job: + workspace: + clean: all + pool: 'Onnxruntime-Linux-GPU' + + steps: + - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime + submodules: false + - checkout: onnxruntime-inference-examples # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime-inference-examples + submodules: false + - checkout: manylinux # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/manylinux + submodules: false + + - script: | + set -e -x + cd $(Build.SourcesDirectory) + mv manylinux onnxruntime + ls + + - template: ../templates/with-container-registry-steps.yml + parameters: + Steps: + - script: | + tools/ci_build/get_docker_image.py \ + --dockerfile tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda \ + --context tools/ci_build/github/linux/docker \ + --docker-build-args "--network=host --build-arg BASEIMAGE=${{ parameters.docker_base_image }} --build-arg TRT_VERSION=${{ parameters.linux_trt_version }} --build-arg BUILD_UID=$( id -u )" \ + --container-registry onnxruntimebuildcache \ + --multiple_repos \ + --repository onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build + displayName: "Get onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build image for tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda" + workingDirectory: $(Build.SourcesDirectory)/onnxruntime + ContainerRegistry: onnxruntimebuildcache + + - template: ../templates/set-version-number-variables-step.yml + parameters: + versionFileDirectory: '$(Build.SourcesDirectory)/onnxruntime' + workingDirectory: '$(Build.SourcesDirectory)/onnxruntime' + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - Combined GPU' + inputs: + artifactName: 'onnxruntime-linux-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/tgz-artifacts' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - Combined GPU' + inputs: + artifactName: 'onnxruntime-linux-x64-tensorrt' + targetPath: '$(Build.BinariesDirectory)/tgz-artifacts' + + - task: ShellScript@2 + displayName: 'Shell Script' + inputs: + scriptPath: 'onnxruntime/tools/ci_build/github/linux/extract_and_bundle_gpu_package.sh' + args: '-a 
$(Build.BinariesDirectory)/tgz-artifacts' + workingDirectory: '$(Build.BinariesDirectory)/tgz-artifacts' + + - task: ArchiveFiles@2 + inputs: + rootFolderOrFile: '$(Build.BinariesDirectory)/tgz-artifacts/onnxruntime-linux-x64-gpu' + includeRootFolder: false + archiveType: 'tar' # Options: zip, 7z, tar, wim + tarCompression: 'gz' + archiveFile: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' + replaceExistingArchive: true + + - template: ../templates/validate-package.yml + parameters: + PackageType: 'tarball' + PackagePath: '$(Build.ArtifactStagingDirectory)' + PackageName: 'onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' + ScriptPath: '$(Build.SourcesDirectory)/onnxruntime/tools/nuget/validate_package.py' + PlatformsSupported: 'linux-x64' + VerifyNugetSigning: false + workingDirectory: '$(Build.ArtifactStagingDirectory)' + + + - task: CmdLine@2 + displayName: 'Test C API application for GPU package' + inputs: + script: | + docker run --gpus all -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e NVIDIA_VISIBLE_DEVICES=all --rm --volume $(Build.SourcesDirectory):/src_dir \ + --volume $(Build.ArtifactStagingDirectory):/artifact_src -e NIGHTLY_BUILD onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build \ + /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet/run_capi_application.sh -o /src_dir/onnxruntime -p /artifact_src/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz -w /src_dir/onnxruntime-inference-examples/c_cxx/squeezenet + workingDirectory: '$(Build.ArtifactStagingDirectory)' + + - task: PublishPipelineArtifact@1 + inputs: + targetPath: '$(Build.ArtifactStagingDirectory)/onnxruntime-linux-x64-gpu-$(OnnxRuntimeVersion).tgz' + artifactName: 'onnxruntime-linux-x64-gpu' diff --git a/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml new file mode 100644 index 0000000000000..3fb653c6b4405 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/stages/nuget-win-cuda-packaging-stage.yml @@ -0,0 +1,147 @@ +parameters: +- name: RunOnnxRuntimeTests + type: boolean + default: true + +- name: UseIncreasedTimeoutForTests + type: boolean + default: false + +- name: DoCompliance + type: boolean + default: true + +- name: DoEsrp + type: boolean + default: true + +- name: CudaVersion + type: string + default: '11.8' +- name: win_cuda_home + type: string +- name: win_trt_home + type: string + +stages: +# Windows CUDA without TensorRT Packaging +- template: ../templates/win-ci.yml + parameters: + ort_build_pool_name: 'onnxruntime-Win2022-GPU-T4' + DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + stage_name_suffix: gpu + buildArch: x64 + msbuildPlatform: x64 + packageName: x64-cuda + CudaVersion: ${{ parameters.CudaVersion }} + buildparameter: --use_cuda --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + runTests: ${{ parameters.RunOnnxRuntimeTests }} + buildJava: false + java_artifact_id: onnxruntime_gpu + PublishProtoc: true +# Windows CUDA with TensorRT Packaging +- template: ../templates/win-ci.yml + parameters: + ort_build_pool_name: 'onnxruntime-Win2022-GPU-T4' 
+ DoCompliance: ${{ parameters.DoCompliance }} + DoEsrp: ${{ parameters.DoEsrp }} + stage_name_suffix: tensorrt + buildArch: x64 + msbuildPlatform: x64 + CudaVersion: ${{ parameters.CudaVersion }} + packageName: x64-tensorrt + buildparameter: --use_tensorrt --tensorrt_home=${{ parameters.win_trt_home }} --cuda_home=${{ parameters.win_cuda_home }} --enable_onnx_tests --enable_wcos --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80" + runTests: ${{ parameters.RunOnnxRuntimeTests }} + buildJava: false + java_artifact_id: onnxruntime_gpu + UseIncreasedTimeoutForTests: ${{ parameters.UseIncreasedTimeoutForTests }} + +# Windows CUDA Combined Testing and Publishing +- stage: Windows_Packaging_combined_GPU + dependsOn: + - Windows_Packaging_gpu + - Windows_Packaging_tensorrt + condition: succeeded() + + jobs: + - job: + workspace: + clean: all + pool: 'onnxruntime-Win2022-GPU-T4' + variables: + CUDA_MODULE_LOADINGL: 'LAZY' + GRADLE_OPTS: '-Dorg.gradle.daemon=false' + steps: + - checkout: self # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime + - checkout: onnxruntime-inference-examples # due to checkout multiple repos, the root directory is $(Build.SourcesDirectory)/onnxruntime-inference-examples + submodules: false + - script: dir $(Build.SourcesDirectory) + - template: ../templates/jobs/download_win_gpu_library.yml + parameters: + DownloadCUDA: true + DownloadTRT: true + CudaVersion: ${{ parameters.CudaVersion }} + + - template: ../templates/set-version-number-variables-step.yml + parameters: + versionFileDirectory: '$(Build.SourcesDirectory)\onnxruntime' + workingDirectory: '$(Build.SourcesDirectory)\onnxruntime' + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - onnxruntime-win-x64-cuda' + inputs: + artifactName: 'onnxruntime-win-x64-cuda' + targetPath: '$(Build.BinariesDirectory)/zip-artifacts' + + - task: DownloadPipelineArtifact@2 + displayName: 'Download Pipeline Artifact - onnxruntime-win-x64-tensorrt' + inputs: + artifactName: 'onnxruntime-win-x64-tensorrt' + targetPath: '$(Build.BinariesDirectory)/zip-artifacts' + + - task: PowerShell@2 + displayName: 'PowerShell Script' + inputs: + targetType: filePath + filePath: $(Build.SourcesDirectory)\onnxruntime\tools\ci_build\github\windows\extract_zip_files_gpu.ps1 + + - script: | + dir + workingDirectory: '$(Build.BinariesDirectory)/zip-artifacts' + displayName: 'List artifacts' + + - task: BatchScript@1 + displayName: 'Bundle CUDA/TRT EP binaries' + inputs: + filename: $(Build.SourcesDirectory)\onnxruntime\tools\ci_build\github\windows\bundle_dlls_gpu.bat + workingFolder: $(Build.BinariesDirectory)\zip-artifacts + + - task: CopyFiles@2 + displayName: 'Copy zip file to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)\zip-artifacts' + Contents: 'onnxruntime-win-x64-gpu-*.zip' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - template: ../templates/validate-package.yml + parameters: + PackageType: 'zip' + PackagePath: '$(Build.ArtifactStagingDirectory)' + PackageName: 'onnxruntime-win-x64-gpu-$(OnnxRuntimeVersion).zip' + ScriptPath: '$(Build.SourcesDirectory)\onnxruntime\tools\nuget\validate_package.py' + PlatformsSupported: 'win-x64' + VerifyNugetSigning: false + workingDirectory: '$(Build.ArtifactStagingDirectory)' + + - task: BatchScript@1 + displayName: 'Test C API application for GPU package' + inputs: + filename: 
$(Build.SourcesDirectory)\onnxruntime-inference-examples\c_cxx\squeezenet\run_capi_application.bat + arguments: $(Build.SourcesDirectory)\onnxruntime $(Build.ArtifactStagingDirectory)\onnxruntime-win-x64-gpu-$(OnnxRuntimeVersion).zip $(Build.SourcesDirectory)\onnxruntime-inference-examples\c_cxx\squeezenet + workingFolder: '$(Build.ArtifactStagingDirectory)' + + - task: PublishPipelineArtifact@0 + displayName: 'Publish Pipeline Combined GPU Package Artifact' + inputs: + artifactName: 'onnxruntime-win-x64-gpu' + targetPath: '$(Build.ArtifactStagingDirectory)' \ No newline at end of file diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml index ff7f0957e94ba..b7ae9ffa3c219 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_gpu_library.yml @@ -13,7 +13,6 @@ parameters: - 12.2 steps: - - ${{ if eq(parameters.DownloadCUDA, true) }}: - powershell: | azcopy.exe cp --recursive https://lotusscus.blob.core.windows.net/models/cuda_sdk/v${{ parameters.CudaVersion }} $(Agent.TempDirectory) diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml index 85562d7758ab2..7693e8f2cd21c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-gpu-tensorrt-packaging-pipeline.yml @@ -23,12 +23,33 @@ parameters: type: string default: '' +- name: CudaVersion + displayName: CUDA version + type: string + default: '11.8' + values: + - 11.8 + - 12.2 + + + # We only have CUDA/TRT on x64. We do not have a build for CUDA/TRT for ARM64. 
# Therefore this file does not have an `OnnxruntimeNodejsBindingArch` parameter stages: - stage: Linux_C_API_Packaging_GPU_TensorRT_x64 dependsOn: [] + variables: + - name: linux_trt_version + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: 8.6.1.6-1.cuda12.0 + - name: docker_base_image + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 jobs: - job: dependsOn: [] @@ -37,7 +58,13 @@ stages: timeoutInMinutes: 180 pool: 'Onnxruntime-Linux-GPU' variables: - CUDA_VERSION: '11.8' + - name: CUDA_VERSION_MAJOR + ${{ if eq(parameters.CudaVersion, '11.8') }}: + value: '11' + ${{ if eq(parameters.CudaVersion, '12.2') }}: + value: '12' + - name: CUDA_VERSION + value: ${{ parameters.CudaVersion }} steps: - checkout: self clean: true @@ -48,11 +75,11 @@ stages: Context: tools/ci_build/github/linux/docker DockerBuildArgs: " --network=host - --build-arg BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - --build-arg TRT_VERSION=8.6.1.6-1.cuda11.8 + --build-arg BASEIMAGE=${{ variables.docker_base_image }} + --build-arg TRT_VERSION=${{ variables.linux_trt_version }} --build-arg BUILD_UID=$( id -u ) " - Repository: onnxruntimecuda118xtrt86build + Repository: onnxruntimecuda${{ variables.CUDA_VERSION_MAJOR }}xtrt86build - template: set-version-number-variables-step.yml - script: $(Build.SourcesDirectory)/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index 8d28b4ce580b4..0fb6966c141db 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -11,6 +11,7 @@ parameters: - name: EnvSetupScript type: string + default: '' - name: buildArch type: string @@ -63,11 +64,24 @@ parameters: type: boolean default: false +- name: PublishProtoc + type: boolean + default: false + +- name: CudaVersion + type: string + default: '11.8' + values: + - 11.8 + - 12.2 + stages: - stage: Windows_Packaging_${{ parameters.stage_name_suffix }} dependsOn: [] variables: + GRADLE_OPTS: '-Dorg.gradle.daemon=false' VSGenerator: 'Visual Studio 17 2022' + CUDA_MODULE_LOADING: 'LAZY' jobs: - job: workspace: @@ -102,12 +116,26 @@ stages: condition: and(succeeded(), eq('${{ parameters.buildNodejs}}', true)) inputs: versionSpec: '18.x' + - ${{ if ne(parameters.EnvSetupScript, '') }}: + - template: jobs/set-winenv.yml + parameters: + EnvSetupScript: ${{ parameters.EnvSetupScript }} + ${{ if contains(parameters.buildparameter, 'use_cuda') }}: + DownloadCUDA: true - - template: jobs/set-winenv.yml - parameters: - EnvSetupScript: ${{ parameters.EnvSetupScript }} - ${{ if contains(parameters.buildparameter, 'use_cuda') }}: - DownloadCUDA: true + - ${{ if eq(parameters.EnvSetupScript, '') }}: + - template: jobs/download_win_gpu_library.yml + parameters: + CudaVersion: ${{ parameters.CudaVersion }} + ${{ if contains(parameters.buildparameter, 'use_cuda') }}: + DownloadCUDA: true + ${{ if contains(parameters.buildparameter, 'use_tensorrt') }}: + DownloadCUDA: true + DownloadTRT: true + - powershell: | + Write-Host "##vso[task.prependpath]C:\Program Files (x86)\dotnet" + displayName: 'Append dotnet x86 Directory to PATH' + condition: and(succeeded(), eq('${{ parameters.buildArch}}', 'x86')) - template: download-deps.yml 
@@ -178,9 +206,11 @@ stages: artifactName: 'drop-onnxruntime-nodejs-win-${{ parameters.packageName }}' DoEsrp: ${{ parameters.DoEsrp }} - #Upload protoc.exe, which will be used in nuget build for generating C# files + # Upload protoc.exe, which will be used in nuget build for generating C# files + # TODO: We need to make this step independent of the packageName, so that it can be used in test_win.yml - task: PublishPipelineArtifact@1 - condition: and(succeeded(), eq('${{ parameters.packageName}}', 'x64')) + displayName: Publish protoc as drop-extra + condition: and(succeeded(), or(eq('${{ parameters.packageName}}', 'x64'), eq('${{ parameters.PublishProtoc}}', true))) inputs: targetPath: '$(Build.BinariesDirectory)\RelWithDebInfo\installed\bin\protoc.exe' artifactName: 'drop-extra${{ parameters.artifact_name_suffix }}' @@ -194,9 +224,10 @@ stages: Contents: 'custom_op_library.dll' TargetFolder: '$(Build.ArtifactStagingDirectory)/testdata' - #To be used in test_win.yml + #To be used in test_win. + # TODO: Do we need to publish protoc twice? - task: PublishPipelineArtifact@1 - condition: and(succeeded(), eq('${{ parameters.packageName}}', 'x64')) + condition: and(succeeded(), or(eq('${{ parameters.packageName}}', 'x64'), eq('${{ parameters.PublishProtoc}}', true))) inputs: targetPath: '$(Build.BinariesDirectory)\RelWithDebInfo\installed\bin\protoc.exe' artifactName: 'drop-nuget${{ parameters.artifact_name_suffix }}' diff --git a/tools/ci_build/github/linux/build_cuda_c_api_package.sh b/tools/ci_build/github/linux/build_cuda_c_api_package.sh index 5cd1c8c243050..2ec8bc82ae048 100755 --- a/tools/ci_build/github/linux/build_cuda_c_api_package.sh +++ b/tools/ci_build/github/linux/build_cuda_c_api_package.sh @@ -4,7 +4,7 @@ export CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protect export CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" docker run --gpus all -e CFLAGS -e CXXFLAGS -e NVIDIA_VISIBLE_DEVICES=all --rm --volume \ $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ ---volume /data/models:/build/models:ro --volume /data/onnx:/data/onnx:ro -e NIGHTLY_BUILD onnxruntimecuda11centosbuild \ +--volume /data/models:/build/models:ro --volume /data/onnx:/data/onnx:ro -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}build \ /usr/bin/python3.9 /onnxruntime_src/tools/ci_build/build.py --build_java --build_nodejs --build_dir /build --config Release \ --skip_submodule_sync --parallel --build_shared_lib --use_cuda --cuda_version=$CUDA_VERSION \ --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr/local/cuda-$CUDA_VERSION \ diff --git a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh index 18a32e3599391..5bf6a69170074 100755 --- a/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh +++ b/tools/ci_build/github/linux/build_tensorrt_c_api_package.sh @@ -4,6 +4,6 @@ export CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protect export CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" mkdir -p $HOME/.onnx docker run --gpus all -e CFLAGS -e CXXFLAGS -e NVIDIA_VISIBLE_DEVICES=all --rm --volume /data/onnx:/data/onnx:ro --volume $BUILD_SOURCESDIRECTORY:/onnxruntime_src --volume $BUILD_BINARIESDIRECTORY:/build \ ---volume /data/models:/build/models:ro --volume 
$HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda118xtrt86build \ +--volume /data/models:/build/models:ro --volume $HOME/.onnx:/home/onnxruntimedev/.onnx -e NIGHTLY_BUILD onnxruntimecuda${CUDA_VERSION_MAJOR}xtrt86build \ /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py --build_dir /build --config Release \ --skip_submodule_sync --parallel --build_shared_lib --build_java --build_nodejs --use_tensorrt --cuda_version=$CUDA_VERSION --cuda_home=/usr/local/cuda-$CUDA_VERSION --cudnn_home=/usr --tensorrt_home=/usr --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=60;61;70;75;80' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda index d4aa9b269095f..8f265b208cd47 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda @@ -8,6 +8,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 ARG DEVTOOLSET_ROOTPATH=/usr ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64 ARG PREPEND_PATH=/usr/local/cuda/binet +ARG TRT_VERSION=8.6.1.6-1.cuda11.8 #Build manylinux docker image begin FROM $BASEIMAGE AS runtime_base diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 index bbdb411b790a0..8ef8e05b8ac77 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda11_8_tensorrt8_6 @@ -5,8 +5,10 @@ # Dockerfile to Test ONNX Runtime on UBI8 with CUDA 11.8 and TensorRT 8.6 # Build base image with required system packages -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubi8 AS base - +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 +ARG TRT_VERSION=8.6.1.6-1.cuda11.8 +FROM $BASEIMAGE AS base +ARG TRT_VERSION ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} RUN dnf install -y bash wget &&\ @@ -26,8 +28,7 @@ RUN pip3 install setuptools>=68.2.2 # Install TensorRT RUN dnf install -y libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-lean8 libnvinfer-vc-plugin8 libnvinfer-dispatch8 -RUN v="8.6.1.6-1+cuda11.8" &&\ - dnf downgrade -y libnvinfer8-${v} libnvinfer8-${v} libnvonnxparsers8-${v} libnvparsers8-${v} libnvinfer-plugin8-${v} libnvinfer-lean8-${v} libnvinfer-vc-plugin8-${v} libnvinfer-dispatch8-${v} &&\ +RUN dnf downgrade -y libnvinfer8-${TRT_VERSION} libnvinfer8-${TRT_VERSION} libnvonnxparsers8-${TRT_VERSION} libnvparsers8-${TRT_VERSION} libnvinfer-plugin8-${TRT_VERSION} libnvinfer-lean8-${TRT_VERSION} libnvinfer-vc-plugin8-${TRT_VERSION} libnvinfer-dispatch8-${TRT_VERSION} &&\ dnf install -y dnf-plugin-versionlock &&\ dnf versionlock libnvinfer8 libnvonnxparsers8 libnvparsers8 libnvinfer-plugin8 libnvinfer-lean8 libnvinfer-vc-plugin8 libnvinfer-dispatch8 RUN dnf clean dbcache diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu similarity index 50% rename from tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 rename to tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu index 83a974469234f..9b9dc9ecae822 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_cuda11_8_tensorrt8_6 +++ 
b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu @@ -5,11 +5,16 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 AS base - +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +ARG TRT_VERSION=8.6.1.6-1+cuda11.8 +ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 +FROM $BASEIMAGE AS base +ARG TRT_VERSION ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} ENV DEBIAN_FRONTEND=noninteractive +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG}:${LD_LIBRARY_PATH} + RUN apt-get update &&\ apt-get install -y git bash wget @@ -24,12 +29,11 @@ RUN apt-get install -y --no-install-recommends \ RUN pip install --upgrade pip # Install TensorRT -RUN v="8.6.1.6-1+cuda11.8" &&\ - apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ apt-get update &&\ - apt-get install -y libnvinfer8=${v} libnvonnxparsers8=${v} libnvparsers8=${v} libnvinfer-plugin8=${v} libnvinfer-lean8=${v} libnvinfer-vc-plugin8=${v} libnvinfer-dispatch8=${v}\ - libnvinfer-headers-dev=${v} libnvinfer-headers-plugin-dev=${v} libnvinfer-dev=${v} libnvonnxparsers-dev=${v} libnvparsers-dev=${v} libnvinfer-plugin-dev=${v} libnvinfer-lean-dev=${v} libnvinfer-vc-plugin-dev=${v} libnvinfer-dispatch-dev=${v}\ - python3-libnvinfer=${v} libnvinfer-samples=${v} tensorrt-dev=${v} tensorrt-libs=${v} + apt-get install -y libnvinfer8=${TRT_VERSION} libnvonnxparsers8=${TRT_VERSION} libnvparsers8=${TRT_VERSION} libnvinfer-plugin8=${TRT_VERSION} libnvinfer-lean8=${TRT_VERSION} libnvinfer-vc-plugin8=${TRT_VERSION} libnvinfer-dispatch8=${TRT_VERSION}\ + libnvinfer-headers-dev=${TRT_VERSION} libnvinfer-headers-plugin-dev=${TRT_VERSION} libnvinfer-dev=${TRT_VERSION} libnvonnxparsers-dev=${TRT_VERSION} libnvparsers-dev=${TRT_VERSION} libnvinfer-plugin-dev=${TRT_VERSION} libnvinfer-lean-dev=${TRT_VERSION} libnvinfer-vc-plugin-dev=${TRT_VERSION} libnvinfer-dispatch-dev=${TRT_VERSION}\ + python3-libnvinfer=${TRT_VERSION} libnvinfer-samples=${TRT_VERSION} tensorrt-dev=${TRT_VERSION} tensorrt-libs=${TRT_VERSION} ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile index 318791072f46d..b1ff40e8effef 100644 --- a/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x64/default/gpu/Dockerfile @@ -2,8 +2,8 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -FROM nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 +FROM $BASEIMAGE ENV PATH /usr/lib/jvm/msopenjdk-11/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 From a6d872640764ea50ec460f7a717e5b369921f8b4 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 29 Nov 2023 01:04:25 +0800 Subject: [PATCH 5/6] Update ADO windows image to custom image (#18598) ### Description Update Azure-Pipelines-EO-Windows2022-aiinfra to onnxruntime-win-CPU-2022 in Nuget_Package_CPU. 
To make the debugging easier, use flex-downloadPipelineArtifact ### Motivation and Context Azure-Pipelines-EO-Windows2022-aiinfra is using 1ES window-latest image. The pipeline might be failed by unexpected upgrade. Verified: https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=384425&view=results ### P.S. I think we should replace all Azure-Pipelines-EO-Windows2022-aiinfra. --- .../azure-pipelines/templates/c-api-cpu.yml | 126 ++++++++++-------- 1 file changed, 72 insertions(+), 54 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 4ce39ecc35bfb..cfd2931665d17 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -304,9 +304,7 @@ stages: - job: workspace: clean: all - # we need to use the 2022 pool to create the nuget package with both pre-net6+Xamarin and net6 targets. - # VS2019 has no support for net6 and we need to use msbuild (from the VS install) to do the packing - pool: 'Azure-Pipelines-EO-Windows2022-aiinfra' + pool: 'onnxruntime-Win-CPU-2022' variables: OrtPackageId: ${{ parameters.OrtNugetPackageId }} breakCodesignValidationInjection: ${{ parameters.DoEsrp }} @@ -315,66 +313,86 @@ stages: steps: - checkout: self submodules: true - - task: DownloadPipelineArtifact@0 - displayName: 'Download win-x64 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-win-x64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' - - task: DownloadPipelineArtifact@0 - displayName: 'Download win-x86 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-win-x86' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Pipeline Artifact - Win x64' + ArtifactName: 'onnxruntime-win-x64' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download win-arm64 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-win-arm64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download win-x86 Pipeline Artifact' + ArtifactName: 'onnxruntime-win-x86' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download win-arm Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-win-arm' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download win-arm64 Pipeline Artifact' + ArtifactName: 'onnxruntime-win-arm64' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download osx-x64 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-osx' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download win-arm Pipeline Artifact' + ArtifactName: 'onnxruntime-win-arm' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: 
DownloadPipelineArtifact@0 - displayName: 'Download linux-x64 Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-linux-x64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download osx-x64 Pipeline Artifact' + ArtifactName: 'onnxruntime-osx' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download Pipeline Artifact - NuGet' - inputs: - artifactName: 'onnxruntime-linux-aarch64' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download linux-x64 Pipeline Artifact' + ArtifactName: 'onnxruntime-linux-x64' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@2 - displayName: 'Download iOS Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-ios-full-xcframework' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download linux-aarch64 Pipeline Artifact' + ArtifactName: 'onnxruntime-linux-aarch64' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@2 - displayName: 'Download android-full-aar Pipeline Artifact' - inputs: - artifactName: 'onnxruntime-android-full-aar' - patterns: '**/*.aar' - targetPath: '$(Build.BinariesDirectory)/nuget-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download iOS Pipeline Artifact' + ArtifactName: 'onnxruntime-ios-full-xcframework' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - - task: DownloadPipelineArtifact@0 - displayName: 'Download drop-extra Pipeline Artifact' - inputs: - artifactName: 'drop-extra' - targetPath: '$(Build.BinariesDirectory)/extra-artifact' + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download Android-full-aar Pipeline Artifact' + ArtifactName: 'onnxruntime-android-full-aar' + TargetPath: '$(Build.BinariesDirectory)/nuget-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - template: flex-downloadPipelineArtifact.yml + parameters: + StepName: 'Download drop-extra Pipeline Artifact' + ArtifactName: 'drop-extra' + TargetPath: '$(Build.BinariesDirectory)/extra-artifact' + SpecificArtifact: ${{ parameters.specificArtifact }} + BuildId: ${{ parameters.BuildId }} - script: | dir From 0b7048e7d621b271b0ab4748e566f57d11b49be5 Mon Sep 17 00:00:00 2001 From: Sheil Kumar Date: Tue, 28 Nov 2023 09:26:48 -0800 Subject: [PATCH 6/6] Update winml to use #cores - #soc cores by Default as the number of intraopthreads (#18384) Update winml to use #cores - #soc cores by Default as the number of intraopthreads --------- Co-authored-by: Sheil Kumar --- cmake/winml.cmake | 2 + winml/lib/Api/HardwareCoreEnumerator.cpp | 90 +++++++++++++++++++ winml/lib/Api/HardwareCoreEnumerator.h | 11 +++ winml/lib/Api/LearningModelDevice.cpp | 3 +- winml/lib/Api/LearningModelSessionOptions.cpp | 11 ++- winml/lib/Api/LearningModelSessionOptions.h | 4 +- 
 .../test/api/LearningModelSessionAPITest.cpp  |  6 --
 7 files changed, 117 insertions(+), 10 deletions(-)
 create mode 100644 winml/lib/Api/HardwareCoreEnumerator.cpp
 create mode 100644 winml/lib/Api/HardwareCoreEnumerator.h

diff --git a/cmake/winml.cmake b/cmake/winml.cmake
index 395996f0fa4b9..268ee3960e75a 100644
--- a/cmake/winml.cmake
+++ b/cmake/winml.cmake
@@ -451,6 +451,8 @@ onnxruntime_add_static_library(winml_lib_api
   ${winml_lib_api_dir}/impl/TensorKindFrom.h
   ${winml_lib_api_dir}/impl/TensorMemoryBufferReference.h
   ${winml_lib_api_dir}/NumericData.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.cpp
+  ${winml_lib_api_dir}/HardwareCoreEnumerator.h
   ${winml_lib_api_dir}/ImageFeatureDescriptor.cpp
   ${winml_lib_api_dir}/ImageFeatureDescriptor.h
   ${winml_lib_api_dir}/ImageFeatureValue.cpp
diff --git a/winml/lib/Api/HardwareCoreEnumerator.cpp b/winml/lib/Api/HardwareCoreEnumerator.cpp
new file mode 100644
index 0000000000000..a89ac561f8860
--- /dev/null
+++ b/winml/lib/Api/HardwareCoreEnumerator.cpp
@@ -0,0 +1,90 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#include "lib/Api/pch/pch.h"
+
+#include "HardwareCoreEnumerator.h"
+
+namespace WINMLP {
+
+struct LogicalProcessorInformation {
+  std::unique_ptr<char[]> Buffer;
+  size_t Length;
+};
+
+struct CoreCounter {
+  uint32_t PhysicalCores = 0;
+  uint32_t SocDieCores = 0;
+};
+
+static LogicalProcessorInformation GetLogicalProcessorInfos(LOGICAL_PROCESSOR_RELATIONSHIP relationship) {
+  DWORD length = 0;
+  DWORD rc = GetLogicalProcessorInformationEx(relationship, nullptr, &length);
+
+  assert(rc == FALSE);
+
+  auto processorInformationBytes = std::make_unique<char[]>(length);
+
+  rc = GetLogicalProcessorInformationEx(
+    relationship, reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(processorInformationBytes.get()), &length
+  );
+
+  assert(rc == TRUE);
+
+  return {std::move(processorInformationBytes), length};
+}
+
+uint32_t CountSetBits(DWORD input) {
+  uint32_t c;
+  for (c = 0; input; c++) {
+    input &= input - 1;
+  }
+  return c;
+}
+
+static CoreCounter GetNumberOPhysicalAndEngineeringCores() {
+  auto logicalProcessorInformation = GetLogicalProcessorInfos(RelationAll);
+
+  CoreCounter cores;
+  DWORD dwLevel2GroupMask = 0;
+  DWORD dwLevel3GroupMask = 0;
+  size_t read = 0;
+  PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX currentProcessorInfo = NULL;
+
+  while ((read + FIELD_OFFSET(SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX, Processor)) < logicalProcessorInformation.Length
+  ) {
+    currentProcessorInfo =
+      reinterpret_cast<PSYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX>(logicalProcessorInformation.Buffer.get() + read);
+    if ((read + currentProcessorInfo->Size) > logicalProcessorInformation.Length) {
+      break;
+    }
+
+    switch (currentProcessorInfo->Relationship) {
+      case RelationProcessorCore:
+        cores.PhysicalCores++;
+        break;
+      case RelationCache:
+        if (currentProcessorInfo->Cache.Level == 2) {
+          dwLevel2GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
+        } else if (currentProcessorInfo->Cache.Level == 3) {
+          dwLevel3GroupMask |= currentProcessorInfo->Cache.GroupMask.Mask;
+        }
+        break;
+    }
+
+    read += currentProcessorInfo->Size;
+  }
+
+  cores.SocDieCores = CountSetBits(dwLevel2GroupMask & ~dwLevel3GroupMask);
+  return cores;
+}
+
+uint32_t HardwareCoreEnumerator::DefaultIntraOpNumThreads() {
+  // # of physical cores = # of P cores + # of E Cores + # of Soc Cores.
+  // # of logical cores = # of P cores x 2 (if hyper threading is enabled) + # of E cores + # of Soc Cores.
+  auto cores = GetNumberOPhysicalAndEngineeringCores();
+  // We want to use the number of physical cores, but exclude soc cores
+  return cores.PhysicalCores - cores.SocDieCores;
+}
+
+} // namespace WINMLP
diff --git a/winml/lib/Api/HardwareCoreEnumerator.h b/winml/lib/Api/HardwareCoreEnumerator.h
new file mode 100644
index 0000000000000..6861ba7d46bcf
--- /dev/null
+++ b/winml/lib/Api/HardwareCoreEnumerator.h
@@ -0,0 +1,11 @@
+// Copyright (c) Microsoft Corporation. All rights reserved.
+// Licensed under the MIT License.
+
+#pragma once
+
+namespace WINMLP {
+struct HardwareCoreEnumerator {
+  HardwareCoreEnumerator() = delete;
+  static uint32_t DefaultIntraOpNumThreads();
+};
+} // namespace WINMLP
diff --git a/winml/lib/Api/LearningModelDevice.cpp b/winml/lib/Api/LearningModelDevice.cpp
index c9c6f5bc70ee2..9f48ee03886e1 100644
--- a/winml/lib/Api/LearningModelDevice.cpp
+++ b/winml/lib/Api/LearningModelDevice.cpp
@@ -7,6 +7,7 @@
 #include
 #include
 #include "D3DDeviceCache.h"
+#include "HardwareCoreEnumerator.h"

 #include "ConverterResourceStore.h"
@@ -131,7 +132,7 @@ LearningModelDevice::CacheThreadPool(_winml::IThreading* thread_pool) {

 uint32_t LearningModelDevice::NumberOfIntraOpThreads() {
   if (IsCpuDevice()) {
-    return std::thread::hardware_concurrency();
+    return HardwareCoreEnumerator::DefaultIntraOpNumThreads();
   } else {
     // GPU sessions should not rely on intra op threads.
     // Creating a large thread pool is unnecessary and wasteful, and can cause
diff --git a/winml/lib/Api/LearningModelSessionOptions.cpp b/winml/lib/Api/LearningModelSessionOptions.cpp
index 2ff9c6d1d56d0..374200fb3b9f8 100644
--- a/winml/lib/Api/LearningModelSessionOptions.cpp
+++ b/winml/lib/Api/LearningModelSessionOptions.cpp
@@ -3,11 +3,20 @@
 #include "lib/Api/pch/pch.h"
 #include "LearningModelSessionOptions.h"
+#include "HardwareCoreEnumerator.h"

 namespace WINMLP {
+
+LearningModelSessionOptions::LearningModelSessionOptions() {
+  intra_op_num_threads_override_ = HardwareCoreEnumerator::DefaultIntraOpNumThreads();
+}
+
 LearningModelSessionOptions::LearningModelSessionOptions(const LearningModelSessionOptions& options)
   : batch_size_override_(options.batch_size_override_),
-    close_model_on_session_creation_(options.close_model_on_session_creation_) {
+    close_model_on_session_creation_(options.close_model_on_session_creation_),
+    named_dim_overrides_(options.named_dim_overrides_),
+    intra_op_num_threads_override_(options.intra_op_num_threads_override_),
+    custom_ops_lib_paths_(options.custom_ops_lib_paths_) {
 }

 uint32_t LearningModelSessionOptions::BatchSizeOverride() {
diff --git a/winml/lib/Api/LearningModelSessionOptions.h b/winml/lib/Api/LearningModelSessionOptions.h
index 5fc7e54997403..21d0242735f94 100644
--- a/winml/lib/Api/LearningModelSessionOptions.h
+++ b/winml/lib/Api/LearningModelSessionOptions.h
@@ -11,7 +11,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
     LearningModelSessionOptions,
     ILearningModelSessionOptionsNative,
     ILearningModelSessionOptionsNative1> {
-  LearningModelSessionOptions() = default;
+  LearningModelSessionOptions();

   LearningModelSessionOptions(const LearningModelSessionOptions& options);

@@ -72,7 +72,7 @@ struct LearningModelSessionOptions : LearningModelSessionOptionsT<
   // The intra operator num threads property is used to control the number of threads used in the threadpool for intra operator calculations.
   // The default value here is the maximum number of logical cores to ensure that the default behavior of WinML always runs the fastest.
   // WARNING: Setting a number higher than the maximum number of logical cores may result in an inefficient threadpool
-  uint32_t intra_op_num_threads_override_ = std::thread::hardware_concurrency();
+  uint32_t intra_op_num_threads_override_;

   bool allow_thread_spinning_ = true;

diff --git a/winml/test/api/LearningModelSessionAPITest.cpp b/winml/test/api/LearningModelSessionAPITest.cpp
index 4ec79b8a0f4c6..d6e70e35e3a6d 100644
--- a/winml/test/api/LearningModelSessionAPITest.cpp
+++ b/winml/test/api/LearningModelSessionAPITest.cpp
@@ -2195,12 +2195,6 @@ static void SetIntraOpNumThreads() {
   auto binding = LearningModelBinding(session);
   binding.Bind(L"input", tensor_input);
   WINML_EXPECT_NO_THROW(session.Evaluate(binding, L""));
-
-  // Check to verify that the default number of threads in LearningModelSession is equal to the number of logical cores.
-  session = LearningModelSession(model, device);
-  nativeSession = session.as();
-  WINML_EXPECT_NO_THROW(nativeSession->GetIntraOpNumThreads(&numIntraOpThreads));
-  WINML_EXPECT_EQUAL(std::thread::hardware_concurrency(), numIntraOpThreads);
 }

 static void SetIntraOpThreadSpinning() {
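For readers skimming the patch, a minimal standalone sketch of the thread-count policy it introduces follows. The core counts below are hypothetical and only illustrate the arithmetic; they are not taken from the patch or from any particular CPU.

    // Hypothetical hybrid CPU; only the formula mirrors the patch's PhysicalCores - SocDieCores rule.
    #include <cstdint>
    #include <iostream>

    int main() {
      uint32_t p_cores = 8;    // performance cores, counted via RelationProcessorCore
      uint32_t e_cores = 4;    // efficiency cores, also counted via RelationProcessorCore
      uint32_t soc_cores = 2;  // low-power cores visible only behind an L2 mask with no L3 mask
      uint32_t physical_cores = p_cores + e_cores + soc_cores;         // 14 physical cores in total
      uint32_t default_intra_op_threads = physical_cores - soc_cores;  // 12 threads by default
      std::cout << default_intra_op_threads << "\n";
      return 0;
    }

On such a machine the new default would be 12 intra-op threads, even if hyper-threading reports more logical processors; callers who want a different value can still set it explicitly through the session options override.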