From c02b39898063f0e1d5d37c3e32bd56cf1a151421 Mon Sep 17 00:00:00 2001 From: Jing Fang <126209182+fajin-corp@users.noreply.github.com> Date: Thu, 14 Nov 2024 18:38:59 +0000 Subject: [PATCH 01/28] [ARM] MatMulNBits Fp16 support - API change only (#22826) ### Description A break-down PR of https://github.com/microsoft/onnxruntime/pull/22651 Op API change only. - add template to functions and classes that support fp32 and fp16 - rename functions, classes and files that support fp32 and fp16 from SQNBxxx to QNBxxx ### Motivation and Context --- cmake/onnxruntime_mlas.cmake | 4 +- .../cpu/quantization/matmul_nbits.cc | 159 +++++---- onnxruntime/core/mlas/inc/mlas_qnbit.h | 82 ++--- .../mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp | 4 +- onnxruntime/core/mlas/lib/mlasi.h | 14 +- onnxruntime/core/mlas/lib/platform.cpp | 10 +- .../lib/{sqnbitgemm.cpp => qnbitgemm.cpp} | 313 ++++++++++++------ .../mlas/lib/{sqnbitgemm.h => qnbitgemm.h} | 45 +-- .../core/mlas/lib/sqnbitgemm_kernel_avx2.cpp | 30 +- .../sqnbitgemm_kernel_avx2_int8_blklen16.h | 2 +- .../sqnbitgemm_kernel_avx2_int8_blklen32.h | 2 +- .../sqnbitgemm_kernel_avx2_int8_blklen64.h | 2 +- .../mlas/lib/sqnbitgemm_kernel_avx512.cpp | 22 +- .../mlas/lib/sqnbitgemm_kernel_avx512_int8.h | 12 +- .../sqnbitgemm_kernel_avx512_int8_blklen128.h | 2 +- .../sqnbitgemm_kernel_avx512_int8_blklen16.h | 2 +- .../sqnbitgemm_kernel_avx512_int8_blklen32.h | 2 +- .../sqnbitgemm_kernel_avx512_int8_blklen64.h | 2 +- .../mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp | 18 +- .../mlas/lib/sqnbitgemm_kernel_avx_common.h | 24 +- .../lib/sqnbitgemm_kernel_avx_common_fp32.h | 2 +- .../lib/sqnbitgemm_kernel_avx_common_int8.h | 2 +- .../core/mlas/lib/sqnbitgemm_kernel_neon.cpp | 32 +- .../core/mlas/lib/sqnbitgemm_kernel_neon.h | 8 +- .../mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp | 6 +- .../mlas/lib/sqnbitgemm_kernel_neon_int8.cpp | 6 +- ...bitgemm_m1_sym_kernel_avx2_int8_blklen32.h | 2 +- ...bitgemm_m1_sym_kernel_avx2_int8_blklen64.h | 2 +- .../vsinpu/patches/mlas_crosscompiling.patch | 56 ++-- .../test/mlas/bench/bench_sqnbitgemm.cpp | 42 +-- .../test/mlas/unittest/test_sqnbitgemm.cpp | 40 +-- 31 files changed, 547 insertions(+), 402 deletions(-) rename onnxruntime/core/mlas/lib/{sqnbitgemm.cpp => qnbitgemm.cpp} (70%) rename onnxruntime/core/mlas/lib/{sqnbitgemm.h => qnbitgemm.h} (91%) diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 1d578d09b1c03..a85ea942c42a3 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -36,8 +36,8 @@ onnxruntime_add_static_library(onnxruntime_mlas ${MLAS_SRC_DIR}/qpostprocessor.cpp ${MLAS_SRC_DIR}/qlgavgpool.cpp ${MLAS_SRC_DIR}/qdwconv_kernelsize.cpp - ${MLAS_SRC_DIR}/sqnbitgemm.h - ${MLAS_SRC_DIR}/sqnbitgemm.cpp + ${MLAS_SRC_DIR}/qnbitgemm.h + ${MLAS_SRC_DIR}/qnbitgemm.cpp ${MLAS_SRC_DIR}/sqnbitgemm_q8_block.h ${MLAS_SRC_DIR}/flashattn.cpp ${MLAS_SRC_DIR}/cast.cpp diff --git a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc index 89e96543c4729..473ec51524f22 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc +++ b/onnxruntime/contrib_ops/cpu/quantization/matmul_nbits.cc @@ -32,24 +32,46 @@ constexpr size_t A = 0, bias = 5; }; -int64_t GetAccuracyLevel(size_t nbits, size_t block_size, int64_t accuracy_level_attr) { - const auto accuracy_level = std::clamp(accuracy_level_attr, - static_cast(CompMostAccurate), - static_cast(CompLeastAccurate)); - - // Find a supported accuracy level that is not less accurate than the one given. - // CompMostAccurate is always supported with the fallback implementation. - // Note: A higher numeric accuracy level value means lower accuracy, so the comparison order is reversed. - int64_t effective_accuracy_level = accuracy_level; - for (; effective_accuracy_level > CompMostAccurate; --effective_accuracy_level) { - const auto compute_type = static_cast(effective_accuracy_level); - if (MlasIsSQNBitGemmAvailable(nbits, block_size, compute_type)) { - break; - } +typedef enum { + Level1, /*!< input fp32, accumulator fp32 */ + Level2, /*!< input fp16, accumulator fp16 */ + Level3, /*!< input bf16, accumulator fp32 */ + Level4, /*!< input int8, accumulator int32 */ +} ACCURACY_LEVEL; + +// T: A data type. +template +MLAS_QNBIT_GEMM_COMPUTE_TYPE +GetComputeType(size_t nbits, size_t block_size, int64_t accuracy_level_attr) { + // For Fp32, only accuracy level 1 or 4 makes sense. + // non-ARM CPU converts Fp16 to Fp32. + // By converting Fp32 to Fp16, precision becomes worse. And due to the casting, + // there is no performance gain. + if (accuracy_level_attr == static_cast(Level4) && + MlasIsQNBitGemmAvailable(nbits, block_size, SQNBIT_CompInt8)) { + return SQNBIT_CompInt8; } - return effective_accuracy_level; + return SQNBIT_CompFp32; } + +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) +template <> +MLAS_QNBIT_GEMM_COMPUTE_TYPE +GetComputeType(size_t nbits, size_t block_size, int64_t accuracy_level_attr) { + // For Fp16, only accuracy level 2 or 4 makes sense. + // By converting Fp16 to Fp32, there is not precision increase, and the performance + // becomes worse. + if (accuracy_level_attr == static_cast(Level4) && + MlasIsQNBitGemmAvailable(nbits, block_size, HQNBIT_CompInt8)) { + return HQNBIT_CompInt8; + } + + // if HQNBIT_CompFp16 is not supported, will fallback to unpacked computation. + return HQNBIT_CompFp16; +} +#endif // !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64 + } // namespace bool GetType(const NodeArg& node_arg, int32_t& type) { @@ -74,10 +96,9 @@ class MatMulNBits final : public OpKernel { N_{narrow(info.GetAttr("N"))}, block_size_{narrow(info.GetAttr("block_size"))}, nbits_{narrow(info.GetAttr("bits"))}, - accuracy_level_{GetAccuracyLevel(nbits_, block_size_, info.GetAttr("accuracy_level"))}, has_g_idx_{info.GetInputCount() > InputIndex::g_idx && info.node().InputDefs()[InputIndex::g_idx]->Exists()}, has_bias_{info.GetInputCount() > InputIndex::bias && info.node().InputDefs()[InputIndex::bias]->Exists()}, - compute_type_{static_cast(accuracy_level_)} { + compute_type_{GetComputeType(nbits_, block_size_, info.GetAttr("accuracy_level"))} { const auto& node = info.node(); auto input_defs = node.InputDefs(); const NodeArg* zero_point_arg = @@ -109,10 +130,9 @@ class MatMulNBits final : public OpKernel { const size_t N_; const size_t block_size_; const size_t nbits_; - const int64_t accuracy_level_; const bool has_g_idx_; const bool has_bias_; - const MLAS_SQNBIT_GEMM_COMPUTE_TYPE compute_type_; + const MLAS_QNBIT_GEMM_COMPUTE_TYPE compute_type_; bool has_unquantized_zero_point_{false}; const bool column_wise_quant_{true}; IAllocatorUniquePtr packed_b_{}; @@ -143,9 +163,7 @@ class MatMulNBits final : public OpKernel { Tensor* y, AllocatorPtr& allocator, concurrency::ThreadPool* thread_pool, - const MatMulComputeHelper& helper) const { - ORT_THROW("ComputeBPacked is not supported for T1 type."); - } + const MatMulComputeHelper& helper) const; }; template @@ -158,28 +176,28 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All return Status::OK(); } - if (!MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type_)) { + if (!MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) { return Status::OK(); } if (input_idx == InputIndex::B) { - packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_); + packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_); if (packed_b_size_ == 0) { return Status::OK(); } auto qptr = tensor.DataRaw(); packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); - MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), nullptr, has_zp_input_, nullptr, nullptr); + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), nullptr, has_zp_input_, nullptr, nullptr); is_packed = true; - } else if (compute_type_ == CompInt8) { + } else if (compute_type_ == SQNBIT_CompInt8) { #ifdef MLAS_TARGET_AMD64_IX86 if (input_idx == InputIndex::scales && packed_b_ != nullptr) { auto sptr = tensor.Data(); - MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr, - has_zp_input_, nullptr, nullptr); + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), sptr, + has_zp_input_, nullptr, nullptr); is_packed = false; } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) { auto zptr = tensor.Data(); - MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr); + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), nullptr, has_zp_input_, zptr, nullptr); is_packed = false; } #endif // MLAS_TARGET_AMD64_IX86 @@ -188,6 +206,8 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ All return Status::OK(); } +#if !defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || !defined(MLAS_TARGET_ARM64) +// Non-ARM-with-fp16-intrinsics fall back fp16 to fp32. template <> Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*out*/ AllocatorPtr alloc, /*out*/ bool& is_packed, @@ -211,29 +231,29 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou return Status::OK(); } - if (!MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type_)) { + if (!MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) { return Status::OK(); } if (input_idx == InputIndex::B) { - packed_b_size_ = MlasSQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_); + packed_b_size_ = MlasQNBitGemmPackQuantBDataSize(N_, K_, nbits_, block_size_, compute_type_); if (packed_b_size_ == 0) { return Status::OK(); } auto qptr = tensor.DataRaw(); packed_b_ = IAllocator::MakeUniquePtr(alloc, packed_b_size_, true); - MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), - nullptr, has_zp_input_, nullptr, nullptr); + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, qptr, packed_b_.get(), + nullptr, has_zp_input_, nullptr, nullptr); is_packed = true; - } else if (compute_type_ == CompInt8) { + } else if (compute_type_ == SQNBIT_CompInt8) { #ifdef MLAS_TARGET_AMD64_IX86 if (input_idx == InputIndex::scales && packed_b_ != nullptr) { - MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), - scales_fp32_.get(), has_zp_input_, nullptr, nullptr); + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), + scales_fp32_.get(), has_zp_input_, nullptr, nullptr); is_packed = false; } else if (input_idx == InputIndex::zero_points && packed_b_ != nullptr) { auto zptr = tensor.Data(); - MlasSQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), - nullptr, has_zp_input_, zptr, nullptr); + MlasQNBitGemmPackQuantBData(N_, K_, nbits_, block_size_, compute_type_, nullptr, packed_b_.get(), + nullptr, has_zp_input_, zptr, nullptr); is_packed = false; } #endif // MLAS_TARGET_AMD64_IX86 @@ -241,6 +261,7 @@ Status MatMulNBits::PrePack(const Tensor& tensor, int input_idx, /*ou return Status::OK(); } +#endif // end !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_ARM64 template Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& prepacked_buffers, int input_idx, @@ -255,20 +276,20 @@ Status MatMulNBits::UseSharedPrePackedBuffers(std::vector& return Status::OK(); } -template <> -Status MatMulNBits::ComputeBPacked(const Tensor* a, - const Tensor* scales, - const Tensor* zero_points, - const Tensor* bias, - Tensor* y, - AllocatorPtr& allocator, - concurrency::ThreadPool* thread_pool, - const MatMulComputeHelper& helper) const { - const auto* a_data = a->Data(); - const auto* scales_data = scales->Data(); +template +Status MatMulNBits::ComputeBPacked(const Tensor* a, + const Tensor* scales, + const Tensor* zero_points, + const Tensor* bias, + Tensor* y, + AllocatorPtr& allocator, + concurrency::ThreadPool* thread_pool, + const MatMulComputeHelper& helper) const { + const auto* a_data = a->Data(); + const auto* scales_data = scales->Data(); const auto* zero_points_data = zero_points == nullptr ? nullptr : zero_points->DataRaw(); - const auto* bias_data = bias == nullptr ? nullptr : bias->Data(); - auto* y_data = y->MutableData(); + const auto* bias_data = bias == nullptr ? nullptr : bias->Data(); + auto* y_data = y->MutableData(); const size_t batch_count = helper.OutputOffsets().size(); const size_t M = static_cast(helper.M()); @@ -277,19 +298,19 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, const size_t lda = helper.Lda(false); IAllocatorUniquePtr workspace{}; - const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize( + const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize( M, N, K, batch_count, nbits_, block_size_, compute_type_); if (workspace_size > 0) { // Use reserve since no caching is needed workspace = IAllocator::MakeUniquePtr(allocator, workspace_size, true); } - InlinedVector data(batch_count); + InlinedVector> data(batch_count); for (size_t i = 0; i < batch_count; ++i) { data[i].A = a_data + helper.LeftOffsets()[i]; data[i].lda = lda; #ifdef MLAS_TARGET_AMD64_IX86 - if (compute_type_ == CompInt8) { + if (compute_type_ == SQNBIT_CompInt8) { data[i].QuantBDataWorkspace = packed_b_.get(); } #endif @@ -300,11 +321,12 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, data[i].C = y_data + helper.OutputOffsets()[i]; data[i].ldc = N; } - MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(), - thread_pool); + MlasQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(), + thread_pool); return Status::OK(); } +#if !defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) || !defined(MLAS_TARGET_ARM64) template <> Status MatMulNBits::ComputeBPacked(const Tensor* a, const Tensor* scales, @@ -327,7 +349,7 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, const size_t lda = helper.Lda(false); IAllocatorUniquePtr workspace{}; - const size_t workspace_size = MlasSQNBitGemmBatchWorkspaceSize( + const size_t workspace_size = MlasQNBitGemmBatchWorkspaceSize( M, N, K, batch_count, nbits_, block_size_, compute_type_); if (workspace_size > 0) { // Use reserve since no caching is needed @@ -361,12 +383,12 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, size_t c_size = static_cast(y->Shape().Size()); std::vector c_v(c_size); - InlinedVector data(batch_count); + InlinedVector> data(batch_count); for (size_t i = 0; i < batch_count; ++i) { data[i].A = tmp_a_data_ptr.get() + helper.LeftOffsets()[i]; data[i].lda = lda; #ifdef MLAS_TARGET_AMD64_IX86 - if (compute_type_ == CompInt8) { + if (compute_type_ == SQNBIT_CompInt8) { data[i].QuantBDataWorkspace = packed_b_.get(); } #endif @@ -377,11 +399,12 @@ Status MatMulNBits::ComputeBPacked(const Tensor* a, data[i].C = c_v.data() + helper.OutputOffsets()[i]; data[i].ldc = N; } - MlasSQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(), - thread_pool); + MlasQNBitGemmBatch(M, N, K, batch_count, nbits_, block_size_, compute_type_, data.data(), workspace.get(), + thread_pool); MlasConvertFloatToHalfBuffer(c_v.data(), y_data, c_size); return Status::OK(); } +#endif // end of !MLAS_F16VEC_INTRINSICS_SUPPORTED || !MLAS_TARGET_AMD64 template <> Status MatMulNBits::ComputeBUnpacked(const Tensor* a, @@ -517,9 +540,10 @@ Status MatMulNBits::ComputeBUnpacked(const Tensor* a, const size_t ldb = helper.Ldb(true); float* scales_ptr = nullptr; + IAllocatorUniquePtr temp_scales; if (!scales_fp32_) { auto scales_size = static_cast(scales->Shape().Size()); - auto temp_scales = IAllocator::MakeUniquePtr(allocator, scales_size, true); + temp_scales = IAllocator::MakeUniquePtr(allocator, scales_size, true); MlasConvertHalfToFloatBuffer(scales_data, temp_scales.get(), scales_size); scales_ptr = temp_scales.get(); } else { @@ -600,8 +624,9 @@ Status MatMulNBits::ComputeBUnpacked(const Tensor* a, if (bias) { float* bias_ptr = nullptr; const size_t bias_size = static_cast(bias->Shape().Size()); + IAllocatorUniquePtr bias_temp; if (!bias_fp32_) { - auto bias_temp = IAllocator::MakeUniquePtr(allocator, bias_size, true); + bias_temp = IAllocator::MakeUniquePtr(allocator, bias_size, true); MlasConvertHalfToFloatBuffer(bias->Data(), bias_temp.get(), bias_size); bias_ptr = bias_temp.get(); } else { @@ -654,11 +679,11 @@ Status MatMulNBits::Compute(OpKernelContext* ctx) const { // clang-format on if (has_single_b_matrix && - packed_b_) { // Assume that MlasSQNBitGemmBatch() always requires packed B. - // If this changes, i.e., if MlasIsSQNBitGemmAvailable() can return true while - // MlasSQNBitGemmPackQuantBDataSize() returns 0, we can consider calling MlasSQNBitGemmBatch() + packed_b_) { // Assume that MlasQNBitGemmBatch() always requires packed B. + // If this changes, i.e., if MlasIsQNBitGemmAvailable() can return true while + // MlasQNBitGemmPackQuantBDataSize() returns 0, we can consider calling MlasQNBitGemmBatch() // with B directly too. - if (MlasIsSQNBitGemmAvailable(nbits_, block_size_, compute_type_)) { + if (MlasIsQNBitGemmAvailable(nbits_, block_size_, compute_type_)) { return ComputeBPacked(a, scales, zero_points, bias, y, allocator, thread_pool, helper); } } diff --git a/onnxruntime/core/mlas/inc/mlas_qnbit.h b/onnxruntime/core/mlas/inc/mlas_qnbit.h index 232bf2261ef4c..9608644a22523 100644 --- a/onnxruntime/core/mlas/inc/mlas_qnbit.h +++ b/onnxruntime/core/mlas/inc/mlas_qnbit.h @@ -27,51 +27,50 @@ Module Name: * @brief Define compute types of block quantization, in order of decreasing accuracy. */ typedef enum { - CompUndef = 0, /*!< undef */ - CompFp32, /*!< input fp32, accumulator fp32 */ - CompFp16, /*!< input fp16, accumulator fp16 */ - CompBf16, /*!< input bf16, accumulator fp32 */ - CompInt8, /*!< input int8, accumulator int32 */ - - // special values that should be the first and last actual values - - CompMostAccurate = CompUndef, - CompLeastAccurate = CompInt8, -} MLAS_SQNBIT_GEMM_COMPUTE_TYPE; + SQNBIT_CompFp32, /*!< input fp32, accumulator fp32 */ + HQNBIT_CompFp16, /*!< input fp16, accumulator fp16 */ + BHQNBIT_CompBf16, /*!< input bf16, accumulator fp32 */ + SQNBIT_CompInt8, /*!< input int8, accumulator int32, input fp32 */ + HQNBIT_CompInt8, /*!< input int8, accumulator int32, input fp16 */ +} MLAS_QNBIT_GEMM_COMPUTE_TYPE; /** * @brief Data parameters for float/n-bit quantized int GEMM routine. + * + * @tparam T data type of input A */ -struct MLAS_SQNBIT_GEMM_DATA_PARAMS { - const float* A = nullptr; ///< address of A (float32 matrix) +template +struct MLAS_QNBIT_GEMM_DATA_PARAMS { + const T* A = nullptr; ///< address of A (float32/16 matrix) size_t lda = 0; ///< leading dimension of A const void* QuantBDataWorkspace; ///< address of quantized B (quantized n-bit int values) const std::byte* PackedQuantBData = nullptr; /// address of packed quantized B data - const float* QuantBScale = nullptr; ///< address of scale values of quantized B, one per block + const T* QuantBScale = nullptr; ///< address of scale values of quantized B, one per block const void* QuantBZeroPoint = nullptr; ///< optional address of zero point values of quantized B, one per block - const float* QuantBBlkSum = nullptr; ///< optional address of scale * zp, one per block - const float* Bias = nullptr; ///< optional address of Bias, vector size N - float* C = nullptr; ///< address of result matrix + const T* QuantBBlkSum = nullptr; ///< optional address of scale * zp, one per block + const T* Bias = nullptr; ///< optional address of Bias, vector size N + T* C = nullptr; ///< address of result matrix size_t ldc = 0; ///< leading dimension of C ///< optional post processing to apply to result matrix - MLAS_GEMM_POSTPROCESSOR* PostProcessor = nullptr; + MLAS_GEMM_POSTPROCESSOR* PostProcessor = nullptr; }; /** * @brief Batched GEMM: C = A * B + Bias - * A must be a float32 matrix + * A must be a float32/16 matrix * B must be a quantized and packed n-bit int matrix * - * Call MlasIsSQNBitGemmAvailable() with the same parameters to determine whether this function may be called. + * Call MlasIsQNBitGemmAvailable() with the same parameters to determine whether this function may be called. * - * Call MlasSQNBitGemmPackQuantBDataSize() with the same parameters to determine whether - * MLAS_SQNBIT_GEMM_DATA_PARAMS::QuantBData in `DataParams` should point to a buffer packed with - * MlasSQNBitGemmPackQuantBData(). + * Call MlasQNBitGemmPackQuantBDataSize() with the same parameters to determine whether + * MLAS_QNBIT_GEMM_DATA_PARAMS::QuantBData in `DataParams` should point to a buffer packed with + * MlasQNBitGemmPackQuantBData(). * - * Call MlasSQNBitGemmBatchWorkspaceSize() with the same parameters to determine whether `Workspace` should + * Call MlasQNBitGemmBatchWorkspaceSize() with the same parameters to determine whether `Workspace` should * point to an intermediate workspace buffer. * + * @tparam T data type of input A * @param[in] M row size of matrix A and C * @param[in] N column size of matrix B and C * @param[in] K column size of matrix A and row size of matrix B @@ -81,36 +80,37 @@ struct MLAS_SQNBIT_GEMM_DATA_PARAMS { * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) * @param[inout] DataParams An array (size BatchN) of parameter blocks * @param[in] Workspace Address of intermediate workspace buffer. - If MlasSQNBitGemmBatchWorkspaceSize() returns a non-zero value, this must be a + If MlasQNBitGemmBatchWorkspaceSize() returns a non-zero value, this must be a buffer with at least that many bytes. Otherwise, it may be nullptr. * @param[in] ThreadPool optional thread pool to use */ +template void MLASCALL -MlasSQNBitGemmBatch( +MlasQNBitGemmBatch( size_t M, size_t N, size_t K, size_t BatchN, size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, - const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, + const MLAS_QNBIT_GEMM_DATA_PARAMS* DataParams, void* Workspace, MLAS_THREADPOOL* ThreadPool = nullptr ); /** - * @brief Determines whether a float32/quantized n-bit int GEMM implementation is available on the current platform. + * @brief Determines whether a float32/16 quantized n-bit int GEMM implementation is available on the current platform. * * @param[in] BlkBitWidth quantized value bit width (e.g., 4 means 4 bit ints) * @param[in] BlkLen number of quantized values per block * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ bool MLASCALL -MlasIsSQNBitGemmAvailable( +MlasIsQNBitGemmAvailable( size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); /** @@ -126,22 +126,22 @@ MlasIsSQNBitGemmAvailable( * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ size_t MLASCALL -MlasSQNBitGemmBatchWorkspaceSize( +MlasQNBitGemmBatchWorkspaceSize( size_t M, size_t N, size_t K, size_t BatchN, size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); /** * @brief Gets the size in bytes of the packed quantized B data. - * If non-zero, the quantized B data must first be packed by calling MlasSQNBitGemmPackQuantBData() with a buffer of - * this size, and then that packed quantized B data buffer must be passed to MlasSQNBitGemmBatch(). - * If zero, MlasSQNBitGemmPackQuantBData() must not be called and the quantized B data must be directly passed to - * MlasSQNBitGemmBatch(). + * If non-zero, the quantized B data must first be packed by calling MlasQNBitGemmPackQuantBData() with a buffer of + * this size, and then that packed quantized B data buffer must be passed to MlasQNBitGemmBatch(). + * If zero, MlasQNBitGemmPackQuantBData() must not be called and the quantized B data must be directly passed to + * MlasQNBitGemmBatch(). * * @param[in] N column size of matrix B and C * @param[in] K column size of matrix A and row size of matrix B @@ -150,12 +150,12 @@ MlasSQNBitGemmBatchWorkspaceSize( * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ size_t MLASCALL -MlasSQNBitGemmPackQuantBDataSize( +MlasQNBitGemmPackQuantBDataSize( size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); /** @@ -186,12 +186,12 @@ MlasSQNBitGemmPackQuantBDataSize( * @param[in] ThreadPool thread pool to use (no parallel if nullptr) */ void MLASCALL -MlasSQNBitGemmPackQuantBData( +MlasQNBitGemmPackQuantBData( size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const void* QuantBData, void* PackedQuantBDataAndOrBlkSum, const void* QuantBScale, diff --git a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp index d7d99899d544a..f1bc013a469d9 100644 --- a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp +++ b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp @@ -23,7 +23,7 @@ Module Name: #include #include "fp16_common.h" -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_neon.h" namespace sqnbitgemm_neon @@ -131,7 +131,7 @@ HQ4BitGemmPackQuantBData_CompFp16( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, std::byte* PackedQuantBDataBegin, MLAS_THREADPOOL* ThreadPool diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h index 13ea8d96c20e4..9bc574a845a3e 100644 --- a/onnxruntime/core/mlas/lib/mlasi.h +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -1017,17 +1017,17 @@ extern const MLAS_FPQ4GEMM_DISPATCH MlasFpQ4GemmDispatchAvx512; // Float/quantized n-bit integer matrix/matrix multiply dispatch structure. // -struct MLAS_SQNBIT_GEMM_DISPATCH; +struct MLAS_QNBIT_GEMM_DISPATCH; -extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchNeon; +extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchNeon; -extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2; +extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2; -extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2vnni; +extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2vnni; -extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512; +extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512; -extern const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni; +extern const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni; // // Quantized depthwise convolution kernels. @@ -1184,7 +1184,7 @@ struct MLAS_PLATFORM { const MLAS_FPQ4GEMM_DISPATCH* FpQ4GemmDispatch{nullptr}; const MLAS_Q8Q4GEMM_DISPATCH* Q8Q4GemmDispatch{nullptr}; - const MLAS_SQNBIT_GEMM_DISPATCH* SQNBitGemmDispatch{nullptr}; + const MLAS_QNBIT_GEMM_DISPATCH* QNBitGemmDispatch{nullptr}; MLAS_CAST_F16_TO_F32_KERNEL* CastF16ToF32Kernel; MLAS_CAST_F32_TO_F16_KERNEL* CastF32ToF16Kernel; diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 23d29fd02fa5a..12f7dd3e74dbc 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -387,7 +387,7 @@ Return Value: this->ConvDepthwiseS8S8Kernel = MlasConvDepthwiseKernelAvx2; this->ConvDepthwiseS8U8Kernel = MlasConvDepthwiseKernelAvx2; this->ComputeSumExpF32Kernel = MlasComputeSumExpF32KernelFma3; - this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx2; + this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx2; this->CastF16ToF32Kernel = &MlasCastF16ToF32KernelAvx2; this->CastF32ToF16Kernel = &MlasCastF32ToF16KernelAvx2; @@ -417,7 +417,7 @@ Return Value: this->GemmU8S8Kernel = MlasGemmU8S8KernelAvxVnni; this->GemvU8S8Kernel = MlasGemvU8S8KernelAvxVnni; this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvxVnni; - this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx2vnni; + this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx2vnni; } #if !defined(ORT_MINIMAL_BUILD) @@ -458,7 +458,7 @@ Return Value: this->GemmU8U8Kernel = MlasGemmU8U8KernelAvx512Core; this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvx512Core; this->FpQ4GemmDispatch = &MlasFpQ4GemmDispatchAvx512; - this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx512; + this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx512; // // Check if the processor supports AVX512VNNI. @@ -471,7 +471,7 @@ Return Value: this->GemvU8S8Kernel = MlasGemvU8S8KernelAvx512Vnni; this->ConvSymU8S8Dispatch = &MlasConvSymDispatchAvx512Vnni; this->Q8Q4GemmDispatch = &MlasQ8Q4GemmDispatchAvx512vnni; - this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx512vnni; + this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchAvx512vnni; } } } @@ -562,7 +562,7 @@ Return Value: this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot; // MlasSQNBitGemmDispatchNeon has a dependency on dot product instructions - this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; + this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; } #if defined(__linux__) diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp b/onnxruntime/core/mlas/lib/qnbitgemm.cpp similarity index 70% rename from onnxruntime/core/mlas/lib/sqnbitgemm.cpp rename to onnxruntime/core/mlas/lib/qnbitgemm.cpp index a45494ef2e04f..635a3b47a23fa 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm.cpp +++ b/onnxruntime/core/mlas/lib/qnbitgemm.cpp @@ -6,16 +6,16 @@ Licensed under the MIT License. Module Name: - sqnbitgemm.cpp + qnbitgemm.cpp Abstract: This module implements the float/quantized n-bit integer matrix - multiplication hardware agnostic entrypoint, MlasSQNBitGemmBatch, + multiplication hardware agnostic entrypoint, MlasQNBitGemmBatch, as well as some SQNBitGemm-related query functions. --*/ -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_q8_block.h" #include @@ -23,35 +23,40 @@ Module Name: namespace { -enum SQNBitGemmVariant { +enum QNBitGemmVariant { SQNBitGemmVariantInvalid = -1, // Valid variants SQNBitGemmVariant_BitWidth4_CompFp32 = 0, SQNBitGemmVariant_BitWidth4_CompInt8, + HQNBitGemmVariant_BitWidth4_CompFp16, + HQNBitGemmVariant_BitWidth4_CompInt8, // End of valid variants - // Keep this element last and ensure that its value is the number of valid SQNBitGemmVariant values. + // Keep this element last and ensure that its value is the number of valid QNBitGemmVariant values. // Its value is used as an array size. SQNBitGemmVariantCount, }; -SQNBitGemmVariant -GetSQNBitGemmVariant( +QNBitGemmVariant +GetQNBitGemmVariant( size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { if (BlkBitWidth == 4 && (BlkLen == 16 || BlkLen == 32 || BlkLen == 64 || BlkLen == 128 || BlkLen == 256)) { - if (ComputeType == CompFp32 || - ComputeType == CompUndef) { // treat CompUndef (undefined) as CompFp32 + if (ComputeType == SQNBIT_CompFp32) { return SQNBitGemmVariant_BitWidth4_CompFp32; - } else if (ComputeType == CompInt8) { + } else if (ComputeType == HQNBIT_CompFp16) { + return HQNBitGemmVariant_BitWidth4_CompFp16; + } else if (ComputeType == SQNBIT_CompInt8) { return SQNBitGemmVariant_BitWidth4_CompInt8; + } else if (ComputeType == HQNBIT_CompInt8) { + return HQNBitGemmVariant_BitWidth4_CompInt8; } } @@ -61,18 +66,18 @@ GetSQNBitGemmVariant( } // namespace bool MLASCALL -MlasIsSQNBitGemmAvailable( +MlasIsQNBitGemmAvailable( size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { - const auto* Dispatch = GetMlasPlatform().SQNBitGemmDispatch; + const auto* Dispatch = GetMlasPlatform().QNBitGemmDispatch; if (Dispatch == nullptr) { return false; } - const auto Variant = GetSQNBitGemmVariant(BlkBitWidth, BlkLen, ComputeType); + const auto Variant = GetQNBitGemmVariant(BlkBitWidth, BlkLen, ComputeType); switch (Variant) { case SQNBitGemmVariant_BitWidth4_CompFp32: { @@ -94,80 +99,80 @@ namespace { size_t -SQNBitGemmPerGemmWorkspaceSize( +QNBitGemmPerGemmWorkspaceSize( size_t M, size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { - const auto* Dispatch = GetMlasPlatform().SQNBitGemmDispatch; + const auto* Dispatch = GetMlasPlatform().QNBitGemmDispatch; if (Dispatch == nullptr) { return 0; } - if (BlkBitWidth == 4 && Dispatch->SQ4BitGemmPerGemmWorkspaceSize != nullptr) { - return Dispatch->SQ4BitGemmPerGemmWorkspaceSize(M, N, K, BlkLen, ComputeType); + if (BlkBitWidth == 4 && Dispatch->Q4BitGemmPerGemmWorkspaceSize != nullptr) { + return Dispatch->Q4BitGemmPerGemmWorkspaceSize(M, N, K, BlkLen, ComputeType); } return 0; } size_t -SQNBitGemmPerGemmWorkspaceAlignment( +QNBitGemmPerGemmWorkspaceAlignment( size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { - const auto* Dispatch = GetMlasPlatform().SQNBitGemmDispatch; + const auto* Dispatch = GetMlasPlatform().QNBitGemmDispatch; if (Dispatch == nullptr) { return 1; } - if (BlkBitWidth == 4 && Dispatch->SQ4BitGemmPerGemmWorkspaceAlignment != nullptr) { - return Dispatch->SQ4BitGemmPerGemmWorkspaceAlignment(BlkLen, ComputeType); + if (BlkBitWidth == 4 && Dispatch->Q4BitGemmPerGemmWorkspaceAlignment != nullptr) { + return Dispatch->Q4BitGemmPerGemmWorkspaceAlignment(BlkLen, ComputeType); } return 1; } size_t -SQNBitGemmPerGemmWorkspaceStride( +QNBitGemmPerGemmWorkspaceStride( size_t M, size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { - const auto Size = SQNBitGemmPerGemmWorkspaceSize(M, N, K, BlkBitWidth, BlkLen, ComputeType); - const auto Alignment = SQNBitGemmPerGemmWorkspaceAlignment(BlkBitWidth, BlkLen, ComputeType); + const auto Size = QNBitGemmPerGemmWorkspaceSize(M, N, K, BlkBitWidth, BlkLen, ComputeType); + const auto Alignment = QNBitGemmPerGemmWorkspaceAlignment(BlkBitWidth, BlkLen, ComputeType); return MlasDivRoundup(Size, Alignment) * Alignment; } } // namespace size_t MLASCALL -MlasSQNBitGemmBatchWorkspaceSize( +MlasQNBitGemmBatchWorkspaceSize( size_t M, size_t N, size_t K, size_t BatchN, size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { - const size_t PerGemmWorkspaceStride = SQNBitGemmPerGemmWorkspaceStride(M, N, K, BlkBitWidth, BlkLen, ComputeType); + const size_t PerGemmWorkspaceStride = QNBitGemmPerGemmWorkspaceStride(M, N, K, BlkBitWidth, BlkLen, ComputeType); if (PerGemmWorkspaceStride == 0) { return 0; } - const size_t Alignment = SQNBitGemmPerGemmWorkspaceAlignment(BlkBitWidth, BlkLen, ComputeType); + const size_t Alignment = QNBitGemmPerGemmWorkspaceAlignment(BlkBitWidth, BlkLen, ComputeType); const size_t WorkspaceSize = BatchN * PerGemmWorkspaceStride; @@ -175,21 +180,21 @@ MlasSQNBitGemmBatchWorkspaceSize( } size_t MLASCALL -MlasSQNBitGemmPackQuantBDataSize( +MlasQNBitGemmPackQuantBDataSize( size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { - const auto* Dispatch = GetMlasPlatform().SQNBitGemmDispatch; + const auto* Dispatch = GetMlasPlatform().QNBitGemmDispatch; if (Dispatch == nullptr) { return 0; } - if (BlkBitWidth == 4 && Dispatch->SQ4BitGemmPackQuantBDataSize != nullptr) { - return Dispatch->SQ4BitGemmPackQuantBDataSize( + if (BlkBitWidth == 4 && Dispatch->Q4BitGemmPackQuantBDataSize != nullptr) { + return Dispatch->Q4BitGemmPackQuantBDataSize( N, K, BlkLen, ComputeType ); } @@ -213,12 +218,12 @@ struct PerGemmQuantAWorkspace { }; void MLASCALL -MlasSQNBitGemmPackQuantBData( +MlasQNBitGemmPackQuantBData( size_t N, size_t K, size_t BlkBitWidth, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const void* QuantBData, void* PackedQuantBDataAndOrBlkSumWorkspace, const void* QuantBScale, @@ -227,15 +232,15 @@ MlasSQNBitGemmPackQuantBData( MLAS_THREADPOOL* ThreadPool ) { - const auto* Dispatch = GetMlasPlatform().SQNBitGemmDispatch; + const auto* Dispatch = GetMlasPlatform().QNBitGemmDispatch; if (Dispatch == nullptr) { return; } if (BlkBitWidth == 4) { - if (ComputeType == CompInt8 && Dispatch->SQ4BitGemmPackQuantBDataAndBlkSum != nullptr) { + if (ComputeType == SQNBIT_CompInt8 && Dispatch->SQ4BitGemmPackQuantBDataAndBlkSum != nullptr) { const size_t BlockCountK = MlasDivRoundup(K, BlkLen); - PackedQuantBDataStruct packed_quant_b(PackedQuantBDataAndOrBlkSumWorkspace, N, BlockCountK, BlkLen); + PackedQuantBDataStruct packed_quant_b(PackedQuantBDataAndOrBlkSumWorkspace, N, BlockCountK, BlkLen); Dispatch->SQ4BitGemmPackQuantBDataAndBlkSum( N, K, @@ -295,22 +300,11 @@ AddBiasForGemm(const float* Bias, float* C, size_t CountM, size_t CountN, size_t } } -typedef void(SQNBitGemmFn)( - size_t BlkLen, - size_t K, - const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, - void* PerGemmWorkspace, - size_t RangeStartM, - size_t RangeCountM, - size_t RangeStartN, - size_t RangeCountN -); - void SQ4BitGemm_CompFp32( const size_t BlkLen, const size_t K, - const MLAS_SQNBIT_GEMM_DATA_PARAMS* const DataParams, + const MLAS_QNBIT_GEMM_DATA_PARAMS* const DataParams, void* const PerGemmWorkspace, const size_t RangeStartM, const size_t RangeCountM, @@ -355,7 +349,7 @@ SQ4BitGemm_CompFp32( float* c_blk = C + n; const float* bias = (Bias == nullptr) ? nullptr : Bias + n; - GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmM1Kernel_CompFp32( + GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmM1Kernel_CompFp32( BlkLen, a_row, b_col, b_col_scale, b_col_zp, c_blk, CountN, K, k_blks, bias ); @@ -393,7 +387,7 @@ SQ4BitGemm_CompFp32( float* c_blk = C + n; const float* bias = (Bias == nullptr) ? nullptr : Bias + n; - GetMlasPlatform().SQNBitGemmDispatch->Q4BitBlkDequantBForSgemm_CompFp32( + GetMlasPlatform().QNBitGemmDispatch->Q4BitBlkDequantBForSgemm_CompFp32( BlkLen, dequant_b, b_col, b_col_scale, b_col_zp, CountN, K, k_blks ); @@ -429,7 +423,7 @@ void SQ4BitGemm_CompInt8( const size_t BlkLen, const size_t K, - const MLAS_SQNBIT_GEMM_DATA_PARAMS* const DataParams, + const MLAS_QNBIT_GEMM_DATA_PARAMS* const DataParams, void* const PerGemmWorkspace, const size_t RangeStartM, const size_t RangeCountM, @@ -500,10 +494,10 @@ SQ4BitGemm_CompInt8( float* c_blk = C + n; const float* bias = (Bias == nullptr) ? nullptr : Bias + n; - if (GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmKernel_CompInt8 != nullptr) { + if (GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmKernel_CompInt8 != nullptr) { size_t RowsRemaining = RangeCountM; while (RowsRemaining > 0) { - const auto RowsHandled = GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmKernel_CompInt8( + const auto RowsHandled = GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmKernel_CompInt8( BlkLen, a_row, b_col, b_col_scale, b_col_zp, c_blk, RowsRemaining, CountN, K, k_blks, ldc, bias ); @@ -522,10 +516,10 @@ SQ4BitGemm_CompInt8( } } #ifdef MLAS_TARGET_AMD64_IX86 - else if (GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmKernel_BlkSum_CompInt8 != nullptr) + else if (GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmKernel_BlkSum_CompInt8 != nullptr) { const float* b_blk_sum = QuantBBlkSum + n * k_blks; - GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmKernel_BlkSum_CompInt8( + GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmKernel_BlkSum_CompInt8( BlkLen, QuantA, QuantAScale, @@ -554,26 +548,29 @@ SQ4BitGemm_CompInt8( } } -typedef void(InitializeWorkspaceFn)( +template +void +InitializeWorkspace_CompInt8( size_t M, size_t N, size_t K, size_t BatchN, size_t BlkLen, - const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, + const MLAS_QNBIT_GEMM_DATA_PARAMS* DataParams, void* Workspace, size_t PerGemmWorkspaceStride, MLAS_THREADPOOL* ThreadPool ); +template <> void -InitializeWorkspace_CompInt8( +InitializeWorkspace_CompInt8( size_t M, size_t N, size_t K, size_t BatchN, size_t BlkLen, - const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, + const MLAS_QNBIT_GEMM_DATA_PARAMS* DataParams, void* Workspace, size_t PerGemmWorkspaceStride, MLAS_THREADPOOL* ThreadPool @@ -581,8 +578,8 @@ InitializeWorkspace_CompInt8( { MLAS_UNREFERENCED_PARAMETER(N); - const auto QuantizeARow = GetMlasPlatform().SQNBitGemmDispatch->QuantizeARow_CompInt8; - const auto QuantizeARow2 = GetMlasPlatform().SQNBitGemmDispatch->QuantizeARowComputeBlkSum_CompInt8; + const auto QuantizeARow = GetMlasPlatform().QNBitGemmDispatch->QuantizeARow_CompInt8; + const auto QuantizeARow2 = GetMlasPlatform().QNBitGemmDispatch->QuantizeARowComputeBlkSum_CompInt8; const size_t BlockCountK = MlasDivRoundup(K, BlkLen); const size_t QuantAStride = BlockCountK * Q8BlkSize(BlkLen); @@ -622,61 +619,153 @@ InitializeWorkspace_CompInt8( } } -struct Operations { - InitializeWorkspaceFn* InitializeWorkspace = nullptr; - SQNBitGemmFn* SQNBitGemm = nullptr; -}; +template <> +void +InitializeWorkspace_CompInt8( + size_t M, + size_t N, + size_t K, + size_t BatchN, + size_t BlkLen, + const MLAS_QNBIT_GEMM_DATA_PARAMS* DataParams, + void* Workspace, + size_t PerGemmWorkspaceStride, + MLAS_THREADPOOL* ThreadPool +) { + MLAS_UNREFERENCED_PARAMETER(M); + MLAS_UNREFERENCED_PARAMETER(N); + MLAS_UNREFERENCED_PARAMETER(K); + MLAS_UNREFERENCED_PARAMETER(BatchN); + MLAS_UNREFERENCED_PARAMETER(BlkLen); + MLAS_UNREFERENCED_PARAMETER(DataParams); + MLAS_UNREFERENCED_PARAMETER(Workspace); + MLAS_UNREFERENCED_PARAMETER(PerGemmWorkspaceStride); + MLAS_UNREFERENCED_PARAMETER(ThreadPool); +} + +template +using InitializeWorkspaceFn = std::function* DataParams, + void* Workspace, + size_t PerGemmWorkspaceStride, + MLAS_THREADPOOL* ThreadPool +)>; + +template +InitializeWorkspaceFn +GetInitializeWorkspace(QNBitGemmVariant variant); + +template <> +InitializeWorkspaceFn +GetInitializeWorkspace(QNBitGemmVariant variant) +{ + switch (variant) { + case SQNBitGemmVariant_BitWidth4_CompInt8: + return InitializeWorkspace_CompInt8; + default: + return nullptr; + } +} + +template <> +InitializeWorkspaceFn +GetInitializeWorkspace(QNBitGemmVariant variant) +{ + switch (variant) { + case HQNBitGemmVariant_BitWidth4_CompInt8: + return InitializeWorkspace_CompInt8; + default: + return nullptr; + } +} -constexpr auto OperationMap = []() { - std::array ops; +template +using QNBitGemmFn = std::function* const DataParams, + void* const PerGemmWorkspace, + const size_t RangeStartM, + const size_t RangeCountM, + const size_t RangeStartN, + const size_t RangeCountN +)>; - ops[SQNBitGemmVariant_BitWidth4_CompFp32].SQNBitGemm = SQ4BitGemm_CompFp32; +template +QNBitGemmFn +GetQNBitGemm(QNBitGemmVariant variant); - ops[SQNBitGemmVariant_BitWidth4_CompInt8].InitializeWorkspace = InitializeWorkspace_CompInt8; - ops[SQNBitGemmVariant_BitWidth4_CompInt8].SQNBitGemm = SQ4BitGemm_CompInt8; +template <> +QNBitGemmFn +GetQNBitGemm(QNBitGemmVariant variant) +{ + switch (variant) { + case SQNBitGemmVariant_BitWidth4_CompFp32: + return SQ4BitGemm_CompFp32; + case SQNBitGemmVariant_BitWidth4_CompInt8: + return SQ4BitGemm_CompInt8; + default: + return nullptr; + } +} - return ops; -}(); +template <> +QNBitGemmFn +GetQNBitGemm(QNBitGemmVariant variant) +{ + switch (variant) { + case HQNBitGemmVariant_BitWidth4_CompFp16: + return nullptr; + default: + return nullptr; + } +} } // namespace +template void MLASCALL -MlasSQNBitGemmBatch( +MlasQNBitGemmBatch( const size_t M, const size_t N, const size_t K, const size_t BatchN, const size_t BlkBitWidth, const size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, - const MLAS_SQNBIT_GEMM_DATA_PARAMS* DataParams, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, + const MLAS_QNBIT_GEMM_DATA_PARAMS* DataParams, void* Workspace, MLAS_THREADPOOL* ThreadPool ) { - const auto Variant = GetSQNBitGemmVariant(BlkBitWidth, BlkLen, ComputeType); + const auto Variant = GetQNBitGemmVariant(BlkBitWidth, BlkLen, ComputeType); assert(Variant != SQNBitGemmVariantInvalid); // // Ensure `Workspace` has correct alignment. // if (Workspace != nullptr) { - const size_t Alignment = SQNBitGemmPerGemmWorkspaceAlignment(BlkBitWidth, BlkLen, ComputeType); + const size_t Alignment = QNBitGemmPerGemmWorkspaceAlignment(BlkBitWidth, BlkLen, ComputeType); const uintptr_t WorkspaceAddress = reinterpret_cast(Workspace); Workspace = reinterpret_cast( (WorkspaceAddress + Alignment - 1) & (~(Alignment - 1)) ); } - const size_t PerGemmWorkspaceStride = SQNBitGemmPerGemmWorkspaceStride(M, N, K, BlkBitWidth, BlkLen, ComputeType); + const size_t PerGemmWorkspaceStride = QNBitGemmPerGemmWorkspaceStride(M, N, K, BlkBitWidth, BlkLen, ComputeType); - if (const auto InitializeWorkspaceOperation = OperationMap[Variant].InitializeWorkspace; + if (const auto InitializeWorkspaceOperation = GetInitializeWorkspace(Variant); InitializeWorkspaceOperation != nullptr) { InitializeWorkspaceOperation( M, N, K, BatchN, BlkLen, DataParams, Workspace, PerGemmWorkspaceStride, ThreadPool ); } - const auto ComputeOperation = OperationMap[Variant].SQNBitGemm; + const auto ComputeOperation = GetQNBitGemm(Variant); const size_t BlockCountK = MlasDivRoundup(K, BlkLen); @@ -685,11 +774,11 @@ MlasSQNBitGemmBatch( const auto* Data = &DataParams[gemm_i]; void* PerGemmWorkspace = reinterpret_cast(Workspace) + gemm_i * PerGemmWorkspaceStride; - if (ComputeType == CompInt8 && GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmPackQuantBDataAndBlkSum != nullptr) { - PackedQuantBDataStruct packed_quant_b(const_cast(Data->QuantBDataWorkspace), N, BlockCountK, BlkLen); - const_cast(Data)->PackedQuantBData = packed_quant_b.PackedQuantBData; - const_cast(Data)->QuantBBlkSum = packed_quant_b.QuantBBlkSum; - const_cast(Data)->QuantBScale = packed_quant_b.PackedQuantBScale; + if (ComputeType == SQNBIT_CompInt8 && GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmPackQuantBDataAndBlkSum != nullptr) { + PackedQuantBDataStruct packed_quant_b(const_cast(Data->QuantBDataWorkspace), N, BlockCountK, BlkLen); + const_cast*>(Data)->PackedQuantBData = packed_quant_b.PackedQuantBData; + const_cast*>(Data)->QuantBBlkSum = packed_quant_b.QuantBBlkSum; + const_cast*>(Data)->QuantBScale = packed_quant_b.PackedQuantBScale; PerGemmQuantAWorkspace per_gemm_quant_a_workspace(PerGemmWorkspace, M, BlockCountK, BlkLen); ComputeOperation(BlkLen, K, Data, &per_gemm_quant_a_workspace, 0, M, 0, N); } else { @@ -756,11 +845,11 @@ MlasSQNBitGemmBatch( void* PerGemmWorkspace = reinterpret_cast(Workspace) + gemm_i * PerGemmWorkspaceStride; - if (ComputeType == CompInt8 && GetMlasPlatform().SQNBitGemmDispatch->SQ4BitGemmPackQuantBDataAndBlkSum != nullptr) { - PackedQuantBDataStruct packed_quant_b(const_cast(Data->QuantBDataWorkspace), N, BlockCountK, BlkLen); - const_cast(Data)->PackedQuantBData = packed_quant_b.PackedQuantBData; - const_cast(Data)->QuantBBlkSum = packed_quant_b.QuantBBlkSum; - const_cast(Data)->QuantBScale = packed_quant_b.PackedQuantBScale; + if (ComputeType == SQNBIT_CompInt8 && GetMlasPlatform().QNBitGemmDispatch->SQ4BitGemmPackQuantBDataAndBlkSum != nullptr) { + PackedQuantBDataStruct packed_quant_b(const_cast(Data->QuantBDataWorkspace), N, BlockCountK, BlkLen); + const_cast*>(Data)->PackedQuantBData = packed_quant_b.PackedQuantBData; + const_cast*>(Data)->QuantBBlkSum = packed_quant_b.QuantBBlkSum; + const_cast*>(Data)->QuantBScale = packed_quant_b.PackedQuantBScale; PerGemmQuantAWorkspace per_gemm_quant_a_workspace(PerGemmWorkspace, M, BlockCountK, BlkLen); ComputeOperation(BlkLen, K, Data, &per_gemm_quant_a_workspace, RangeStartM, RangeCountM, RangeStartN, RangeCountN); @@ -769,3 +858,33 @@ MlasSQNBitGemmBatch( } }); } + +template +void MLASCALL +MlasQNBitGemmBatch( + const size_t M, + const size_t N, + const size_t K, + const size_t BatchN, + const size_t BlkBitWidth, + const size_t BlkLen, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, + const MLAS_QNBIT_GEMM_DATA_PARAMS* DataParams, + void* Workspace, + MLAS_THREADPOOL* ThreadPool +); + +template +void MLASCALL +MlasQNBitGemmBatch( + const size_t M, + const size_t N, + const size_t K, + const size_t BatchN, + const size_t BlkBitWidth, + const size_t BlkLen, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, + const MLAS_QNBIT_GEMM_DATA_PARAMS* DataParams, + void* Workspace, + MLAS_THREADPOOL* ThreadPool +); diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm.h b/onnxruntime/core/mlas/lib/qnbitgemm.h similarity index 91% rename from onnxruntime/core/mlas/lib/sqnbitgemm.h rename to onnxruntime/core/mlas/lib/qnbitgemm.h index 2da336ca2f0ec..28e17f14b02c9 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm.h +++ b/onnxruntime/core/mlas/lib/qnbitgemm.h @@ -6,7 +6,7 @@ Licensed under the MIT License. Module Name: - sqnbitgemm.h + qnbitgemm.h Abstract: @@ -46,24 +46,25 @@ MlasAlignAddress(void* addr, const size_t alignment) return addr; } +template struct PackedQuantBDataStruct { PackedQuantBDataStruct(void* PackedQuantBWorkspace, size_t N, size_t BlockCountK, size_t BlkLen) : QuantBWorkspace_(PackedQuantBWorkspace), N_(N), BlockCountK_(BlockCountK), BlkLen_(BlkLen) { - // TODO: duplicate code from SQ4BitGemmPackQuantBDataSize + // TODO: duplicate code from Q4BitGemmPackQuantBDataSize constexpr size_t BlkBitWidth = 4; const size_t PackedQuantBDataSize = N * BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); - size_t BlkSumSize = MlasDivRoundup(N, 16) * BlockCountK * 16 * sizeof(float); + size_t BlkSumSize = MlasDivRoundup(N, 16) * BlockCountK * 16 * sizeof(T); // _mm256_load_si256 requires alignment on a 32-byte boundary PackedQuantBData = (std::byte*)MlasAlignAddress(PackedQuantBWorkspace, 32); - QuantBBlkSum = (float*)(PackedQuantBData + PackedQuantBDataSize); - QuantBBlkSum = (float*)MlasAlignAddress(QuantBBlkSum, MlasQNBitQuantBBlkSumAlignment()); - PackedQuantBScale = (float*)((std::byte*)QuantBBlkSum + BlkSumSize); + QuantBBlkSum = (T*)(PackedQuantBData + PackedQuantBDataSize); + QuantBBlkSum = (T*)MlasAlignAddress(QuantBBlkSum, MlasQNBitQuantBBlkSumAlignment()); + PackedQuantBScale = (T*)((std::byte*)QuantBBlkSum + BlkSumSize); } std::byte* PackedQuantBData; - float* PackedQuantBScale; - float* QuantBBlkSum; + T* PackedQuantBScale; + T* QuantBBlkSum; void* QuantBWorkspace_; size_t N_, BlockCountK_, BlkLen_; @@ -84,27 +85,27 @@ MlasQNBitZeroPointsForBlksSizeInBytes(size_t BlkCount) // Kernel dispatch structure. // -struct MLAS_SQNBIT_GEMM_DISPATCH { +struct MLAS_QNBIT_GEMM_DISPATCH { // // Quantized B data packing function prototypes. // - /** Gets size of packed quantized B data containing 4-bit integers. See MlasSQNBitGemmPackQuantBDataSize(). */ + /** Gets size of packed quantized B data containing 4-bit integers. See MlasQNBitGemmPackQuantBDataSize(). */ typedef size_t(SQ4BitGemmPackQuantBDataSize_Fn)( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); - SQ4BitGemmPackQuantBDataSize_Fn* SQ4BitGemmPackQuantBDataSize = nullptr; + SQ4BitGemmPackQuantBDataSize_Fn* Q4BitGemmPackQuantBDataSize = nullptr; - /** Packs quantized B data containing 4-bit integers. See MlasSQNBitGemmPackQuantBData(). */ + /** Packs quantized B data containing 4-bit integers. See MlasQNBitGemmPackQuantBData(). */ typedef void(SQ4BitGemmPackQuantBData_Fn)( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, std::byte* PackedQuantBDataBegin, MLAS_THREADPOOL* ThreadPool @@ -116,12 +117,12 @@ struct MLAS_SQNBIT_GEMM_DISPATCH { size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, const float* QuantBScaleBegin, bool has_zp_input, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& packed_quant_b, MLAS_THREADPOOL* ThreadPool ); @@ -146,10 +147,10 @@ struct MLAS_SQNBIT_GEMM_DISPATCH { size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); - SQ4BitGemmPerGemmWorkspaceSize_Fn* SQ4BitGemmPerGemmWorkspaceSize = nullptr; + SQ4BitGemmPerGemmWorkspaceSize_Fn* Q4BitGemmPerGemmWorkspaceSize = nullptr; /** * @brief Gets the required byte alignment of the per-GEMM intermediate workspace. @@ -159,13 +160,13 @@ struct MLAS_SQNBIT_GEMM_DISPATCH { */ typedef size_t(SQ4BitGemmPerGemmWorkspaceAlignment_Fn)( size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); - SQ4BitGemmPerGemmWorkspaceAlignment_Fn* SQ4BitGemmPerGemmWorkspaceAlignment = nullptr; + SQ4BitGemmPerGemmWorkspaceAlignment_Fn* Q4BitGemmPerGemmWorkspaceAlignment = nullptr; // - // CompFp32 kernel function prototypes. + // SQNBIT_CompFp32 kernel function prototypes. // /** @@ -231,7 +232,7 @@ struct MLAS_SQNBIT_GEMM_DISPATCH { Q4BitBlkDequantBForSgemm_CompFp32_Fn* Q4BitBlkDequantBForSgemm_CompFp32 = nullptr; // - // CompInt8 kernel function prototypes. + // SQNBIT_CompInt8 kernel function prototypes. // /** diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp index baaa4ba1a3b1f..01443e2ff077f 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp @@ -19,7 +19,7 @@ Module Name: #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" #include "sqnbitgemm_kernel_avx_common_int8.h" #include "sqnbitgemm_kernel_avx2_int8_blklen16.h" @@ -1306,12 +1306,12 @@ SQ4BitGemmPackQuantBDataAndBlkSum( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, const float* QuantBScaleBegin, bool has_zp_input, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& packed_quant_b, MLAS_THREADPOOL* ThreadPool ) { @@ -1319,9 +1319,9 @@ SQ4BitGemmPackQuantBDataAndBlkSum( const size_t BlockCountK = MlasDivRoundup(K, BlkLen); - // TODO: always use SubBlkLen = 64 in CompInt8 + // TODO: always use SubBlkLen = 64 in SQNBIT_CompInt8 size_t SubBlkLen = (BlkLen == 16) ? 16 : (BlkLen == 32 ? 32 : 64); - if (BlkLen == 32 && ComputeType == CompInt8) { + if (BlkLen == 32 && ComputeType == SQNBIT_CompInt8) { SubBlkLen = 64; } PackQuantBDataAndBlkSum(N, BlockCountK, BlkLen, SubBlkLen, QuantBDataBegin, QuantBScaleBegin, has_zp_input, QuantBZPBegin, packed_quant_b, ThreadPool); @@ -1330,15 +1330,15 @@ SQ4BitGemmPackQuantBDataAndBlkSum( // // Kernel dispatch structure definition. // -const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2 = []() { - MLAS_SQNBIT_GEMM_DISPATCH d; +const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2 = []() { + MLAS_QNBIT_GEMM_DISPATCH d; - d.SQ4BitGemmPackQuantBDataSize = SQ4BitGemmPackQuantBDataSize; + d.Q4BitGemmPackQuantBDataSize = Q4BitGemmPackQuantBDataSize; d.SQ4BitGemmPackQuantBData = SQ4BitGemmPackQuantBData; d.SQ4BitGemmPackQuantBDataAndBlkSum = SQ4BitGemmPackQuantBDataAndBlkSum; - d.SQ4BitGemmPerGemmWorkspaceSize = SQ4BitGemmPerGemmWorkspaceSize; - d.SQ4BitGemmPerGemmWorkspaceAlignment = SQ4BitGemmPerGemmWorkspaceAlignment; + d.Q4BitGemmPerGemmWorkspaceSize = Q4BitGemmPerGemmWorkspaceSize; + d.Q4BitGemmPerGemmWorkspaceAlignment = Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = SQ4BitGemmM1Kernel_CompFp32_avx2; d.Q4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; @@ -1349,15 +1349,15 @@ const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2 = []() { return d; }(); -const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2vnni = []() { - MLAS_SQNBIT_GEMM_DISPATCH d; +const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2vnni = []() { + MLAS_QNBIT_GEMM_DISPATCH d; - d.SQ4BitGemmPackQuantBDataSize = SQ4BitGemmPackQuantBDataSize; + d.Q4BitGemmPackQuantBDataSize = Q4BitGemmPackQuantBDataSize; d.SQ4BitGemmPackQuantBData = SQ4BitGemmPackQuantBData; d.SQ4BitGemmPackQuantBDataAndBlkSum = SQ4BitGemmPackQuantBDataAndBlkSum; - d.SQ4BitGemmPerGemmWorkspaceSize = SQ4BitGemmPerGemmWorkspaceSize; - d.SQ4BitGemmPerGemmWorkspaceAlignment = SQ4BitGemmPerGemmWorkspaceAlignment; + d.Q4BitGemmPerGemmWorkspaceSize = Q4BitGemmPerGemmWorkspaceSize; + d.Q4BitGemmPerGemmWorkspaceAlignment = Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = SQ4BitGemmM1Kernel_CompFp32_avx2; d.Q4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen16.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen16.h index 80d67806ea6e8..445ead329acf8 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen16.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen16.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen32.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen32.h index af6f52090adcb..5dab8091ce760 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen32.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen32.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen64.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen64.h index 4cb9dddd4ecf9..d4b89bd9bad2d 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen64.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2_int8_blklen64.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" template diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp index 13bd369a065bb..425dfbe87c982 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp @@ -19,7 +19,7 @@ Module Name: #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" #include "sqnbitgemm_kernel_avx_common_int8.h" #include "sqnbitgemm_kernel_avx512_int8_blklen16.h" @@ -28,7 +28,7 @@ Module Name: #include "sqnbitgemm_kernel_avx512_int8_blklen128.h" // -// CompFp32 kernel implementation. +// SQNBIT_CompFp32 kernel implementation. // #include "sqnbitgemm_kernel_avx_common_fp32.h" @@ -151,7 +151,7 @@ SQ4BitGemmM1Kernel_CompFp32_avx512( } // -// CompInt8 kernel implementation. +// SQNBIT_CompInt8 kernel implementation. // MLAS_FORCEINLINE @@ -332,12 +332,12 @@ SQ4BitGemmPackQuantBDataAndBlkSum512( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, const float* QuantBScaleBegin, bool has_zp_input, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& packed_quant_b, MLAS_THREADPOOL* ThreadPool ) { @@ -346,21 +346,21 @@ SQ4BitGemmPackQuantBDataAndBlkSum512( const size_t BlockCountK = MlasDivRoundup(K, BlkLen); size_t SubBlkLen = (BlkLen == 16) ? 16 : (BlkLen == 32 ? 32 : 64); - if (ComputeType == CompInt8) { + if (ComputeType == SQNBIT_CompInt8) { SubBlkLen = 128; } PackQuantBDataAndBlkSum(N, BlockCountK, BlkLen, SubBlkLen, QuantBDataBegin, QuantBScaleBegin, has_zp_input, QuantBZPBegin, packed_quant_b, ThreadPool); } -const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512 = []() { - MLAS_SQNBIT_GEMM_DISPATCH d; +const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512 = []() { + MLAS_QNBIT_GEMM_DISPATCH d; - d.SQ4BitGemmPackQuantBDataSize = SQ4BitGemmPackQuantBDataSize; + d.Q4BitGemmPackQuantBDataSize = Q4BitGemmPackQuantBDataSize; d.SQ4BitGemmPackQuantBData = SQ4BitGemmPackQuantBData; d.SQ4BitGemmPackQuantBDataAndBlkSum = SQ4BitGemmPackQuantBDataAndBlkSum512; - d.SQ4BitGemmPerGemmWorkspaceSize = SQ4BitGemmPerGemmWorkspaceSize; - d.SQ4BitGemmPerGemmWorkspaceAlignment = SQ4BitGemmPerGemmWorkspaceAlignment; + d.Q4BitGemmPerGemmWorkspaceSize = Q4BitGemmPerGemmWorkspaceSize; + d.Q4BitGemmPerGemmWorkspaceAlignment = Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = SQ4BitGemmM1Kernel_CompFp32_avx512; d.Q4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8.h index 7d9dc36854621..8f1ea6676b788 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" @@ -81,7 +81,7 @@ accumulate_blklen32_r2c1blk2_avx2( _mm256_sign_epi8(bv1_32_epi8, bv1_32_epi8), _mm256_sign_epi8(av01_32_epi8, bv1_32_epi8) ); const __m256i sum_16_epi16 = _mm256_hadd_epi16(dot0_16_epi16, dot1_16_epi16); - + __m256i one_16_epi16 = _mm256_srli_epi16(_mm256_cmpeq_epi16(bv0_32_epi8, bv0_32_epi8), 15); const __m256i sum_8_epi32 = _mm256_madd_epi16(one_16_epi16, sum_16_epi16); const __m256 sum_ps = _mm256_cvtepi32_ps(sum_8_epi32); @@ -143,7 +143,7 @@ accumulate_blklen32_r2c1blk2_avx2( // const __m256i bv1 = _mm256_and_si256(_mm256_srli_epi16(bv_packed, 4), low_mask); // 16, 17,...30, 31, 48, 49,...,62, 63 __m256i bv1_32_epi8 = _mm256_srli_epi16(_mm256_sub_epi8(bv_packed, bv0_32_epi8), 4); // 16, 17,...30, 31, 48, 49,...,62, 63 - //__m256i bv0_32_epi8 = _mm256_set_m128i(_mm256_castsi256_si128(bv1), _mm256_castsi256_si128(bv0)); + //__m256i bv0_32_epi8 = _mm256_set_m128i(_mm256_castsi256_si128(bv1), _mm256_castsi256_si128(bv0)); //// This (the second line below) saves one _mm256_extracti128_si256 against using _mm256_set_m128i. ////__m256i bv1_32_epi8 = _mm256_set_m128i(_mm256_extracti128_si256(bv1, 1), _mm256_extracti128_si256(bv0, 1)); @@ -184,7 +184,7 @@ accumulate_blklen32_r2c1blk1_avx2( const __m128i bv_packed0 = _mm_loadu_si128(reinterpret_cast(QuantBDataPtr)); __m256i bv_32_epi8 = _mm256_set_m128i(_mm_srli_epi16(bv_packed0, 4), bv_packed0); bv_32_epi8 = _mm256_and_si256(_mm256_set1_epi8(0x0F), bv_32_epi8); - + const int8_t zp = get_zp(true, QuantBZeroPointPtr); const __m256i bzp = _mm256_set1_epi8(zp); bv_32_epi8 = _mm256_sub_epi8(bv_32_epi8, bzp); @@ -435,7 +435,7 @@ Q4Int8Gemm2x4BlkLen32Avx2( } } -template +template void MLAS_FORCEINLINE Q4Int8Gemm2xXBlkLen32Avx2( const std::byte* QuantA, const std::byte* QuantBData, @@ -877,7 +877,7 @@ MLAS_FORCEINLINE QuantBZeroPoint + multipleCols * StrideQuantBZeroPoint, C + multipleRows * ldc + multipleCols, remainingRows, - remainingCols, + remainingCols, BlockCountK, Bias ? Bias + multipleCols : nullptr, lda, diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen128.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen128.h index 60a887345d0e0..d79554c34c108 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen128.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen128.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" #include "sqnbitgemm_kernel_avx512_int8_blklen64.h" diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen16.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen16.h index bb14babd6c2b1..03064886caf24 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen16.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen16.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" #include "sqnbitgemm_kernel_avx2_int8_blklen16.h" #include "sqnbitgemm_kernel_avx512_int8_blklen32.h" diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen32.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen32.h index e9df6b952bd27..3b1096ac05ba7 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen32.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen32.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" #include "sqnbitgemm_kernel_avx2_int8_blklen32.h" #include "sqnbitgemm_kernel_avx512_int8_blklen64.h" diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen64.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen64.h index 2a65ac4af0c1d..72ce28d834199 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen64.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512_int8_blklen64.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" static MLAS_FORCEINLINE __m256 diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp index 6a5c01162c51b..777d4609ef5d4 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp @@ -19,7 +19,7 @@ Module Name: #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" #include "sqnbitgemm_kernel_avx_common_fp32.h" #include "sqnbitgemm_kernel_avx_common_int8.h" @@ -314,12 +314,12 @@ SQ4BitGemmPackQuantBDataAndBlkSum512vnni( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, const float* QuantBScaleBegin, bool has_zp_input, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& packed_quant_b, MLAS_THREADPOOL* ThreadPool ) { @@ -328,7 +328,7 @@ SQ4BitGemmPackQuantBDataAndBlkSum512vnni( const size_t BlockCountK = MlasDivRoundup(K, BlkLen); size_t SubBlkLen = (BlkLen == 16) ? 16 : (BlkLen == 32 ? 32 : 64); - if (ComputeType == CompInt8) { + if (ComputeType == SQNBIT_CompInt8) { SubBlkLen = 128; } PackQuantBDataAndBlkSum(N, BlockCountK, BlkLen, SubBlkLen, QuantBDataBegin, QuantBScaleBegin, has_zp_input, QuantBZPBegin, packed_quant_b, ThreadPool); @@ -337,15 +337,15 @@ SQ4BitGemmPackQuantBDataAndBlkSum512vnni( // // Kernel dispatch structure definition. // -const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni = []() { - MLAS_SQNBIT_GEMM_DISPATCH d; +const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni = []() { + MLAS_QNBIT_GEMM_DISPATCH d; - d.SQ4BitGemmPackQuantBDataSize = SQ4BitGemmPackQuantBDataSize; + d.Q4BitGemmPackQuantBDataSize = Q4BitGemmPackQuantBDataSize; d.SQ4BitGemmPackQuantBData = SQ4BitGemmPackQuantBData; d.SQ4BitGemmPackQuantBDataAndBlkSum = SQ4BitGemmPackQuantBDataAndBlkSum512vnni; - d.SQ4BitGemmPerGemmWorkspaceSize = SQ4BitGemmPerGemmWorkspaceSize; - d.SQ4BitGemmPerGemmWorkspaceAlignment = SQ4BitGemmPerGemmWorkspaceAlignment; + d.Q4BitGemmPerGemmWorkspaceSize = Q4BitGemmPerGemmWorkspaceSize; + d.Q4BitGemmPerGemmWorkspaceAlignment = Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = SQ4BitGemmM1Kernel_CompFp32; d.Q4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common.h index 177f5518bb891..b0367b7fb9a15 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common.h @@ -1,5 +1,5 @@ #pragma once -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_q8_block.h" // @@ -7,16 +7,16 @@ // static size_t -SQ4BitGemmPackQuantBDataSize( +Q4BitGemmPackQuantBDataSize( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { constexpr size_t BlkBitWidth = 4; const size_t BlockCountK = MlasDivRoundup(K, BlkLen); - if (ComputeType == CompInt8) { + if (ComputeType == SQNBIT_CompInt8) { size_t PackedQuantBDataSize = N * BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); const size_t ScaleSize = N * BlockCountK * sizeof(float); size_t BlkSumSize = MlasDivRoundup(N, 16) * BlockCountK * 16 * sizeof(float); @@ -39,7 +39,7 @@ SQ4BitGemmPackQuantBData( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE /* ComputeType*/, + MLAS_QNBIT_GEMM_COMPUTE_TYPE /* ComputeType*/, const std::byte* QuantBDataBegin, std::byte* PackedQuantBDataBegin, MLAS_THREADPOOL* ThreadPool @@ -304,7 +304,7 @@ PackQuantBDataAndBlkSum( const float* QuantBScaleBegin, bool has_zp_input, const std::byte* QuantBZPBegin, - PackedQuantBDataStruct& packed_quant_b, + PackedQuantBDataStruct& packed_quant_b, MLAS_THREADPOOL* ThreadPool ) { @@ -326,18 +326,18 @@ PackQuantBDataAndBlkSum( // static size_t -SQ4BitGemmPerGemmWorkspaceSize( +Q4BitGemmPerGemmWorkspaceSize( size_t M, size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { MLAS_UNREFERENCED_PARAMETER(N); switch(ComputeType) { - case CompInt8: { + case SQNBIT_CompInt8: { // workspace buffer is used for block quantization of A to int8 const size_t BlockCountK = MlasDivRoundup(K, BlkLen); // QuantData + Scale + BlkSum @@ -351,15 +351,15 @@ SQ4BitGemmPerGemmWorkspaceSize( } static size_t -SQ4BitGemmPerGemmWorkspaceAlignment( +Q4BitGemmPerGemmWorkspaceAlignment( size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { MLAS_UNREFERENCED_PARAMETER(BlkLen); switch (ComputeType) { - case CompInt8: { + case SQNBIT_CompInt8: { return Q8BlkAlignment(); } default: { diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common_fp32.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common_fp32.h index 5cd380e591098..d15cfc782e125 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common_fp32.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common_fp32.h @@ -1,5 +1,5 @@ #pragma once -#include "sqnbitgemm.h" +#include "qnbitgemm.h" template MLAS_FORCEINLINE diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common_int8.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common_int8.h index 895ce6cd091c2..2e96082968866 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common_int8.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx_common_int8.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" #include "sqnbitgemm_q8_block.h" diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp index 3f32cc6c5312d..03c8ce264c846 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp @@ -19,7 +19,7 @@ Module Name: #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_neon.h" #include "sqnbitgemm_q8_block.h" @@ -34,11 +34,11 @@ namespace // size_t -SQ4BitGemmPackQuantBDataSize( +Q4BitGemmPackQuantBDataSize( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { MLAS_UNREFERENCED_PARAMETER(ComputeType); // same size regardless of ComputeType @@ -55,7 +55,7 @@ SQ4BitGemmPackQuantBData( size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, const std::byte* QuantBDataBegin, std::byte* PackedQuantBDataBegin, MLAS_THREADPOOL* ThreadPool @@ -69,7 +69,7 @@ SQ4BitGemmPackQuantBData( const size_t BlkDataSize = MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); const size_t Iterations = N * BlockCountK; // one iteration per block - const size_t SubBlkLen = (ComputeType == CompInt8) + const size_t SubBlkLen = (ComputeType == SQNBIT_CompInt8) ? ((BlkLen == 16) ? 16 : 32) : 16; @@ -126,18 +126,18 @@ SQ4BitGemmPackQuantBData( // size_t -SQ4BitGemmPerGemmWorkspaceSize( +Q4BitGemmPerGemmWorkspaceSize( size_t M, size_t N, size_t K, size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { MLAS_UNREFERENCED_PARAMETER(N); switch (ComputeType) { - case CompInt8: { + case SQNBIT_CompInt8: { // workspace buffer is used for block quantization of A to int8 const size_t BlockCountK = MlasDivRoundup(K, BlkLen); const size_t PerGemmWorkspaceSize = M * BlockCountK * Q8BlkSize(BlkLen); @@ -150,15 +150,15 @@ SQ4BitGemmPerGemmWorkspaceSize( } size_t -SQ4BitGemmPerGemmWorkspaceAlignment( +Q4BitGemmPerGemmWorkspaceAlignment( size_t BlkLen, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ) { MLAS_UNREFERENCED_PARAMETER(BlkLen); switch (ComputeType) { - case CompInt8: { + case SQNBIT_CompInt8: { return Q8BlkAlignment(); } default: { @@ -175,14 +175,14 @@ SQ4BitGemmPerGemmWorkspaceAlignment( // Kernel dispatch structure definition. // -const MLAS_SQNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchNeon = []() { - MLAS_SQNBIT_GEMM_DISPATCH d; +const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchNeon = []() { + MLAS_QNBIT_GEMM_DISPATCH d; - d.SQ4BitGemmPackQuantBDataSize = sqnbitgemm_neon::SQ4BitGemmPackQuantBDataSize; + d.Q4BitGemmPackQuantBDataSize = sqnbitgemm_neon::Q4BitGemmPackQuantBDataSize; d.SQ4BitGemmPackQuantBData = sqnbitgemm_neon::SQ4BitGemmPackQuantBData; - d.SQ4BitGemmPerGemmWorkspaceSize = sqnbitgemm_neon::SQ4BitGemmPerGemmWorkspaceSize; - d.SQ4BitGemmPerGemmWorkspaceAlignment = sqnbitgemm_neon::SQ4BitGemmPerGemmWorkspaceAlignment; + d.Q4BitGemmPerGemmWorkspaceSize = sqnbitgemm_neon::Q4BitGemmPerGemmWorkspaceSize; + d.Q4BitGemmPerGemmWorkspaceAlignment = sqnbitgemm_neon::Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = sqnbitgemm_neon::SQ4BitGemmM1Kernel_CompFp32; d.Q4BitBlkDequantBForSgemm_CompFp32 = sqnbitgemm_neon::Q4BitBlkDequantBForSgemm_CompFp32; diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.h b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.h index ef9345d7ac484..247d885615393 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.h @@ -30,13 +30,13 @@ namespace sqnbitgemm_neon // // Function declarations for SQNBitGemm ARM NEON kernel entry points. -// Refer to the prototypes in sqnbitgemm.h for documentation. +// Refer to the prototypes in qnbitgemm.h for documentation. // These are declared here so they can be used to initialize the -// MLAS_SQNBIT_GEMM_DISPATCH structure and also be implemented in separate +// MLAS_QNBIT_GEMM_DISPATCH structure and also be implemented in separate // files. // -// CompFp32 declarations +// SQNBIT_CompFp32 declarations void SQ4BitGemmM1Kernel_CompFp32( @@ -64,7 +64,7 @@ Q4BitBlkDequantBForSgemm_CompFp32( size_t BlockCountK ); -// CompInt8 declarations +// SQNBIT_CompInt8 declarations void QuantizeARow_CompInt8( diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp index 12ddc42506e98..7b9f05a9c385d 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp @@ -13,7 +13,7 @@ Module Name: This module implements the float/quantized n-bit integer matrix multiplication kernels for ARM NEON specific to input type T1 as float32 and - MLAS_SQNBIT_GEMM_COMPUTE_TYPE CompFp32. + MLAS_QNBIT_GEMM_COMPUTE_TYPE SQNBIT_CompFp32. --*/ @@ -21,7 +21,7 @@ Module Name: #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_neon.h" namespace sqnbitgemm_neon @@ -31,7 +31,7 @@ namespace { // -// CompFp32 kernel implementation. +// SQNBIT_CompFp32 kernel implementation. // MLAS_FORCEINLINE void diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp index 0d62ea37b7e26..f1acd99c7b693 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp @@ -13,7 +13,7 @@ Module Name: This module implements the float/quantized n-bit integer matrix multiplication kernels for ARM NEON specific to input type T1 as float32 and - MLAS_SQNBIT_GEMM_COMPUTE_TYPE CompInt8. + MLAS_QNBIT_GEMM_COMPUTE_TYPE SQNBIT_CompInt8. --*/ @@ -21,7 +21,7 @@ Module Name: #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_neon.h" #include "sqnbitgemm_q8_block.h" @@ -29,7 +29,7 @@ namespace sqnbitgemm_neon { // -// CompInt8 kernel implementation. +// SQNBIT_CompInt8 kernel implementation. // namespace diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_m1_sym_kernel_avx2_int8_blklen32.h b/onnxruntime/core/mlas/lib/sqnbitgemm_m1_sym_kernel_avx2_int8_blklen32.h index 45c3963365e6b..941b884d0b9d2 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_m1_sym_kernel_avx2_int8_blklen32.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_m1_sym_kernel_avx2_int8_blklen32.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" template diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_m1_sym_kernel_avx2_int8_blklen64.h b/onnxruntime/core/mlas/lib/sqnbitgemm_m1_sym_kernel_avx2_int8_blklen64.h index e9c3812bde899..ed78dfa67042d 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_m1_sym_kernel_avx2_int8_blklen64.h +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_m1_sym_kernel_avx2_int8_blklen64.h @@ -3,7 +3,7 @@ #include #include -#include "sqnbitgemm.h" +#include "qnbitgemm.h" #include "sqnbitgemm_kernel_avx_common.h" diff --git a/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch b/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch index 45de47f3e5128..f55b593fd8a8c 100644 --- a/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch +++ b/onnxruntime/core/providers/vsinpu/patches/mlas_crosscompiling.patch @@ -16,7 +16,7 @@ index e46105324a..414c46a1ce 100644 --- a/onnxruntime/core/mlas/inc/mlas.h +++ b/onnxruntime/core/mlas/inc/mlas.h @@ -82,6 +82,9 @@ Abstract: - + #if (!defined(_MSC_VER)) || (_MSC_VER >= 1930) #if defined(MLAS_TARGET_ARM64) || defined(MLAS_TARGET_ARM64EC) +#if !defined(USE_VSINPU) @@ -26,16 +26,16 @@ index e46105324a..414c46a1ce 100644 // Had to temporary disable fp16 under APPLE ARM64, as compiling // the source files require a hardware specific compilation flag. @@ -90,6 +93,7 @@ Abstract: - + #define MLAS_F16VEC_INTRINSICS_SUPPORTED - + +#endif // #endif // #endif // ARM64 #endif // Visual Studio 16 or earlier does not support fp16 intrinsic @@ -1635,6 +1639,7 @@ MlasHalfGemmConvertPackB( ); - + #if defined(__aarch64__) && defined(__linux__) +#if !defined(USE_VSINPU) /** @@ -46,7 +46,7 @@ index e46105324a..414c46a1ce 100644 MlasSBGemmConvertPackB(size_t N, size_t K, const float* B, size_t ldb, void* PackedB); #endif +#endif - + /** * @brief Indirect Depthwise convolution for fp16 diff --git a/onnxruntime/core/mlas/lib/mlasi.h b/onnxruntime/core/mlas/lib/mlasi.h @@ -55,7 +55,7 @@ index 4239e2ecae..3df7e5573d 100644 +++ b/onnxruntime/core/mlas/lib/mlasi.h @@ -361,6 +361,7 @@ size_t #else - + #if defined(__aarch64__) && defined(__linux__) +#if !defined(USE_VSINPU) typedef size_t(MLASCALL MLAS_SBGEMM_FLOAT_KERNEL)( @@ -66,7 +66,7 @@ index 4239e2ecae..3df7e5573d 100644 ); #endif +#endif - + typedef size_t @@ -763,8 +765,10 @@ extern "C" { @@ -82,13 +82,13 @@ index 4239e2ecae..3df7e5573d 100644 MLAS_GEMM_DOUBLE_KERNEL MlasDgemmKernelAdd; @@ -899,8 +903,10 @@ extern "C" { #define MLAS_QGEMM_THREAD_COMPLEXITY 65536 - + #if defined(__aarch64__) && defined(__linux__) +#if !defined(USE_VSINPU) #define MLAS_SBGEMM_THREAD_COMPLEXITY (size_t(64) * size_t(1024)) #endif +#endif - + // // Single-threaded single precision matrix/matrix multiply operation. @@ -2570,4 +2576,3 @@ MlasPackInt4Elements(uint8_t* Output, UnpackedType ValueLow, UnpackedType ValueH @@ -103,16 +103,16 @@ index ed437f20f7..8c9d0a75fd 100644 @@ -20,7 +20,7 @@ Abstract: #include #include - --#if defined(MLAS_TARGET_POWER) + +-#if defined(MLAS_TARGET_POWER) +#if defined(MLAS_TARGET_POWER) #if defined(__linux__) #include #elif defined(_AIX) @@ -536,7 +536,7 @@ Return Value: - this->SQNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; + this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; } - + -#if defined(__linux__) +#if defined(__linux__) && !defined(USE_VSINPU) // @@ -124,12 +124,12 @@ index de7fd72fad..4f75dbd6fa 100644 +++ b/onnxruntime/core/mlas/lib/sbgemm.h @@ -31,6 +31,7 @@ Abstract: --*/ - + #if defined(__aarch64__) && defined(__linux__) +#if !defined(USE_VSINPU) - + #pragma once - + @@ -396,4 +397,5 @@ MlasSBGemmBatch(const size_t M, const size_t N, const size_t K, const size_t Bat } ); @@ -141,7 +141,7 @@ index 6a71283f9d..d8bd348854 100644 --- a/onnxruntime/core/providers/cpu/math/matmul.cc +++ b/onnxruntime/core/providers/cpu/math/matmul.cc @@ -132,7 +132,7 @@ Status MatMul::Compute(OpKernelContext* ctx) const { - + return Status::OK(); } -#if defined(__aarch64__) && defined(__linux__) @@ -187,7 +187,7 @@ index b9bbe36583..2f570502d2 100644 +++ b/onnxruntime/core/providers/cpu/math/matmul.h @@ -31,8 +31,10 @@ class MatMul final : public OpKernel { trans_batch_b_ = trans_batch_b_attr != 0; - + #if defined(__aarch64__) && defined(__linux__) +#if !defined(USE_VSINPU) auto config_ops = info.GetConfigOptions().GetConfigEntry(kOrtSessionOptionsMlasGemmFastMathArm64Bfloat16); @@ -195,10 +195,10 @@ index b9bbe36583..2f570502d2 100644 +#endif #endif } - + @@ -57,12 +59,14 @@ class MatMul final : public OpKernel { bool trans_batch_b_; - + #if defined(__aarch64__) && defined(__linux__) +#if !defined(USE_VSINPU) // fastmath mode state @@ -209,7 +209,7 @@ index b9bbe36583..2f570502d2 100644 #endif +#endif }; - + } // namespace onnxruntime diff --git a/onnxruntime/test/mlas/unittest/test_sbgemm.cpp b/onnxruntime/test/mlas/unittest/test_sbgemm.cpp index f85fe97776..6039b7fa9e 100644 @@ -217,12 +217,12 @@ index f85fe97776..6039b7fa9e 100644 +++ b/onnxruntime/test/mlas/unittest/test_sbgemm.cpp @@ -16,6 +16,7 @@ Abstract: --*/ - + #if defined(__aarch64__) && defined(__linux__) +#if !defined(USE_VSINPU) - + #include "test_sbgemm.h" - + @@ -138,4 +139,5 @@ static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_exe } return SBGemmRegistLongExecute() > 0; @@ -235,15 +235,15 @@ index 13701e2e3d..7e432f53c2 100644 +++ b/onnxruntime/test/mlas/unittest/test_sbgemm.h @@ -16,6 +16,7 @@ Abstract: --*/ - + #if defined(__aarch64__) && defined(__linux__) +#if !defined(USE_VSINPU) - + #pragma once - + @@ -278,4 +279,5 @@ class MlasSBGemmTest : public MlasTestBase { } }; - + +#endif #endif // defined(__aarch64__) && defined(__linux__) diff --git a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp b/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp index 71db7d81075b5..df543d8eca1fc 100644 --- a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp +++ b/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp @@ -22,9 +22,9 @@ void RunSQNBitGemmBenchmark(size_t BlkLen, size_t Threads, bool Symmetric, bool HasBias, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, benchmark::State& state) { - if (!MlasIsSQNBitGemmAvailable(BlkBitWidth, BlkLen, ComputeType)) { + if (!MlasIsQNBitGemmAvailable(BlkBitWidth, BlkLen, ComputeType)) { state.SkipWithMessage("SQNBitGemm is not available with the given configuration on the current machine."); return; } @@ -62,21 +62,21 @@ void RunSQNBitGemmBenchmark(size_t BlkLen, tp.get()); std::unique_ptr Workspace; - if (const auto WorkspaceSize = MlasSQNBitGemmBatchWorkspaceSize(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType); + if (const auto WorkspaceSize = MlasQNBitGemmBatchWorkspaceSize(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType); WorkspaceSize > 0) { Workspace = std::make_unique(WorkspaceSize); } std::unique_ptr PackedQuantBData; - if (const auto PackedQuantBDataSize = MlasSQNBitGemmPackQuantBDataSize(N, K, BlkBitWidth, BlkLen, ComputeType); + if (const auto PackedQuantBDataSize = MlasQNBitGemmPackQuantBDataSize(N, K, BlkBitWidth, BlkLen, ComputeType); PackedQuantBDataSize > 0) { PackedQuantBData = std::make_unique(PackedQuantBDataSize); - MlasSQNBitGemmPackQuantBData(N, K, BlkBitWidth, BlkLen, ComputeType, QuantBData.data(), PackedQuantBData.get(), - QuantBScale.data(), has_zp_input, QuantBZeroPoint.data(), - tp.get()); + MlasQNBitGemmPackQuantBData(N, K, BlkBitWidth, BlkLen, ComputeType, QuantBData.data(), PackedQuantBData.get(), + QuantBScale.data(), has_zp_input, QuantBZeroPoint.data(), + tp.get()); } - MLAS_SQNBIT_GEMM_DATA_PARAMS params{}; + MLAS_QNBIT_GEMM_DATA_PARAMS params{}; params.A = A.data(); params.lda = K; if (PackedQuantBData != nullptr) @@ -92,10 +92,10 @@ void RunSQNBitGemmBenchmark(size_t BlkLen, params.ldc = N; // warm up run - MlasSQNBitGemmBatch(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType, ¶ms, Workspace.get(), tp.get()); + MlasQNBitGemmBatch(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType, ¶ms, Workspace.get(), tp.get()); for (auto _ : state) { - MlasSQNBitGemmBatch(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType, ¶ms, Workspace.get(), tp.get()); + MlasQNBitGemmBatch(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType, ¶ms, Workspace.get(), tp.get()); } } @@ -110,7 +110,7 @@ void SQNBITGEMM(benchmark::State& state) { const auto Threads = narrow(state.range(4)); const auto Symmetric = narrow(state.range(5)); const bool HasBias = narrow(state.range(6)); - const auto ComputeType = static_cast(state.range(7)); + const auto ComputeType = static_cast(state.range(7)); RunSQNBitGemmBenchmark(BlkLen, M, N, K, Threads, Symmetric, HasBias, ComputeType, state); } @@ -119,14 +119,14 @@ static void SQNBitGemmArgs(benchmark::internal::Benchmark* b) { b->ArgNames({"BlkLen", "M", "N", "K", "Threads", "Symmetric", "HasBias", "ComputeType"}); b->ArgsProduct({ - {128}, // BlkLen - {1}, // M - {4096, 11008}, // N - {4096, 11008}, // K - {1, 8}, // Threads - {int64_t{false}, int64_t{true}}, // Symmetric - {int64_t{false}, int64_t{true}}, // HasBias - {int64_t{CompFp32}, int64_t{CompInt8}}, // ComputeType + {128}, // BlkLen + {1}, // M + {4096, 11008}, // N + {4096, 11008}, // K + {1, 8}, // Threads + {int64_t{false}, int64_t{true}}, // Symmetric + {int64_t{false}, int64_t{true}}, // HasBias + {int64_t{SQNBIT_CompFp32}, int64_t{SQNBIT_CompInt8}}, // ComputeType }); } @@ -145,10 +145,10 @@ void SQNBITGEMM_ENV(benchmark::State& state) { const auto Symmetric = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_SYMMETRIC", true); const auto HasBias = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_HAS_BIAS", false); const auto ComputeType = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_COMPUTE_TYPE", - static_cast(CompFp32)); + static_cast(SQNBIT_CompFp32)); RunSQNBitGemmBenchmark(BlkLen, M, N, K, Threads, Symmetric, HasBias, - static_cast(ComputeType), + static_cast(ComputeType), state); std::ostringstream s; diff --git a/onnxruntime/test/mlas/unittest/test_sqnbitgemm.cpp b/onnxruntime/test/mlas/unittest/test_sqnbitgemm.cpp index 0710981fa17c6..e22018ae2877f 100644 --- a/onnxruntime/test/mlas/unittest/test_sqnbitgemm.cpp +++ b/onnxruntime/test/mlas/unittest/test_sqnbitgemm.cpp @@ -18,11 +18,11 @@ Module Name: #include "mlas_q4.h" #include "mlas_qnbit.h" -static constexpr const char* ComputeTypeName(MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType) { +static constexpr const char* ComputeTypeName(MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType) { switch (ComputeType) { - case CompFp32: + case SQNBIT_CompFp32: return "Fp32"; - case CompInt8: + case SQNBIT_CompInt8: return "Int8"; default: return "unknown"; @@ -63,16 +63,16 @@ class MlasSQNBitGemmTest : public MlasTestBase { float* C, size_t ldc, void* Workspace, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, MLAS_THREADPOOL* Threadpool) { - MLAS_SQNBIT_GEMM_DATA_PARAMS params; + MLAS_QNBIT_GEMM_DATA_PARAMS params; params.A = A; params.lda = lda; params.Bias = Bias; params.C = C; params.ldc = ldc; #ifdef MLAS_TARGET_AMD64_IX86 - if (ComputeType == CompInt8) { + if (ComputeType == SQNBIT_CompInt8) { params.QuantBDataWorkspace = PackedQuantBDataWorkspace; } #endif @@ -81,7 +81,7 @@ class MlasSQNBitGemmTest : public MlasTestBase { params.QuantBZeroPoint = QuantBZeroPoint; params.PostProcessor = nullptr; - MlasSQNBitGemmBatch(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType, ¶ms, Workspace, Threadpool); + MlasQNBitGemmBatch(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType, ¶ms, Workspace, Threadpool); } void QuantizeA(size_t M, size_t K, const float* A, int8_t* QuantAData, float* QuantAScale) { @@ -201,7 +201,7 @@ class MlasSQNBitGemmTest : public MlasTestBase { public: void Test(size_t M, size_t N, size_t K, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, bool WithThreadpool, bool Symmetric, bool WithBias) { MLAS_THREADPOOL* Threadpool = WithThreadpool ? GetMlasThreadPool() : nullptr; @@ -265,19 +265,19 @@ class MlasSQNBitGemmTest : public MlasTestBase { } void* Workspace = nullptr; - if (const auto WorkspaceSize = MlasSQNBitGemmBatchWorkspaceSize(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType); + if (const auto WorkspaceSize = MlasQNBitGemmBatchWorkspaceSize(M, N, K, 1, BlkBitWidth, BlkLen, ComputeType); WorkspaceSize > 0) { Workspace = BufferWorkspace.GetBuffer(WorkspaceSize); } void* PackedQuantBDataWorkspace = nullptr; - if (const auto PackedQuantBDataSize = MlasSQNBitGemmPackQuantBDataSize(N, K, BlkBitWidth, BlkLen, ComputeType); + if (const auto PackedQuantBDataSize = MlasQNBitGemmPackQuantBDataSize(N, K, BlkBitWidth, BlkLen, ComputeType); PackedQuantBDataSize > 0) { PackedQuantBDataWorkspace = BufferPackedQuantBData.GetBuffer(PackedQuantBDataSize); bool has_zp_input = QuantBZeroPoint != nullptr; - MlasSQNBitGemmPackQuantBData(N, K, BlkBitWidth, BlkLen, ComputeType, QuantBData, PackedQuantBDataWorkspace, - QuantBScale, has_zp_input, QuantBZeroPoint, - GetMlasThreadPool()); + MlasQNBitGemmPackQuantBData(N, K, BlkBitWidth, BlkLen, ComputeType, QuantBData, PackedQuantBDataWorkspace, + QuantBScale, has_zp_input, QuantBZeroPoint, + GetMlasThreadPool()); } CallGemm(M, N, K, @@ -289,9 +289,9 @@ class MlasSQNBitGemmTest : public MlasTestBase { ComputeType, Threadpool); - if (ComputeType == CompFp32) { + if (ComputeType == SQNBIT_CompFp32) { CallReferenceGemm_CompFp32(M, N, K, A, QuantBData, QuantBScale, QuantBZeroPoint, Bias, CReference); - } else if (ComputeType == CompInt8) { + } else if (ComputeType == SQNBIT_CompInt8) { CallReferenceGemm_CompInt8(M, N, K, A, QuantBData, QuantBScale, QuantBZeroPoint, Bias, CReference); } else { FAIL() << "Test is not implemented for compute type " @@ -324,7 +324,7 @@ template class SQNBitGemmShortExecuteTest : public MlasTestFixture> { public: explicit SQNBitGemmShortExecuteTest(size_t M, size_t N, size_t K, - MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, bool WithThreadpool, bool Symmetric, bool WithBias) : M_(M), N_(N), @@ -341,11 +341,11 @@ class SQNBitGemmShortExecuteTest : public MlasTestFixture Date: Thu, 14 Nov 2024 11:28:00 -0800 Subject: [PATCH 02/28] Re-enable test symbolic shape infer (#22737) ### Description It seems after CI updated to py310, numpy got updated to 2.0 and sympy 1.2 failed to cast float numpy array. Pointing sympy to 1.13 when py>=3.9 and re-enable unit test ### Motivation and Context Error: Linux CPU CI --- ...untime_test_python_symbolic_shape_infer.py | 141 +++++++++--------- .../docker/scripts/manylinux/requirements.txt | 3 +- 2 files changed, 72 insertions(+), 72 deletions(-) diff --git a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py index 103b68a4f7dfb..2f8fb84c4c651 100644 --- a/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py +++ b/onnxruntime/test/python/onnxruntime_test_python_symbolic_shape_infer.py @@ -39,27 +39,27 @@ def unique_element(lst): class TestSymbolicShapeInference(unittest.TestCase): - # TODO: investigate why symbolic shape infer test failed for Python 3.10 - # def test_symbolic_shape_infer(self): - # from pathlib import Path - # cwd = os.getcwd() - # test_model_dir = os.path.join(cwd, "..", "models") - # for filename in Path(test_model_dir).rglob("*.onnx"): - # if filename.name.startswith("."): - # continue # skip some bad model files - # - # # https://github.com/onnx/models/issues/562 - # if any(model_name in str(filename) for model_name in skipped_models): - # print(f"Skip symbolic shape inference on : {filename!s}") - # continue - # - # print("Running symbolic shape inference on : " + str(filename)) - # SymbolicShapeInference.infer_shapes( - # in_mp=onnx.load(str(filename)), - # auto_merge=True, - # int_max=100000, - # guess_output_rank=True, - # ) + def test_symbolic_shape_infer(self): + from pathlib import Path + + cwd = os.getcwd() + test_model_dir = os.path.join(cwd, "..", "models") + for filename in Path(test_model_dir).rglob("*.onnx"): + if filename.name.startswith("."): + continue # skip some bad model files + + # https://github.com/onnx/models/issues/562 + if any(model_name in str(filename) for model_name in skipped_models): + print(f"Skip symbolic shape inference on : {filename!s}") + continue + + print("Running symbolic shape inference on : " + str(filename)) + SymbolicShapeInference.infer_shapes( + in_mp=onnx.load(str(filename)), + auto_merge=True, + int_max=100000, + guess_output_rank=True, + ) def test_mismatched_types(self): graph = helper.make_graph( @@ -343,56 +343,55 @@ def test_einsum_sum(self): def test_einsum_transpose(self): self._test_einsum_one_input_impl(["a", "b"], ["b", "a"], "ij -> ji") - # TODO: investigate why symbolic shape infer test failed for Python 3.10 - # def test_mul_precision(self): - # graph_input = onnx.helper.make_tensor_value_info("input", TensorProto.FLOAT, [1024]) - # graph_output = onnx.helper.make_tensor_value_info("output", TensorProto.FLOAT, None) - # - # # initializers - # value = numpy.array([0.5], dtype=numpy.float32) - # constant = numpy_helper.from_array(value, name="constant") - # - # nodes = [ - # # Get the shape of the input tensor: `input_tensor_shape = [1024]`. - # onnx.helper.make_node("Shape", ["input"], ["input_shape"]), - # # mul(1024, 0.5) => 512 - # onnx.helper.make_node("Mul", ["input_shape", "constant"], ["output_shape"]), - # # Resize input - # onnx.helper.make_node( - # "Resize", inputs=["input", "", "", "output_shape"], outputs=["output"], mode="nearest" - # ), - # ] - # - # graph_def = onnx.helper.make_graph(nodes, "TestMulPrecision", [graph_input], [graph_output], [constant]) - # model = SymbolicShapeInference.infer_shapes(onnx.helper.make_model(graph_def)) - # output_dims = unique_element(model.graph.output).type.tensor_type.shape.dim - # self.assertEqual(len(output_dims), 1) - # self.assertEqual(output_dims[0].dim_value, 512) - - # def test_div_precision(self): - # graph_input = onnx.helper.make_tensor_value_info("input", TensorProto.FLOAT, [768]) - # graph_output = onnx.helper.make_tensor_value_info("output", TensorProto.FLOAT, None) - # - # # initializers - # value = numpy.array([1.5], dtype=numpy.float32) - # constant = numpy_helper.from_array(value, name="constant") - # - # nodes = [ - # # Get the shape of the input tensor: `input_tensor_shape = [768]`. - # onnx.helper.make_node("Shape", ["input"], ["input_shape"]), - # # div(768, 1.5) => 512 - # onnx.helper.make_node("Div", ["input_shape", "constant"], ["output_shape"]), - # # Resize input - # onnx.helper.make_node( - # "Resize", inputs=["input", "", "", "output_shape"], outputs=["output"], mode="nearest" - # ), - # ] - # - # graph_def = onnx.helper.make_graph(nodes, "TestDivPrecision", [graph_input], [graph_output], [constant]) - # model = SymbolicShapeInference.infer_shapes(onnx.helper.make_model(graph_def)) - # output_dims = unique_element(model.graph.output).type.tensor_type.shape.dim - # self.assertEqual(len(output_dims), 1) - # self.assertEqual(output_dims[0].dim_value, 512) + def test_mul_precision(self): + graph_input = onnx.helper.make_tensor_value_info("input", TensorProto.FLOAT, [1024]) + graph_output = onnx.helper.make_tensor_value_info("output", TensorProto.FLOAT, None) + + # initializers + value = numpy.array([0.5], dtype=numpy.float32) + constant = numpy_helper.from_array(value, name="constant") + + nodes = [ + # Get the shape of the input tensor: `input_tensor_shape = [1024]`. + onnx.helper.make_node("Shape", ["input"], ["input_shape"]), + # mul(1024, 0.5) => 512 + onnx.helper.make_node("Mul", ["input_shape", "constant"], ["output_shape"]), + # Resize input + onnx.helper.make_node( + "Resize", inputs=["input", "", "", "output_shape"], outputs=["output"], mode="nearest" + ), + ] + + graph_def = onnx.helper.make_graph(nodes, "TestMulPrecision", [graph_input], [graph_output], [constant]) + model = SymbolicShapeInference.infer_shapes(onnx.helper.make_model(graph_def)) + output_dims = unique_element(model.graph.output).type.tensor_type.shape.dim + self.assertEqual(len(output_dims), 1) + self.assertEqual(output_dims[0].dim_value, 512) + + def test_div_precision(self): + graph_input = onnx.helper.make_tensor_value_info("input", TensorProto.FLOAT, [768]) + graph_output = onnx.helper.make_tensor_value_info("output", TensorProto.FLOAT, None) + + # initializers + value = numpy.array([1.5], dtype=numpy.float32) + constant = numpy_helper.from_array(value, name="constant") + + nodes = [ + # Get the shape of the input tensor: `input_tensor_shape = [768]`. + onnx.helper.make_node("Shape", ["input"], ["input_shape"]), + # div(768, 1.5) => 512 + onnx.helper.make_node("Div", ["input_shape", "constant"], ["output_shape"]), + # Resize input + onnx.helper.make_node( + "Resize", inputs=["input", "", "", "output_shape"], outputs=["output"], mode="nearest" + ), + ] + + graph_def = onnx.helper.make_graph(nodes, "TestDivPrecision", [graph_input], [graph_output], [constant]) + model = SymbolicShapeInference.infer_shapes(onnx.helper.make_model(graph_def)) + output_dims = unique_element(model.graph.output).type.tensor_type.shape.dim + self.assertEqual(len(output_dims), 1) + self.assertEqual(output_dims[0].dim_value, 512) def test_quantize_linear(self): """ diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt index 12db3bd132bb7..2d714e3058da4 100644 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt @@ -6,7 +6,8 @@ setuptools>=68.2.2 wheel onnx==1.16.1 protobuf==4.21.12 -sympy==1.12 +sympy==1.12 ; python_version < '3.9' +sympy==1.13 ; python_version >= '3.9' flatbuffers neural-compressor>=2.2.1 triton From 0733733307d26ca29f1fe04b3cfd9a525b0fee4c Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Thu, 14 Nov 2024 13:48:46 -0800 Subject: [PATCH 03/28] [Quant tool] Handle input models with pre-quantized weights (#22633) ### Description Allows the QDQ quantizer to handle input models that already have some pre-quantized weights. In this case, the qdq quantizer will properly skip/handle the pre-quantized weights. Also handles an operator (e.g., Conv) with a pre-quantized weight and a float bias. The tool will read the pre-quantized weight's quantization scale to compute the bias's scale (`bias_scale = input_scale * weight_scale`). Input model (pre-quantized Conv weight): ![image](https://github.com/user-attachments/assets/7d2626e4-49ad-47ae-bd0e-6339ac590435) Output QDQ model (everything is quantized): ![image](https://github.com/user-attachments/assets/393804d3-f042-47bd-895f-3d667fb2ae94) ### Motivation and Context Customers may use external tools to quantize some weights (e.g., int4 for Conv/MatMul). The qdq quantizer should still be able to quantize the rest of the model (float weights and activations) in this case. --- .../tools/quantization/base_quantizer.py | 5 + .../tools/quantization/qdq_quantizer.py | 71 ++++- .../test/python/quantization/test_qdq.py | 277 ++++++++++++++++++ 3 files changed, 342 insertions(+), 11 deletions(-) diff --git a/onnxruntime/python/tools/quantization/base_quantizer.py b/onnxruntime/python/tools/quantization/base_quantizer.py index f07fb30f10f82..6235db3234d49 100644 --- a/onnxruntime/python/tools/quantization/base_quantizer.py +++ b/onnxruntime/python/tools/quantization/base_quantizer.py @@ -19,7 +19,9 @@ from .calibrate import TensorData from .onnx_model import ONNXModel from .quant_utils import ( + DEQUANT_OP_NAME, ONNX_TYPE_TO_NP_TYPE, + QUANT_OP_NAME, TENSOR_NAME_QUANT_SUFFIX, find_by_name, model_has_infer_metadata, @@ -178,6 +180,9 @@ def should_quantize_node(self, node): if node.op_type not in self.op_types_to_quantize: return False + if node.op_type in (DEQUANT_OP_NAME, QUANT_OP_NAME): + return False + if self.nodes_to_exclude is not None and node.name in self.nodes_to_exclude: return False diff --git a/onnxruntime/python/tools/quantization/qdq_quantizer.py b/onnxruntime/python/tools/quantization/qdq_quantizer.py index 048c7f3296503..5552a4451c542 100644 --- a/onnxruntime/python/tools/quantization/qdq_quantizer.py +++ b/onnxruntime/python/tools/quantization/qdq_quantizer.py @@ -195,7 +195,11 @@ def __init__( # The default behavior is that multiple nodes can share a QDQ pair as their inputs. # In TRT, QDQ pair can`t be shared between nodes, so it will create dedicated QDQ pairs for each node. self.dedicated_qdq_pair = extra_options.get("DedicatedQDQPair", False) - self.tensor_to_its_receiving_nodes = {} + self.tensor_to_its_receiving_nodes: dict[str, list[onnx.NodeProto]] = {} + + # Maps a tensor to the DequantizeLinear node (in the original input model) that outputs the tensor. + # Populated for input models with some pre-quantized weights (typically via a different tool). + self.tensor_to_producing_dq: dict[str, onnx.NodeProto] = {} # Let user set channel axis for specific op type and it's effective only when per channel quantization is supported and per_channel is True. self.qdq_op_type_per_channel_support_to_axis = extra_options.get("QDQOpTypePerChannelSupportToAxis", {}) @@ -555,6 +559,9 @@ def quantize_model(self): if tensor_name not in self.tensor_to_its_receiving_nodes: self.tensor_to_its_receiving_nodes[tensor_name] = [] self.tensor_to_its_receiving_nodes[tensor_name].append(node) + if node.op_type == DEQUANT_OP_NAME: + for tensor_name in node.output: + self.tensor_to_producing_dq[tensor_name] = node self.initializer_quant_params = self._calc_initializer_quant_params() self._adjust_weight_quant_params_for_bias_tensors() @@ -958,6 +965,14 @@ def _quantize_normal_tensors(self): if initializer: self._add_qdq_nodes_for_initializer(initializer) else: + # Check if this tensor is already a dequantized value. If so, skip it. + # This happens if the original input model already has some pre-quantized weights + # generated by a different tool. + # Ex: (quantized_weight -> DequantizeLinear -> this_tensor) + if tensor_name in self.tensor_to_producing_dq: + del self.tensors_to_quantize[tensor_name] + continue + tensor_qparam_initializers = self._make_tensor_scale_zp_initializers(tensor_name) if not tensor_qparam_initializers: raise ValueError( @@ -1009,6 +1024,12 @@ def _quantize_sharing_param_tensors(self): if self.is_input_a_initializer(tensor_name): raise ValueError("Quantization parameter shared mode is not supported for weight yet") + if tensor_name in self.tensor_to_producing_dq: + raise ValueError( + f"Quantization parameter sharing is invalid for tensor {tensor_name} " + "because it has already been quantized" + ) + # Need to check if this tensor's quant_type is converted for some consumers. # If so, create new scale/zp initializers for these consumers. converted_qparam_inits = None @@ -1147,6 +1168,30 @@ def is_tensor_per_channel( return True, axis + def _get_tensor_quantization_scale(self, tensor_name: str, consumer_node_name: str) -> np.ndarray | None: + """ + Returns the quantization scale of a tensor that is consumed by the given node. + :parameter tensor_name: The name of the tensor. + :parameter consumer_node_name: The name of the node that consumes the tensor as input. Necessary in case + the quantization type of the tensor was converted. + Refer: QDQQuantizer::_add_qdq_ops_for_converted_activation. + :returns: The quantization scale or None. + """ + initializers = self.model.initializer() + scale_initializer: onnx.TensorProto | None = None + + if tensor_name in self.quantized_value_map: + # Tensor was quantized by this tool, so get scale from initializer created by this tool run. + scale_name = self.quantized_value_map[tensor_name].get_for_consumer(consumer_node_name).scale_name + scale_initializer = find_by_name(scale_name, initializers) + else: + # Tensor was already quantized in original model, so get scale from DQ node that outputs the tensor. + dq_node = self.tensor_to_producing_dq.get(tensor_name, None) + if dq_node: + scale_initializer = find_by_name(dq_node.input[1], initializers) + + return tensor_proto_to_array(scale_initializer) if scale_initializer is not None else None + def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> str: """ Quantized the bias. Zero Point == 0 and Scale == Input_Scale * Weight_Scale @@ -1156,17 +1201,21 @@ def quantize_bias_static(self, bias_name: str, bias_info: QDQBiasQuantInfo) -> s if bias_name in self.quantized_value_map: return self.quantized_value_map[bias_name].original.q_name - # get scale for weight - weight_scale_name = self.quantized_value_map[bias_info.weight_name].original.scale_name - weight_scale_initializer = find_by_name(weight_scale_name, self.model.initializer()) - weight_scale = tensor_proto_to_array(weight_scale_initializer) + # get scale for weight. + weight_scale = self._get_tensor_quantization_scale(bias_info.weight_name, bias_info.node_name) + if weight_scale is None: + raise ValueError( + f"Unable to get valid quantization scale for weight input '{bias_info.weight_name}' " + f"when quantizing bias '{bias_name}' to int32." + ) - # get scale for input - input_scale_name = ( - self.quantized_value_map[bias_info.input_name].get_for_consumer(bias_info.node_name).scale_name - ) - input_scale_initializer = find_by_name(input_scale_name, self.model.initializer()) - input_scale = tensor_proto_to_array(input_scale_initializer) + # get scale for input. + input_scale = self._get_tensor_quantization_scale(bias_info.input_name, bias_info.node_name) + if input_scale is None: + raise ValueError( + f"Unable to get valid quantization scale for input '{bias_info.input_name}' " + f"when quantizing bias '{bias_name}' to int32." + ) ( quantized_bias_name, diff --git a/onnxruntime/test/python/quantization/test_qdq.py b/onnxruntime/test/python/quantization/test_qdq.py index 24039fe7398a8..23b397ffd80e1 100644 --- a/onnxruntime/test/python/quantization/test_qdq.py +++ b/onnxruntime/test/python/quantization/test_qdq.py @@ -20,10 +20,12 @@ check_op_type_count, check_op_type_order, create_clip_node, + get_tensor_consumers_and_producers, ) from onnxruntime.quantization import QDQQuantizer, QuantFormat, QuantType, quantize_static, write_calibration_table from onnxruntime.quantization.calibrate import CalibrationMethod, TensorData, TensorsData +from onnxruntime.quantization.quant_utils import quantize_nparray class TestQDQFormat(unittest.TestCase): @@ -1925,5 +1927,280 @@ def test_dup_shared_bias(self): self.assertEqual(len(bias_names), 2) +class TestQDQPrequantWeights(unittest.TestCase): + @classmethod + def setUpClass(cls): + cls._tmp_model_dir = tempfile.TemporaryDirectory(prefix="ort.qdq.prequant_weight") + + # Note: swap with the commented line if you want to see the models in local test dir. + cls._tmp_dir_path = cls._tmp_model_dir.name + # cls._tmp_dir_path = "." + + @classmethod + def tearDownClass(cls): + cls._tmp_model_dir.cleanup() + + def build_conv_model( + self, + inp_shape: list[int], + weight_quant_data: np.ndarray, + weight_scale_data: np.ndarray, + weight_zp_data: np.ndarray, + bias_data: np.ndarray, + float_type: onnx.TensorProto.DataType = onnx.TensorProto.FLOAT, + ): + """ + Builds a model with a Conv that has a pre-quantized constant weight input. + """ + input_0 = onnx.helper.make_tensor_value_info("input_0", float_type, inp_shape) + output_0 = onnx.helper.make_tensor_value_info("output_0", float_type, None) + weight_quant = onnx.numpy_helper.from_array(weight_quant_data, "weight_quant") + weight_scale = onnx.numpy_helper.from_array(weight_scale_data, "weight_scale") + weight_zp = onnx.numpy_helper.from_array(weight_zp_data, "weight_zp") + bias = onnx.numpy_helper.from_array(bias_data, "bias") + + dq_node = onnx.helper.make_node( + "DequantizeLinear", ["weight_quant", "weight_scale", "weight_zp"], ["weight_dequant"], name="DQ0" + ) + conv_node = onnx.helper.make_node("Conv", ["input_0", "weight_dequant", "bias"], ["output_0"], name="Conv0") + graph = onnx.helper.make_graph( + [dq_node, conv_node], + "ConvPreQuantWeight", + [input_0], + [output_0], + initializer=[weight_quant, weight_scale, weight_zp, bias], + ) + opset_imports = [onnx.helper.make_opsetid("", 21)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + + return onnx.shape_inference.infer_shapes(model) + + def build_conv_dynamic_weight_model( + self, + input_quant_data: np.ndarray, + input_scale_data: np.ndarray, + input_zp_data: np.ndarray, + weight_shape: list[int], + bias_data: np.ndarray, + float_type: onnx.TensorProto.DataType = onnx.TensorProto.FLOAT, + ): + """ + Builds a model with a Conv that has a dynamic float weight input, but a constant + pre-quantized input[0]. + """ + dyn_weight = onnx.helper.make_tensor_value_info("dyn_weight", float_type, weight_shape) + output_0 = onnx.helper.make_tensor_value_info("output_0", float_type, None) + input_quant = onnx.numpy_helper.from_array(input_quant_data, "input_quant") + input_scale = onnx.numpy_helper.from_array(input_scale_data, "input_scale") + input_zp = onnx.numpy_helper.from_array(input_zp_data, "input_zp") + bias = onnx.numpy_helper.from_array(bias_data, "bias") + + dq_node = onnx.helper.make_node( + "DequantizeLinear", ["input_quant", "input_scale", "input_zp"], ["input_dequant"], name="DQ0" + ) + conv_node = onnx.helper.make_node("Conv", ["input_dequant", "dyn_weight", "bias"], ["output_0"], name="Conv0") + graph = onnx.helper.make_graph( + [dq_node, conv_node], + "ConvPreQuantInput_DynamicWeight", + [dyn_weight], + [output_0], + initializer=[input_quant, input_scale, input_zp, bias], + ) + opset_imports = [onnx.helper.make_opsetid("", 21)] + model = onnx.helper.make_model(graph, opset_imports=opset_imports) + + return onnx.shape_inference.infer_shapes(model) + + def test_quantize_with_prequantized_weights(self): + """ + Test quantization of Conv with pre-quantized weights. + """ + rng = np.random.default_rng(123) + test_configs = [onnx.TensorProto.FLOAT, onnx.TensorProto.FLOAT16] + + for float_type in test_configs: + with self.subTest(float_type=float_type): + label = f"_{onnx.TensorProto.DataType.Name(float_type)}" + float_model_path = os.path.join(self._tmp_dir_path, f"conv.f32.prequant_weight{label}.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, f"conv.prequant_weight{label}.qdq.onnx") + + inp_shape = [1, 2, 100, 100] + weight_shape = [2, 2, 20, 20] + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(float_type) + + # range = 2.0, scale = 2/254, zp = 0 + weight_scale_data = np.array(2 / 254, dtype=np_dtype) + weight_zp_data = np.array(0, dtype=np.int8) + weight_data = np.linspace(-1.0, 1.0, num=1600, dtype=np_dtype).reshape(weight_shape) + weight_quant_data = quantize_nparray( + onnx.TensorProto.INT8, weight_data, weight_scale_data, weight_zp_data + ) + + bias_data = np.array([-10.0, 10.0], dtype=np_dtype) + float_model = self.build_conv_model( + inp_shape, weight_quant_data, weight_scale_data, weight_zp_data, bias_data, float_type + ) + + onnx.checker.check_model(float_model, True) + onnx.save_model(float_model, float_model_path) + + # Check that the input model only has a pre-quantized weight and save its scale/zero-point + # to check that it doesn't change after quantization. + float_node_counts = {"QuantizeLinear": 0, "DequantizeLinear": 1} + check_op_type_count(self, float_model_path, **float_node_counts) + conv_node_original = next((node for node in float_model.graph.node if node.op_type == "Conv"), None) + self.assertNotEqual(conv_node_original, None) + + _, producers_original = get_tensor_consumers_and_producers(float_model) + weight_dq_node_original = producers_original.get(conv_node_original.input[1], None) + initializers_original = {initializer.name: initializer for initializer in float_model.graph.initializer} + scale_name_original = weight_dq_node_original.input[1] + scale_val_original = onnx.numpy_helper.to_array(initializers_original[scale_name_original]) + zp_name_original = weight_dq_node_original.input[2] + zp_val_original = onnx.numpy_helper.to_array(initializers_original[zp_name_original]) + + input_data_list = [ + {"input_0": rng.uniform(-10.0, 10.0, inp_shape).astype(np_dtype)}, + ] + data_reader = TestDataFeeds(input_data_list) + + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + op_types_to_quantize=["Conv"], + ) + + # The final model should have everything quantized + qdq_node_counts = {"QuantizeLinear": 2, "DequantizeLinear": 4} + check_op_type_count(self, qdq_model_path, **qdq_node_counts) + + # Check that the pre-quantized weight still has the same scale/zp after quantization + qdq_model = onnx.load_model(qdq_model_path) + conv_node = next((node for node in qdq_model.graph.node if node.op_type == "Conv"), None) + self.assertNotEqual(conv_node, None) + + _, producers = get_tensor_consumers_and_producers(qdq_model) + weight_dq_node = producers.get(conv_node.input[1], None) + initializers = {initializer.name: initializer for initializer in qdq_model.graph.initializer} + + scale_name = weight_dq_node.input[1] + self.assertEqual(scale_name, scale_name_original) + scale_val = onnx.numpy_helper.to_array(initializers[scale_name]) + self.assertEqual(scale_val, scale_val_original) + + zp_name = weight_dq_node.input[2] + self.assertEqual(zp_name, zp_name_original) + zp_val = onnx.numpy_helper.to_array(initializers[zp_name]) + self.assertEqual(zp_val, zp_val_original) + + def test_quantize_with_prequantized_input(self): + """ + Test quantization of Conv with pre-quantized input and dynamic weight. + """ + rng = np.random.default_rng(123) + test_configs = [ + (onnx.TensorProto.FLOAT, False), + (onnx.TensorProto.FLOAT16, False), + (onnx.TensorProto.FLOAT, True), + (onnx.TensorProto.FLOAT16, True), + ] + + for float_type, convert_weight_qtype in test_configs: + with self.subTest(float_type=float_type): + convert_label = "_convert_qtype" if convert_weight_qtype else "" + label = f"_{onnx.TensorProto.DataType.Name(float_type)}{convert_label}" + float_model_path = os.path.join(self._tmp_dir_path, f"conv.f32.prequant_input{label}.onnx") + qdq_model_path = os.path.join(self._tmp_dir_path, f"conv.prequant_input{label}.qdq.onnx") + + inp_shape = [1, 2, 40, 40] + weight_shape = [2, 2, 20, 20] + np_dtype = onnx.helper.tensor_dtype_to_np_dtype(float_type) + + # range = 3.0, scale = 3/255, zp = 127 + input_scale_data = np.array(3 / 255, dtype=np_dtype) + input_zp_data = np.array(127, dtype=np.uint8) + input_data = np.linspace(-1.5, 1.5, num=3200, dtype=np_dtype).reshape(inp_shape) + input_quant_data = quantize_nparray(onnx.TensorProto.UINT8, input_data, input_scale_data, input_zp_data) + + bias_data = np.array([-10.0, 10.0], dtype=np_dtype) + float_model = self.build_conv_dynamic_weight_model( + input_quant_data, input_scale_data, input_zp_data, weight_shape, bias_data, float_type + ) + + onnx.checker.check_model(float_model, True) + onnx.save_model(float_model, float_model_path) + + # Check that the input model only has a pre-quantized input and save its scale/zero-point + # to check that it doesn't change after quantization. + float_node_counts = {"QuantizeLinear": 0, "DequantizeLinear": 1} + check_op_type_count(self, float_model_path, **float_node_counts) + conv_node_original = next((node for node in float_model.graph.node if node.op_type == "Conv"), None) + self.assertNotEqual(conv_node_original, None) + + _, producers_original = get_tensor_consumers_and_producers(float_model) + input_dq_node_original = producers_original.get(conv_node_original.input[0], None) + initializers_original = {initializer.name: initializer for initializer in float_model.graph.initializer} + scale_name_original = input_dq_node_original.input[1] + scale_val_original = onnx.numpy_helper.to_array(initializers_original[scale_name_original]) + zp_name_original = input_dq_node_original.input[2] + zp_val_original = onnx.numpy_helper.to_array(initializers_original[zp_name_original]) + + # Create data reader with random input calibration data. + dyn_weight_data_list = [ + {"dyn_weight": rng.uniform(-10.0, 10.0, weight_shape).astype(np_dtype)}, + ] + data_reader = TestDataFeeds(dyn_weight_data_list) + + extra_options = {} + if convert_weight_qtype: + # Test converting the dynamic weight's quantization type, which results in + # dyn_weight -> Q(u16) -> DQ(f32) -> Q(u8) -> DQ(f32) -> Conv + extra_options["TensorQuantOverrides"] = { + "dyn_weight": [{"quant_type": QuantType.QUInt16, "convert": {"quant_type": QuantType.QUInt8}}], + } + + quantize_static( + float_model_path, + qdq_model_path, + data_reader, + quant_format=QuantFormat.QDQ, + activation_type=QuantType.QUInt8, + weight_type=QuantType.QInt8, + op_types_to_quantize=["Conv"], + extra_options=extra_options, + ) + + # The final model should have everything quantized + qdq_node_counts = {"QuantizeLinear": 2, "DequantizeLinear": 4} + if convert_weight_qtype: + qdq_node_counts["QuantizeLinear"] += 1 + qdq_node_counts["DequantizeLinear"] += 1 + + check_op_type_count(self, qdq_model_path, **qdq_node_counts) + + # Check that the pre-quantized input still has the same scale/zp after quantization + qdq_model = onnx.load_model(qdq_model_path) + conv_node = next((node for node in qdq_model.graph.node if node.op_type == "Conv"), None) + self.assertNotEqual(conv_node, None) + + _, producers = get_tensor_consumers_and_producers(qdq_model) + input_dq_node = producers.get(conv_node.input[0], None) + initializers = {initializer.name: initializer for initializer in qdq_model.graph.initializer} + + scale_name = input_dq_node.input[1] + self.assertEqual(scale_name, scale_name_original) + scale_val = onnx.numpy_helper.to_array(initializers[scale_name]) + self.assertEqual(scale_val, scale_val_original) + + zp_name = input_dq_node.input[2] + self.assertEqual(zp_name, zp_name_original) + zp_val = onnx.numpy_helper.to_array(initializers[zp_name]) + self.assertEqual(zp_val, zp_val_original) + + if __name__ == "__main__": unittest.main() From 632a36a23394a9deacf19c221db4ff89287ac152 Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Thu, 14 Nov 2024 20:10:44 -0500 Subject: [PATCH 04/28] Update Gradle version 8.7 and java version 17 within onnxruntime/java (#22771) ### Description This change is to update the Gradle version within java project to 8.7, it also upgrades the JAVA to 17. Gradle version from react-native was also updated to 7.5 to make it compatible with changes from the Java directory. However, the target java version remains the same. Java version from these will be upgraded in a separated PR. This is spited from #22206 ### Motivation and Context This is the first step to upgrade the react native version. --- java/build-android.gradle | 6 +- java/build.gradle | 4 +- java/gradle/wrapper/gradle-wrapper.properties | 4 +- java/gradlew.bat | 20 +- java/src/test/android/app/build.gradle | 10 +- js/react_native/android/build.gradle | 2 +- js/react_native/android/gradle.properties | 2 +- .../android/gradle/wrapper/gradle-wrapper.jar | Bin 58910 -> 60756 bytes .../gradle/wrapper/gradle-wrapper.properties | 4 +- js/react_native/android/gradlew | 263 +++++++++++------- js/react_native/android/gradlew.bat | 33 +-- .../github/android/build_aar_package.py | 4 +- .../default_full_aar_build_settings.json | 4 +- .../templates/react-native-ci.yml | 2 - 14 files changed, 199 insertions(+), 159 deletions(-) diff --git a/java/build-android.gradle b/java/build-android.gradle index d5839f9f27869..9c4275b74f626 100644 --- a/java/build-android.gradle +++ b/java/build-android.gradle @@ -82,7 +82,7 @@ allprojects { } android { - compileSdkVersion 32 + compileSdkVersion 34 defaultConfig { minSdkVersion minSdkVer @@ -108,8 +108,8 @@ android { } compileOptions { - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 + sourceCompatibility = JavaVersion.VERSION_17 + targetCompatibility = JavaVersion.VERSION_17 } sourceSets { diff --git a/java/build.gradle b/java/build.gradle index 34ac93cce6f4e..845121dd17a48 100644 --- a/java/build.gradle +++ b/java/build.gradle @@ -50,8 +50,8 @@ mavenSettings { } java { - sourceCompatibility = JavaVersion.VERSION_1_8 - targetCompatibility = JavaVersion.VERSION_1_8 + sourceCompatibility = JavaVersion.VERSION_17 + targetCompatibility = JavaVersion.VERSION_17 } // This jar tasks serves as a CMAKE signaling diff --git a/java/gradle/wrapper/gradle-wrapper.properties b/java/gradle/wrapper/gradle-wrapper.properties index 4baf5a11d45a3..381baa9cef1ec 100644 --- a/java/gradle/wrapper/gradle-wrapper.properties +++ b/java/gradle/wrapper/gradle-wrapper.properties @@ -1,7 +1,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionSha256Sum=9631d53cf3e74bfa726893aee1f8994fee4e060c401335946dba2156f440f24c -distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip +distributionSha256Sum=544c35d6bd849ae8a5ed0bcea39ba677dc40f49df7d1835561582da2009b961d +distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip networkTimeout=10000 validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME diff --git a/java/gradlew.bat b/java/gradlew.bat index 93e3f59f135dd..25da30dbdeee9 100644 --- a/java/gradlew.bat +++ b/java/gradlew.bat @@ -43,11 +43,11 @@ set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 if %ERRORLEVEL% equ 0 goto execute -echo. -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. +echo. 1>&2 +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 1>&2 goto fail @@ -57,11 +57,11 @@ set JAVA_EXE=%JAVA_HOME%/bin/java.exe if exist "%JAVA_EXE%" goto execute -echo. -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% -echo. -echo Please set the JAVA_HOME variable in your environment to match the -echo location of your Java installation. +echo. 1>&2 +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 +echo. 1>&2 +echo Please set the JAVA_HOME variable in your environment to match the 1>&2 +echo location of your Java installation. 1>&2 goto fail diff --git a/java/src/test/android/app/build.gradle b/java/src/test/android/app/build.gradle index ecbc4b90612dd..5c6a88792207d 100644 --- a/java/src/test/android/app/build.gradle +++ b/java/src/test/android/app/build.gradle @@ -7,12 +7,12 @@ def minSdkVer = System.properties.get("minSdkVer")?:24 def qnnVersion = System.properties['qnnVersion'] android { - compileSdkVersion 32 + compileSdkVersion 34 defaultConfig { applicationId "ai.onnxruntime.example.javavalidator" minSdkVersion minSdkVer - targetSdkVersion 32 + targetSdkVersion 34 versionCode 1 versionName "1.0" @@ -34,11 +34,11 @@ android { } } compileOptions { - sourceCompatibility JavaVersion.VERSION_1_8 - targetCompatibility JavaVersion.VERSION_1_8 + sourceCompatibility JavaVersion.VERSION_17 + targetCompatibility JavaVersion.VERSION_17 } kotlinOptions { - jvmTarget = '1.8' + jvmTarget = '17' } // Conditional packagingOptions for QNN builds only if (qnnVersion != null) { diff --git a/js/react_native/android/build.gradle b/js/react_native/android/build.gradle index 825990eba0fb8..44fb9dd7c433e 100644 --- a/js/react_native/android/build.gradle +++ b/js/react_native/android/build.gradle @@ -7,7 +7,7 @@ buildscript { } dependencies { - classpath 'com.android.tools.build:gradle:4.1.2' + classpath 'com.android.tools.build:gradle:7.4.2' // noinspection DifferentKotlinGradleVersion } } diff --git a/js/react_native/android/gradle.properties b/js/react_native/android/gradle.properties index 465b04d1f5813..8fe6e40d76911 100644 --- a/js/react_native/android/gradle.properties +++ b/js/react_native/android/gradle.properties @@ -4,7 +4,7 @@ # Specifies the JVM arguments used for the daemon process. # The setting is particularly useful for tweaking memory settings. # Default value: -Xmx1024m -XX:MaxPermSize=256m -# org.gradle.jvmargs=-Xmx2048m -XX:MaxPermSize=512m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8 +org.gradle.jvmargs=-Xmx4096m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8 # # When configured, Gradle will run in incubating parallel mode. # This option should only be used with decoupled projects. More details, visit diff --git a/js/react_native/android/gradle/wrapper/gradle-wrapper.jar b/js/react_native/android/gradle/wrapper/gradle-wrapper.jar index 62d4c053550b91381bbd28b1afc82d634bf73a8a..249e5832f090a2944b7473328c07c9755baa3196 100644 GIT binary patch delta 21827 zcmaI6Q*fYN6E&KNIk9cqwr$(C@x-=mTN6)gI}_VBCYk*2`7ch@S9R*#?W)~feY02h zTD^AuG}!V6SR?HZ1SOZQtQr^)5PA#{5So-EVT_dVHO!PqS_Gijr8tVDK%6&qZeU7XO?s53M}(nQWu(T*V4y~Q+IgZu`Cg|- zA^NxO&)4z&XPTQ4T(q8r1kU$+3v^INRW5@oYbsjnN+f1%-qEOBTa80WAz(KZ|xo} zjmJR^sH^9dtu)jPdVc;q{cZ@*{lgFH-^|rx5jfrUv?zo&7@6xf zqo{2J?XSS)LMbs4-JhM+oux%=2gj-LDutG->ubB)2_?)EB{+^WyZB+!7mT1{rLTY= zhBe$m_UQXkTYvIm@mXsLzO;ZaX-sd*8TOU{+u|RaQ4=3OA)fBB{i4Ff0M>x$;G=Ma zcigTy3Omv^$`Tq`q03>8Nu_CI-oZETO1CF?vujdca}q^5VwW%3jU=l>GX0P9$&0ck zdq~l*>>GvgA6Taz%F7GuSNLUoa04^fN57B& zyco@qy^}+xizUm!uOdF30KJ;UbaUDoc=X2i5;;X{GYa;D@a;d{4Jo$4RP>X?9tClm zA6c=cM=%!VTMjMk)s!gqqkA5#*o0Q?bWlKK)^^(tV3HwlK-L%B09T73kG}(|+OA-` z^lVb)kt1ER>-6ZSFd(c;kIq8KC)S!(aj2|HINyl4jgt?mD+-z(UScExUcp0v(;MW7 z^8L9qVV11`VMH~qbKYDhqq0|Re9{>1xW5V8Te9E%M&PXgi&h{f0k3Pc{q6jZ%?}_H zoWB$Olp082{{&B9j-g0t5mkg|jl>CvE}(wv3^&}%z#;L<4oA*icEVHCyrV_v8+8Of z@$FclzI0)mRJ~!yEuXL@E9{#QcM1j)91z>dP$XitO{IHxC-z@Kr}#75o26R^MTDIIu@^Iea}0XB#D?J(~_3 z7`p8Cq4U-63wntR0PH+uXw0Ih;)4~DCi1CH(GY9g!eTZolrQ9m9%L3~7}SPu?8-EC zcLo2{|54{e>ya;Y@!R=eD8mVSi?8FvUqHLI`qMWi=TI0=`Sk{KnuJ zjPi7bc_|V4WAV6OZ4_+Gs@1fbVqp|C;%OwH*_Dv0RWBbc}nZ%#zdQ64Bn# zl?%gu(t1RXAtW~S-c)6?VYP87Jk5m*%|R&;Y&h(SucL~?-dNofI3vkQUv6EhQCS#W z3oQ`_l46?W%km>bXxOW$0R5^Gi^cGDmE6>LTAV8rOKNLot}L95UJ+~aCnj&5ch`>B z%WSQ^g0oQ(0n62u2eV_bKAMLr`Suk=n|uk4rL-}Gb^Tlp-1LJADI<||x41^E5S1Y~ zb7f8!!V(lgB-nf2JU#d&oX%P6hfn>GH-9-3)(&PHu81o8+t8XiaHwuT>63bDqrmKr zMiqXD8pL&!CYDdL1$)zZq8^CPAH%Od164X8Y8J3`VI&}a99NeerQ?-0Io8TFlMB8^ zPoTgFCd2Alz9-gvYLJiKe6@P)uiO%wRLS6os1f{`BeE3zD`Wb2X;VgxhN4R0*j>M3 zi6e%iMl$DI0RDmkh*e}N)fg7o%$!@|Qyy=a*dHV66Y#zA4Zkt|uz&I}?9a`HKwBu^`J~UHFKq*{b z|8(%QtrwJn#0buu?cY8K`bV6=Z;+I8-K42=@Y2A=s@P@?oHj0`784JhgLX2=du7hZ zEd+_s#I?;ll}t~lNl)I2K&+&9G{VWdktxQ&j9D;#q^9vLwWP}@q};;cTh}+ z@l6hvdy{YvPAQxjeFbbmzZXyjBC(adii z&Qv@6@yWf)RPwzzhOqy@*n1CTsjg{ZQ{7+RL3KP~SyibD$TX!~%E$<@B+)$~v!iXJ zk9RI`3`JpEvSmh@x}~d>rRcH8@S3OPjSXPg+4Zu3-J{cJU z;jr?$h8jO&537S132!9su=0}hkqRThWP&SQWwjYCUD2l(^+)^^j9X;yY6%`K6DDmF zbWI~c%|Z}6_!EUmQ~Yfn0+SQ#tP$#s80yWSMXqV)tSK#lL`}#}WDL^JeGf{%jCTVb zIWbwl8Cmj;Jp_lKv~-B7IR9_aAy((h0oez$&~k!{gHa+fbB8PRkWyt$n&-q2{4w{2 z8y+RqHJ^P9$!O#-K0hc$-#eBx6px6u_@};{nutFw*mH>$)(~v)8Ipz>GQ|LuXWNw! z`gXl&#i7zX`e7#MDeVClSzkQQ&#DLFOpR`UIM2`={z&F^H>`&a&eB{vE955?NfPox z@<|Tub!n#hr!Kw~e693;xpM6cW;>bT+fXdPV0cjxX+a{`#s#eC}2O3AI)1&>X zv4t02&WA?qK{~D40-BYA@gsjGWmJ%^e@0_jLuHXKysqSZDQ#%=F-aSY9(2Ww4X!xw z7edknLe+}YVZ?)EO{TTfehQ0Zz8RLF03<<$9o32$Q6)0Unbv-5!0e33Vethrydn5+ zGS`SUyJx;dG)%qiC(l$vm>ieqbJb@}uwy}RTtbQ30RDhNn2h>6hCJ`qsTr8kCK8pb z@!##tV=X#LUX`;%i-aQ8L9JADw-6gCDCPp;{Lq%w2{BD;Odql);(tzY}Z9jw?UjauCZ@ z3t=Pk0QZ{}LQsEEz3bjLq!K8QtBi z?HIxS3jnbHKQ62t+{|4ZjQ^jA|1AynuW0fE6a<740tAHO|1VL;+DX;U+KIu`&e+v8 zOifpHNeJyZ60h(#nzvxWENjsTnd`2P}KL%^4&V5;wNMJi| zXQo_sGLD_Y<1-myd=#K)baM8|VFfvIb@v%NPag}}>N-G02#Gz$UjfkS)tkD+>0Ye0jar=7%=EoiXE!Hdk5ucsnxgz{njwOkA5>#;k5*orMm!FJN=e0&==H= zaYJmmFJLj=^U*DF3Y2_=%zKnr$)*oh4xXs#b8}l^bf4K4m~I*@{>q{^m=LH7ofGa|Nc4 z(^xQDF18*5=BBgx^Guv@!U9hWVE6hdVRGr&KHnIh&nPvse*UD%He0s!iIFWZ@OINn znV^>ZD~`;H5FTEoz9{?ZiwivBXn#2485!Q*8T|wyX;H!ShGH5a*X{bmRNjn(X>Wsm zqHOI~oqVf9zsEI4S8a(q-1Q~XqGOW6@67>0A@?6+Ndqu$3k7n7&BMb(ROcR@u`5p2 zA{TBp$&d~09Ws`oXJ-vdH-AJYEiM)rw&4FThDPvi)$L~k z2Lbs5^&g1;uO91#hDoW0p#*kSan;fOIdJ5JnWL&mQK9JwZQ_8EtJA_-+v*bG;K-1p ziPg-KcOq;uba$)^eTNIYEobzer7U3@@{o$Sm-{be{UiP7vw)qq;4H!aiW1-k%Y~mZ z(aHI`<=T7OeR{P`2>@Tv{j_i6VxW#}#ppweu~I4Q6S?;N+^DDb7658;2azU2c1P#} zMXd3b&}_dhMX?vJi!s54DM5!bJ^QD(;#cRqvO3tx0YwHkx) zhfrYE<9d&8=y>@DIFJw5?3hl>caaul>pI{ulD4rJd}sL-A&yKlZmQ{1K?8~xmQ%={jC-&sMHm8m%MjbPTU5tDgnye~P=vVMAk}U_- z3}`%6_aL^`i?Mf$I)0g9{KgqMvYTM(O0!e~)j1nfhqLFhE5gUeh%a0kq=rYS5eB=} z%9L0bgvo6^13JA|`eVavGufFa`EM7SZiI98BX!)*&RCb&IU6&E+f+$ralPgS|8_X+ zgXwYJ5sSV6YI$O}d-C*KXyiP3|4ywOA?e9!mz!>vL=aaj+_4qbaHWfsec#3Jo#i{o zldd<9J1RY~gsKj4#UWEnG zl8o^+z$HhKQ&zV05z34;10-2 zQG1Y{kvdAve5~~T$iAFMx!2GsKdn)Cm(Fux036D@eb*MSIP*q``Kxw^q)jjsE(-Q5 zk8+nejZm)Nq9UwdjT!Qm&(Us6yv4pD7v6vR<2Q{VqP-y@w_dOq2!X8wNTiaOB`4;@ z@J-O++F07}w zsu$PWW~)t{iS^Vz@0|A@lF$2HrXb*L2u)#bmL-haO>b2!TV7bf^pwv>c1HW*Ggfml z>I+aMiaZ+r?{v-&uG=gE0|5#6ufwqYza0i*6$?mH-*#0MNBh2(Ka+RhWE+;L(yBsX z{!eg=e-?@tmKGX)821&nf^O#IJsmvnc)6OM3m&oZWEW3!37o?tPICqF2)t>&?V%2> zZ?>kC=ArSP-*9*LxxVD?a(BNjJQf5%I^mFmNiySM7pg zLB*G8uI7rVc3GQuVr3-1w@SPB|I|~(q2A>lYyI;MA5fDt^NAwXbCOLis<7gA>3TWN ze!>{emZyy>wuSYT_DWyGjWI?Cy`J&8`3=lOW%n`Q@3Ms5`oMoyA4)YC#n`B$Q0$(c z9T`BOc>>xWEoQy@K4sN3>{HmlUS!LTo1RCUXVHcB zm(33Ufzu5Q#y5(~_8`RJ1nRher;)dPcgJXLoNb%MGq}4y&)Fz-Q7y`7(Hrpsk`xii z;5dL)UBWwHT}k>v`qTCe0^nC2>NBw;)apU!;D5mFLXj*dBx&N~QyHVxJ^QUV(CZ^8 zB?aLd-AmPfi*|te5y)5OIM13pLZ~%T&=JySbl|9VrTJr1C$iP5giHY_R$h z7}19D(p^at7}MEldBWS2IS`YE25sgtkcNi&V-$%GMSGvD`R}!tQoA`!`ZVV@$M4?% zHQ)E9^ECgl!1d;r;rEOyBgz8JKV|9_U;*$t6Fl$ZJNs(43aFa@_8J!_^g46?NXrP2 z@4H_#WeWkL6aasP4T6?B0Pc}8V{?r}b+XKQGsM}&1 zFc`s%60dhmFUfij|b}6<* zlg$yfM!Qx20EuY7Z&CDDoyOA(pc)iTuC2c8C^c5#+U5B^`TLlvB_MEyPn(f( zY)z}JVkDHw@mn~olvrC~SU$A9IR2U6>83^7+L;{|hEir{p4r&ibX9lsrE0CI18c?~ zG@Y-ntLX0jU5ChfbphuA{Ca(Qy}p3;@PHJ(&eSFxJUB*|`?vGrei_5U)LC-BZ|oc& zmUn;Tbm*jlC>b~UCC#72lpL4mf|5;x<#|~GnF95@PJ#tJYDfn?%KD{}k>Ye{8fpT) zD&#D|1Txk_4D0t+v-(D?7;g6yc&3bK(tf5xd5Y5B!QlE-Ij#o}PzJ%Wl_73|+!9vx z%O|`fKdu#BH!IivzL8ihZaDVl>CAz2z2Y_=XK>+On7>P1QDXQH1x!SV($?))lQJ1BVoVIy4x+0c=BX;0Ywr;KXvDGCY&CKH@Ns< zQ+zqIgjGGcfL>J}z~hz~a^~b5Z9la+u?q2Kgoj#wKXh=779PtzZ#!si<1gpj+0!mW zvt`R6S_5=H-@|uhzqXEaWhXQeUNk)EV)+R>_FcTqKxw%Bc4Dxd;0Sz6Qy-_*RNOEw zr&w`#YUSB}<2<2sZ6f*vgI(#g)O0$3jT1f5Mu5@0RHQT=P(OZy9h)V=QZCsC~U!nkALP)KipT z(nW?c1wms0gfx9hlb0c4e@&dB{dF(iD@Wr%SD@`t-2Z|l9A}oy#87mej;nTf+2iQ3iXm>@Do$U!qbcW2WMN{Q0WaaPH>dd z=E>Ygs>JtPT31iKab(Hgd26ngjp14>2FyYZ22M9*|PrIuWu>B(=Tzyl0 z${#Jj_&s-L$^L=oZ%{(&CRMU|jzqHw52XV&c#0o=eZWjw zZRspiw~!*uEeJ|Hcwlw>mzwxZzOYqs|MeLt(WeL$E-;?)zbR1ZG2WNohkTmr5nBTD z`iBv3v^auv*$oeCZ2x!w(L>3%99Y5Xd*uMR!?E|a+IBINtBEiihemDYpf4X9B;<4M zEQL&o4&k$x&_P9;Px=6v!;1G!Ij?N|r8n#Vk;7Z)&4RzkFgW)->>4^tNtLljlUK7u zkm1Sq3xT7%$Cl!*cu`bLr9&qB<$-|p1lomJqSHf&_91gn3u-YpwZd2KSzOGCH=EWy zs2!&$i-bqcas%cFjPJ4h><*DTbmqN~h+=tc;GdKL3BfUPr32YR%y+bk)(O!O-{&R{5< z&w1}kv?gn6M!+y$)_`M@8W9F3Sd|+I@)*ZHh!s?l5g2Z}$H0vMEgSCDeCva}I#P2+f_?VxJAO1jgkX`Bqg;B--;1BHKGrMnf_{Am@J?gC#xECQC4N~ z4u~6Q9uhwAN@J%=EVNq8#}6Nk2c8Em4afJAMfI>P1~vn`&tpYa_cyI|PhOYZhctWYnhODv(%D z-X`%;54AetMCSDHn*1n8CEhQj6mAvLO?n42%r2!aN8JIHlF{zEy14lj&3;wJ~9~6v>c{dY`Fq8@l$=I zCVaRg3m|;ahIf1TLeP2}hh0HUet@w4B?7MuNz6-~lhk~U*I=%tg4X6%5C5RD%g4t& z*e#BRjwuaJ$pA1yy_+(ADk%Tu6+%~OPU8ywNLKh2CzPEi;5%t*awtK(`AaaWL*nRw zP1u_CJ7N?b9x0AgLO*1|ONOs@#0HjnjFE+)ovXuh_HhD-@bMu-_0w zQ%btKMgtn+KfjA29m*doh2-H6o?#szV<7>%W`vlXqYClG$BZ$L+lq1|#F6*hG|UuhudmseFtaNM7u zCaq5ITs<(;qnp}^M3-c>^7^h6wMnF2NPNU4*`w?FIzefi!+o9bS-NsouTgyR8Js|$ z$Bmk4b#N))NGBUx4$boD(5IVG8u~$HpLD$={iu2e^qoO42v3yq3F+SpWOY_ zFMvWvsYgB%^3*?iyWkn@o`@#|BYL>pfV@ChJk8S|@H(Ojkpx}RupTv%?j#sGnp`I> z)VFWhX>9>75uZKjDJHHsI02&S8tjtj?2SV;ZB-!Gk3HbjIa~kG6Q8mkyMi0+m%Az3 zE0=nZeZEtID8YxH6uC_hOnsqB7m9<9B;ZbD#qgJIcW_SisWpxQ%ocnuI@>bJ|4}iy54^r6wFJV& zEqijzdS3`3e;`I-o;w99i=6YOP`ov%x=NMC-o9f{<3suUJXzd8sZV~)?(sQA6sV_b zn3_L;&+D#}z@lM-F_LvcylTb%qUYv93x31?h(|cMU2M_v)^iwbh5C~7U>zgSQvPhR zNB4A1sd)j<%P4xx**bI^=;xxx?jMyMv(j$g%`5tEQe^A&xyDxSH=@f&@4{rzV1w4V zc#DT$TswyBW)+Q6XqvP0H*i#$0B-v@isw3x=Nl}2Qw z&N}>i-CnV)xz!J|^!&9A&l+hHj@s($csjf~K69d_#*?mZ`A}9trP#Jpd!f}j^VL0+ z=PH~pn)t4=tjlh02f}jdAK9#KS-bApYJIe#8EXaQ`5*AV@XF#T#O=5g08J9RwRauX zWs2GMoway_aE`Y$=B^7hRc~kFXru#1!Do0nfta20*C18DPLvn_0?Vleh(IVLufaU> zR)n)I9KB6z=4&yb8_<*b67^EjOi!@25a?^BXHa`Ew%9CWB@#Ap!>rZ}hhnN`3%p8& zeoN_LZCC;JbKh3Nzq?YmK=9$*nZ*YT(mz+Th1YYUFVbxgd50r$H`D@2&PNuWVRkoK z&UyPDlywcG69FSrbd(LORq8+7@|4i;aNT;cb3qko*~3A8tuOvR^ zTw~ZgVMiy!4w&;(^ODF?lO{;~xFKiSSakbv=YOBTT!f(7*19_K0Rv&Q&P3=8p@SNs zQs`LEao(VrNwjL!O3|V*+_dTp|@SS%jCf%X-0(=JU&jFa_+54-jh>qBdfp3qMWE z5=#(fRaHTW&ALCrJekvhF%U#bSX)%H1Xp`u8jm0eAVQwpz{Q!_v;72leY)O(O_3nD zkWAwT$_HuIXxI+ZYQs3(-0Z}#4;+seC##hK17!x)^K ze)!W3&#nVAftv~6-d&g|5eN4r_M=32c(z_Z#cr6iX}|I%?(94?R=7d&ICJe5t%d}g z=0~1hZ1)5;u+2`xLh-3ivh<9>)rVQ+1J|1#XJdpdB30A)&o4C(QY~0EDZjn{=Wpm7 zWbj#fu8TUUjX{I8e$dB(`>`j=#QDJfzp78Uh0~HKZ?0ZW;2dpM?ZrEv5MSgzzopoK zu>Ah@ zBTbd7MP6epvo^j_ECX+}W$31B$&KOROaxcB(#0NK9$RKv7n%pMM<1YzAin(h#HX?X z*S{1aRaFrfoAKDVl+LS|Gt3#{$^I4B(@8Ip{IQAWp=1fj)2|*wr1TZO+D)sOp#E6t zE~}mZYOC{HGHLiCvv?!%%3DO$B$;B5R%9Hqp|GcLuK4ZTSVc3v%l!|fF8f&Ci ziC_NmTpWr?+WI!oDJ9^3_P0)&vZnnC=|OXsUw|yV6J;7y&=LI(&F>Z|1ImFW{LRRQ zNGb1P>9DQj4z^p9!(t!}H)FnU;$>GkZ#ZsNnh^D0&^&jgALow;xclGOcm_N&h~2UP zTuT_y1Z)aF`vv$nInbO!%fSd}di$Yi;(zyESy*Pt5g|Zy32iQ$BAbk z!n37YTg616pojDJ_wLu)(7&Oo8riV^ ztSlFas6;fT&?cQx;G1@cvc30JN`N2^3Z(aNoJ~dBQotNsi*~{tjg5PLLU2-YIxHc?R z3lGuWeA-JQ3V)>m6!WU;wtBA4=$j<>SgRyvm;xvBxf?wqYwh*-QZhM@gDAXsKjrY4 z!Ul8hGJW25!YHdKx%lG_n*{4NNds#e9kzArxSa4Zi74mD#&k7A>%0*yRXo~-I{a1m zK~iqRW9bT4(k=N`GB0oAvv`W;{Tk}xZ9S~`x?$b*+sY)MiGY2-S?aAcGzEo#$kw%- zk~|lsgAG@XvxGN7@)z`_!E(cx+=}#iX}v##aQ+W9{G?QS+j7*KLZap_l$Y@@jmdZ` zgT&@eYYP_884p$yE$LvDgp*h;Wtak$J0c2nyD@oK4%3+6IzB!qP8zE*4hZ}+wMH=O z4 zahPD@ohXE$Nj>2qB}zc`p5=ubtJ z^pqWG6<{9m9o|Rlg~AF-Ygw|E!Gh0Ue;n#kJ06yYceL_|PHW9L8ry&T7%Z{35kt3N zw+OJ-#cX&Ob1N-nZJ)WY+32O`#UDI&Xi*n&nAlbyuGS0LPAKR$OU|Av_Ubq! zr!mj0mo={$;7hgMs2}P$qtEU@(ruPj_UB@S#2?$k=;`ZLUt_-B!t$?Y zL1f!9Jl6&0KV9jGc(fr>(vu!rb*ov1f&wKHBs$3q)xX@-M=<;#(7T!fXLFS|2W@aq zRdvTFcFerjm>tjc(og7T%?Xi}Y`$X-GdxTaf3vaYTRywjdQSzjV~Utq z!{CROf;YeqJ~pdJ6@fz)Zz)k_)yPy9{B9uYt$F#wvpxDN3^BCppj~;j?y`WnunchB zvL7*F(1PpF>oht6^E?Y4q(Tix&tbRc&uR1CFLyTwYd#0{ci=D@u)2AiERIA6jYvT% z-Kg-{wO^QOuB3Y;?%L0qVKB-6>jiz$IojUC@y)l16eWS9obr1E`NBS6DX$rFs~?h$ zemJmb@w!{h^XWtSpD)#|G$*D3?mCd3!#J4kH*6?3HAQ1(4w3Wk*0Pm08t;Nw+a?ya z)}&3Eo-Nu>(o0jJfn!#;vdECfp6D-9W%WS?lF;Mov>YOdXE|pQ?^4Tn-ubpzAF}^b zAJ`oE4Z{bH2sT-ELzQD@Xr*JWn702Cncp+GR_?hJ7I@Wkx#a*n-7lN~g-g)|&u5h@RKWa^vBa~QvOsQxLF_Te-O!AQi zmOp>-m*_oV=yp@Y_M1)PxgN;x|I=mKe z^p^O|mVYcdekgy(F5DXl9iGFlVU_%RtSrt{gN}D9W5gYK1EV~7#+~#Q`i1vo=y(| z`}k+INxbx~3s1O`&o(0AU_l7R_;f3^)~&AW_IRly|Dh?MUq6>G1R>#*dAlQe)MFXNK^NxNcIaln;G%%YB~uZV1n=S50Q2BVUT7_92X@7Lgk;nO z^Hq7pElB>LKoe+)2`Lah%yEUe>0HAV7(((hj;(5Nw6TK_*%?Fm9-6;u&RC4^hdy7F z^1Whg)FYF=9AU8BhDK8OcIX~z=o7!1en9ee;O{4NBI&=?V=W0fJ zTe81?6?i2?`8vU=mm(@g*=$9FksagV=uaGX#C4z*zs~+tAyWf3tb_ar{*r-{Kk>(n zq;V$e$0suR5loSZk%<*Zj8%r70Q>0xshrw;IWmElW&yqe9P zX$XWzLvpcjB=vmoOK$iehxlVuLw(L_vR`!ezn&Ky1dqOd{IAiqaIAdc``@=RY)BB0 zAN2n@UHB=d0z`lv+^XXzNSO?EnJ#QL(g;=#lDrV3P4?b)BfPn>^@KysKT^kzd9mV! zVOhpc znkOgpkoW2v&d$qp#8Cgz#jd+ahFAZ6Ry(t~8q9VagBe&kEkeEk7ZN_&QZpZGHAz~| zm&)McfJ@Ce7dHG*Gwk$=*Cc^BTi)1{Urz5KOUYU)9T##bT|7LqCVnZ1)<7PBuKuuE&`)NRCR8 zOaU*K-4Vyjvx%PuR! z$3aa^;hl#mvqQj{0Xd{R;H7&-_^>SNl4EALZG^9%--6fJuQtJIo-hu@!xb5SK?dbS zUN@5ZQ$#0oh?Z6`PM;?{bv%dkwKz~(1^x$xEnPC0o^%8jmK$@NDMU9gTbfIib=@vB z;0ZVyVqvsOeNFCcm=6$L6A+HcUktNKSW6>lm_YK&_{S_pQP6S@Gp(``Xc^XnwH}}M zAF=GqgcpwZeY@^Xn2OD9`zFWZk=xPNi5i4_Wfc zojPk9<~tVW)Ooh&R&eH)&bjIr&VS@HlENKX7xB?AdNnBs=NCPq7*wnBGp4MilxZ`_ zS0?^+sVmU5@{5*hSUxB9tA-EgpL1VqSnLGyHUD-BTdYFAR!j{3;z@HVDOi&Rx<=)B z=ntQ9I4@iA#*Cdp*l^3ZMbGS#>*xb^=tyFwa!^YXu2k%8Ow+#wXNIp#x($wNgjc*&Kj0>P*r=wBA9~6Hmh6TF znaZyHuYw8#Q;lK)6N(g#Wa`E)V|idZy?n;9!NG35r^A7kL*wW_adjxwZowu4cvQaVo*mO+8r5LR+Mgz=HkI=ob4Qw_IRllR%n$ zh_{Y?KT>^d$ALWMawfCNtH!xx-sukl(Wx$SeAsoGoMqbY1if3viKKOg-N~c61WzqF z)a*g#8g6v^7L*)$xoC)kYYVfQEa)j;pLtu)2;)xddP?3l$X6fVK^A*kcP?vIde1bY zoTZ_{y#0E$!PcRBE&EQ>Cnums!i}WdYA(-`M$kqju@Y>Ia?qaIdp9|fDb90zjIP^a zs$7DOdRBjN(VjuCxs@EXt_N_dx&SJkQ#_LORf zlPv0}BE>Inpgd$7P{!4aHS;yE8!f=cV7=~*t}DgzlwQ{wh^OM>(~aSg2tl6q;5C;( zQ-e*uS1aCD`KfI6{GxT;wo+vAi42v3#C{fEp||h|L9fMQJO+%H;qb?a>I~{LFDZ~a zy?v6-Od`#}0hM(?wG*8dTi>QnKdiq{kD*38~F*ez-=W&9~!# z@PfeC7*2M|aauscu$f4eVcbJ;{P*GznbHVyHNm@e5DMkAcJ|8TOyd!Ng+h3HVcCf> z-frSi%xDWUkeoN)}RDAVE| z^GAy|AdQ7TvMRTj(UG3tbH_u*wJe{+=aFI{T>iPJ_zIyOEm2DBq4ja;=!(iA#^$3SsQ*CzF2%kqG zZaw$|a}2B+W9!H!$X!M*(1_Y*%~uOx=xj!C7SQF)aYfIJ;Z!~PU?LPW>nT+6TaX=f znHpy=2fvTeF?d|k(}ET(E`Ymr1~Wvi*n?R-8|P^bclWKRa}b z5(aDCEv$Ka&MxIabc zjp&OHJT@4$`a}rnn|Q_P$+%^G3kVR(_J@3ZoPk7~r) z9|aCPkkA|K!%c-*S0l_}N>4k6`@ILk-RggC+#6A{2+v=L>m)ouV4AHx&xoe6OmBs^ zxiZ_`1qc}3h45M3iTV*PWkr~i_p!$AN_qo=CC@SlyL;Vl8!%%Km4Jpo*~3 z`p%nUQUNi9oM#Fj#RI!1w^*OxYLnZ#Wq=)QdkqyqtY?=!f=4!!!x&6i)1nq_|8*CI z%?m{LOrA#LOtXpbX6%a;GV&IBTlZ<&=<=F42~KObJZ>C%?&Zfd6X&0lNYj#S%uqak zmvpd&3pTOSveTmZLN%oUClnJ(F<&O{2s@Z;n8x(@5TOhnhTr^uvLYpm2ziraq5<+; z6Nh|gjA{DBkjh(;fkiWGI#i_)mAuJR*4$s_zFo8M)HQ)}jSA>7rWEi2$&M2KY_SdU zRhjtlI_o?Vxi{2m=tB$b3`tD?k!##fw%;aqte>?5yJ;1tMnXQ?ej<)=V~YWF;Q5m1 z&KWp0LJFU*y3Hc* zOe?*T5t@2y9v8RRO+9_>0a50ZVV;m0V50qgc9~{ff^n>;B}eW0yBL~4`c}@8aLRr0 z+l^C|E!zHBKMmdStyp5L>xzAb{M&VaL&a8-KG&E0oOQ|2$Gv{n4i>;1Zw4V10J+Zg zVWQl_aW+0mML=pq0;Pd5xlVKnnqn7wN3M_R5z>y>vmL8;>GYO)cFSyAHo|fu%Gf1{ zqGUd}l=2O2uhpwBqlk-5zrgu5AK!rAD^J5<>o$^zxT~)3(LUIgOWl>CcyVt2Y`SL2 zSXlj&F>lkQ4}b2iC=5tJSRlYP?CVuX!(PT*#xmg@wfU3wWp~{xWb>-3x6bo%%jd}A zC(wEsL_TIkXKTsXZit?+>GY$;k{`AiV^i*rNf3<63wu@-R&>ZP4=QQPmZN9*Z!+4s z!Ke1)%412!n7I$83zR5lbB3-|IOJ1;;V1=n`w1^HbKQg6k~ZMzBWC4N>=U%}Q|D06 zS{@X9ED~9ves;NIzGD4{Sf1O_Ur9V>CF_b!)wbyhdKZ^qRE~FIk;J?x-EZmSc+$MU$yw`&Sp5gmoAZdqXT_X+erbn6=li? zdpXPpIoJ4$@eO!#N>N#pB;FBv`gKE#W4YyOt5vk!uto=&&#By`?$Kv>;Dl73L}xbm z!+~`H6tM}ZHjUGZv|5U4HrSgiPe{9dLBgO~{){8#@(_TjAvO^=#>Z2~YVo;#y0l_7 zKa7N|$jWYK00n=01U}qtH|)Ww`hm z@%^bkE{~BXgsH@he5@MCP#P0?ZqjqS;6Rfc1ILFsK_7m>STdy!K`-Y)&7bMrEl~@Uplx_)OX#`nPkOiefLKbjI0f7Mnkya#?5a|$6LSR>r zlnx06q*H$T^490~o%5SHbDrzE=bC4pndgs*bKh4W&3K%dz-QVzI;2t^M}6aPAjQtq z(~0EhwU3XvB!-yv$B^OM_k6A0=85z@c7(SL(zPM`81?=3iH0#3EkEw&EN9_umS7K# zpkhSQnov}d#4c^4)i|Fex4kKBH(4J2>l}>w z&i%Xhm8!E0Xwk%_Sq2*u29e=Mv73kJoobaP+BEn2_aB3!G9S$@e5K(RP`O+X(P~m? zu+90r8dGW-FHKhN@1QA`-A{2Q3;B`6*Jp{(RFkW{>3+_q@nJu4scc&s5}CG|F}n%r z$JG=cevr(Gk`I@)3`=DTBi69G_g&}-d0`RWK=!`VD2dt5)cys-Dy=c%+*c0fR|8Ok zl0*1$L&>kyrcGMAf)irTU-iqh_((j{nj~cwGr-vD?0&Yv)kA_=Kxxtnr36vjOO3ok zsh}{a(JOlk>2I|1LQgPg#tA9v4W9HoqvLu>cdQTvkC>06Yy;osmn@@`&IBRmw z7ffk^TrW1vk!IbYMZ?InU56LcKbNJinmx$efM*8=lEzS{@0#?0*Ca6myicxqzM4_J zC60-v#)YzY+tS1;Ur{Bh1aKwK&+=^RR}m*EP^8ytbr#EhS}rkZP5shOyeJJZ?aiS; zL*?Y@(lL~kHbHEgQJ+gbDMnnzkCSe|bF;N59>{A{<|r~Yap@f{Nyut{GO;}-{bFWH zhkKXw)(Y-}PpBedLp7AT#J+f6bPK)hV245LxrO@aU3iUfh}xh)Te@*$!VHvZWbVsd z`7GaxN10L{B8e{nub1K1d>IbF)*%Gj8f^ZhQp3uY2y+WHnq4tbfJ$!La%p903~{jw zltpI-9VB`$9GWi9orKwy)n;W2^fS`uu=b&3w7aL>c&N1A$l$1bFQoHq=emkdWBBwq zScQ9=K?CUS^~IVAA()EgC(hEs)NLPTcE^S$ z{1QJ(1!UTjDzUzb#P>sqGMvazkEAQPnZYy5h zF3rsS@;bQp{f5u`X3;t|!!B2843GEdtilr^{Ko3~&Y5<$GAAvZ|B=+3q7z?e$&;0@h>-ova6lV z=g8j6$(1-<5)S973ze(S>K&m$$1#+N7Kjuzv*fgePy46G?dZ513N5a&&voA{pSi5E z9!QPf&Bm`e{(wvgE9Y)5e(Y#cZvXX2Wr=KVD60)27Cw5P@>+9P@Gr)iAa=V@GQ;CH z6)upex`c)B&%0^eX%T}EjF**64{7^-&jsF5(mG{gD>YxamnXEWO~&JG!fssq3y9T1+)4guHMPDS);3YJE6-iMZy&ag`1hTFyw<@QY>0G*HX7k zo{6BF=sJ!s%;yJgj6IT^zQR*XS~03SuS0|PD}5Cd(WQ9dra8~hGs7HD3n?F-Fc}q_ z&ZjcQP2V#06xOROUWfct%**Y{^2E8xuJ%bL+eBA3hR-bNvrUUha;@nt{V1tSbDZ~R zea&j=THU42`MsFvuo+`fGURM#qlYm>ux=;v^#z;ebX+{n$2{Fh#pL%KBg2? z6ke~u)UekcS*xxR%OQf6{7DA3+w zK^v3WMAUg#=3rcVuD-I|^^5~OzF9n&vV6IjvP0c8E~Y$wbJ29ikR%u_;=SxInr$G8 z)gW)j72tL^Wb48NaFGhh{&~&tV>1Pv!k{yrHo77gJ63Rgmr}VrUdp-3FL%h;v(Oc2 zE{Yr!zuW&}`5a@bkI^frDqdQn{zmpv zmeCYUE#8ytiLbL{P&7C4MBa4Wqsv57OTyC)6zBYyDt z#8a0Lvp0yT-bslq@)lW|@PYu@p2GChf}{s}E{w=Lb_ERT*C^<``6=U^O?p~Q>Ms(c z^R_Q#dh)qdCVfuWgNCC_ME0tfbQbE-U~>>k~Kl;$#`$CRxR zhXV}Eago`Tq$Kn-Fxpi?)V&d-D_6HFqZ`pFQj1Qawm_M@YMbR0^!-A}PVKE7j&bLN zT(N84wDsNyM(kGpw~-o}E?;6xI6Vqr7mOAoy{P8!PZm43*ltS_R3Y z?V9Da>;sn^sh2oIgssCKkfAgg_5=9(w9JL~lnqG;Md+aD2&~f2JOiLxVzhglNK8bu zM)++n3ld-F1KUQ}Kub$n41VxV^MyUbVm9a`lPZ&{AVM&r>Gs(3aTr*q|E15^kd*6) zNLe>yoTVHQBPQYFyznVw>?sNtZ%}%GTH8 zFBE#oES>WkabUajM$xL^M(wi>S%-D{THQpABM zJ!UTZaST23>D(LkML$>_3AYLg3w^dvKx8Y5V_Gwx2y+El)}) z3cLUwTS;R?__}Aw+I3!+pJ}Hm7w%-yp-Pp_*QkzV7M9=EdPdZ%4eJKAB^(~UUoxO_ zqY*hY*4=%$`r^EC98JjDQ@kQt?}6z_;GR-2$#q+9_Ej>RC2( zD~2n{(O)i_TGNAmkqGX4cBFhfv}|KTigQrI8j^Q* zl6Lm`o}6|_LMRyXBk6V20>o;_07VI2R|hteKB{;-}~?=aD5;OWqbEv zc%O{ZW==)fc}0NNhI-mb+Lmhi3)F^Y+HVJ={{AW8{`B*vH`-cCM7?L^VUZhyz)_pZqM0osX=I9hvWZ^8NkB*P~m`2PI)015W!z8Hi3Raj7fBRWmQc zcEnKby`Y>#0SBgJ@z3$Fat@eNilAdmpy|P0WuBV=1Rm(QizSHoa*~FS=_*yP~>$ zJn$Z+MX7TwsqRcBqLl+u>Sd-(e1107=6y+&P8*Uj7B`K`tM%T%-Ky8@cXiZCUTTgd zjRg2E`Y4z`54pzVcO2DLZEXoyybb8yw zf1ZP$|tCEc7Yhyc;m(inMR>IkzFe9uLJLxWb~sK;2kZi zfK(H+a+dh*jQ-nvuhw~)52`C*(;SRiKdZ5?W|XJ|ys~1lbhT$Ws91l-V57xF>=_~W zx6M)w*sN(3)g|u%GJj*8G1tOuHpb9irREkfohPYS+n=|Xnjfy8tq#2(Kn6eANbAFh z8^rEC!%ogBGGLOD+N+3c{g1FQ%DQ`JehE*D?GyDg=r8ObtXe<8H2* zvC4?+O@5m5?4M{B!Q~#(8J+dH9CYSU8;8Cmo4{qgx~uOsOl*u;Xw6!}7bbnwAsF;Z z#>}s*(7Bzs_)Wb}XL20t3!2^3X}l~vgVU9z-=joaET}6qr-AbOXal)x>$r&W`1%T& zhv*=MpUypv<(HaW7l&$SuX98Ek)#`P4#GwtYthgNF0d9`P`=b8Bi)rXO} ziOE-{M5i-t64KPh8s1;(qp_LtdSlR4@_ibAzlC z{3V#l!rybPzg3oDYw}$aVh1zidFac>`smkdy=GOJtH>!)Kyd+gzuta%i<|=y;V5qO zB0&BHJV@lIZ=JH?%Hf@F+WFsDf+|VxeDqDh9Z2Jv=LC!?TxEy-^5YhR{7FdklG}XE zuPAx(a^;R~@Wc%ztr5fea45nDqVCKs@&z7>)U4H9kBiC)RoI} zHPMt#)IX}4jmpOTs@y}`RL^ueQn6BYwjD!8m#W`e5JPsUMmxoWbqcOYcV6+?@>LEO z7y^%f>6y_=;dYHaWWzBesaYEA{ze82T{&J>jeg{fcbFb9Xb}aMe`f+UxLAFDUbeR4 zIbvmCc=VVaC9fdYXTrv^YRapIP4VDiIw2BSiS6*byKw!R$!3?(B{iqH0aMpapPChr z11rWj%t!i5kOS}NYxfBL?$A0zZrY?{2ofXfTh)IUVUj`JG?Pjta7-@LGwX89RcY_w z!T=_Z!7AnY+5g)x(Qe=z!7xz({*T)Z!S05Su>HN{heK&V*D)jzMg!K5nE{HgV6AS=@S~j3HK?Sk7Wd-hoE3VJe2m|VQlb$s*^5&w&1CzcM=Kg zBTnHY2nTJZ5Wu-hr?hlR6X26Nh4eY(AS9E8uonudPs0E~*}uXpVAl*3d`Sq&%AbZf z^I5@P(+EIH@syU$(1MmT7XfjVzo-_#t$qsGXXOE3%~NPq#(COv!7L0Y(JbA>B>(y-{kNIX->1tD*ZKdt`OVtsbUrQ+ z|L1%}-v*PR%%A}=9Lyd-00|y{Q6_ME;3BZ=OQ0N}#uqlSQ$rZg{tGiO>USC}q~Ztb zzd+%?`8fPNDngqdemm$?NIED4|E)OuH<4W^LBt1&4MV|@K^V}H1meQ_ihmjp`q7)e|ePD*K0L-`AIlv3WVTwiu*9K?I2tPO;%fK#=i@=j8;c_sV>VpJ7ghL*sO$(WwOL+Zq?!0CulF)v8%z zn~m(J+ztvpiToqI*%Gt}2Ru#cLh_^=%cTyzi`OKnmG|02$Eh@0-?wNCwms76R>qM#VQcf1}C(YR0wWiI)#OX0chV;GYnaoS1 z&}-mlCQ5Fp=~;V-?m=#8mtk)fx7V`38l?C(wwht<$1O5~(qD~=!`7_X6u@6&yc7%$ zC_kZA`GMDIr3>{B0FCh)&U(C7`ietj(^;74>FZRq+X|-Z6#@7(f4mDXe!XxmgLkHV+pJell+Ny}Wr(JqC1KL*|Zh$&RI$%a~QG~e< z0XAjqoM&EBlZ8uH+5uRH&Q#4_e)=;?43w5qS#1?|_+3kI`mVSm!oCFs+6r*`z-ebI zEBNt_0(QBVZ!Vz+BL3+_gCh1^9Z=b42GZmU-O$v9ROH^GZXW4Y5z2S&cfSB{-v|j` zaVbMog0r$O;CzheMQ04HUa30@x0p}nG|tXv_#7tIRpsAzhL>W2;1}#EHBB2&NS^Vm zHY9TMa_$G@XaL{`jk)6~@t)y=B)*z$_<{&7ad*!AAHDG($@M>qQiA!2f(H;8XX+P{Me&e zwI#!H=D!D6ex#pyt_L1wcTf^{X)?!L2{vR802wvPC~3+(jhJ!!;xu(>=9VOB08Ist z75FKfz7~8k<~CAGHw6{#IeuJ9^9^neXE#@7ZBJ7qf0fmbQPDFp`t>co&Y$j3j8PXD zdaa0nO>kl(*SKtO~+E>k|~vB5w<dSZcb` zdb3s!Ji5c$AP7EOT8S#L(RljAP0}Fks{i1w($#g?sMWv?A2F1 z+lSSSX1)5q_#wdk)YeA`KONiVebp$?j3CSi@ZBW`A4D9 z4#zZ}7_-bK|C&Jkur`7kdhk!5LiA>7En{eEtQ5PEnfXY0aQCXEWz9UD$wfIazH_wx zN*#(w2U596kBiX6lLO){Zs}?rD`O7WxNu}GPpv?aXkgqm5KpLL(q%%y$6qKvwOA!R z=nJi75ucAK^dBLapd%VD#(xT8Lg>Cn5BG>``=%&=^$ok}B#ja8QvNL{A3y#}dfzWk zU^IpAaSdMFd$1soJzM6?Fz-*A_$_>zN9M#vlp~l*Vf0J&&-rPy&L2bD_PQ-@=KR~q zD|-{3((TB5)^?1v*79et+JSy}@X692kpCqpne=vQ9uWkDhX@3O`2P_^3artF_QV}^ z^NRy%kahI>ok%6zNT)?PyqM^g*l3baNG8=S7N1P4otV~_7z|;uKP(c4x|x8w?qu>F zo+ArL=D3t&512Wd+?>?%e_v#1i+(w=77QTar{LIM`e2_A~nf5>Hr~C}b z0%d@ubFbaS^Lak!jAx8JPk;~Fd0)f3uNJGH5n0}I2lNl#5Wl^W;ip#v9kG9VC28k` z!%v9fkBT(kO=$+jK;<*T;m2LE$J;_LdA5K1C241FjNg>sX;wd7Krrurk|qD17vj!F zVZOym0F2A5DM@cteAG!9A~!UeK0oZcM~S?c!4EIR1Ds{1iC4l3^q zDV$Z^;>R@uBX%O6?kOJSTcyrj6TIrZy2tu7f5qv#CHJ>sJeUCeVgvO&hu$>i#pxP1 z%o5M8TRWz?(nIq68x9$!sR=S}qbjYS5)_0|mb2Fin}-quz*uqDO^61I@>e5=-TfWZ zL4EK7BSXfuPQ{C|=au$cEF1WB4LatP8MS2qg-UB~eb}?-hEhjTMWdQH+MeZj6Si9{b!@}fsaI%)ZHu_+{h+~7_^klVV#cz5(UToarJ}IqLX|z?Ph$Uz6 z6bes^&WbA6GS2;D#Xx?laT8=VJEKKnacgo>o;MoUv6Cgc&NOv&!&b!9SjwBqWCHw3>AWGx&GbEBmgL72CT8$~)of6RbGo%W&!ermUVTC_Ro4#`E2j4Y%w^L7<) z>OTio(3o8&sPcO+tjFNEEs-!^G&!S&Zu77qxi$|?t@Jds6SGr!v(d&_ttMBxWi0cy04)=)1uqaNj!f*zV0>vAWvc+o>=teimK!NTX5Ru2z|)aQs>bb-b~Gvm9kY!Lh>GIuE#Y@;L=8ri@K1N zxr5oLPY91JDq`Vv1aa@5Qp-JIR@J7eWl{|p+SY&&KZMT!(3*ssK#Cp3my3_hpMmjH zXO^*f2k+F*do{9qCmU&h@=>*(z|R1Mq>h4cdV+Q;=Ec2Xkg?jX&16-G($=~MwsBtP z+)7j5o2U3CL-q_pKG|+gMR5=vr4EG&7S6z*swL9SBS!{(v8*y^=MkXUB%15v1JFW1 zG`Pr*c4;%~Mxjf}nrkGw`noFW5bNZ~U3 zi(ujad^5~3iZES*{bfjd>3z(!In6QU15PoNSogZ=VHv%?EQd!kfZy5kssVu^^aEGH z1EnLTzg4H;w2hi<-@W!z?*}Wqr;Vo3}$KBg4Me`-a-(h8i&6&#qGcXH#iz`OV9Z zaIYF-0pj)O)}8hQd_JcAdnV6FE7>PUhb+w+p8vn#A{3ROjE`r*E|>bGJ9gpC*_U=5(+z5by(vT+mOelok>&KSW z=`qDBdqMe6&HqxxA=>uU32aE5Sjl%fhs?kVvv!xhx4GXoE$?|(Z{Oe4d4cI{Q+aY; z>G82uLVG~b1Se7dlE5E7R&z_AZxF|_vM67rdER7>A!g!A-@1}G`LR#e2YK>aJ(;_I zt#Vq|!l38(OVz$(9p_RjS`oe^=Xu?H6hO9>uFvOX+)-@v@O+vpE!r{di@#4$pvJfMj3;D#9g~WmKJ=TZucB0M3@i zF1H~}WOcfi&0}=tO0fNGW986^L&ny%D^yKB6Ek^v;8!vU?l&89$~-_v!t%`4&llkk z3UJ?@pYa%`tV*``b3b6O%_NLlR?6bZzX9_nHB2!uO=*CUsh$d3h6R^b-Ezq#B+7C!~;Z$>w$BvB`0B-1;zLa|E26 z8{n8lqlZzPKVU*zm0w6S_)94&ySW7)uZ=CbmrsS;Rgjm)l~#`h0-f$6yC?u%T90x;5A8Oj*H@moj# z*vF_ z>b3t+*lY*X(T!BmCqAxcDOhhq?0YOxH2eqzzZhvl80~{;TQQ7U#e|E6U1{7rgx4FBwNz zaxYZ=K3(czULx(IS*QfctTQ00=SF!xlk1;w2I9fQxgTR8amc5f(mc^w*cj@M-|p%W z9c0ySRO-fzyM=jWhC-n|3RUNaPf>*k9rS5&XWZ}M1{{Qwp1e=rC>=)Nyf%|wu-_~_ zAdKoRsDBa#Laxx*A2!vv;my@yYLu+XMDjGwSYdj~=VW1es^^#?4NQQX@fMjEr^hDa zEuHo_Bedl2{$7h~rt@Uns?~G)${_;0F<`ay!+T!=kkf1&UbOg-%jJ=E5(u??5Q4Bz zfZW2I{ZP2=X)E{rSB$Tleg{rJ{G=tX4xh-*f!>~&q~_aDxXLs`|^g#ZxjMs|xUx8!C`*s55Zc{Ju(h(&&w+ZFlbBDw_jo{xd1#( zDoNZ4Nqqk3az`ywMlD=?LaAI$%&3$%tcY`B&$ z{CXm2jbVaFc%t5$S6;@*Plo`{T(D3^%w2gW@7btRVzue#hzpqP&E$e?e>tfHR7|>6 z3jO6vyqadkBr4h?-)qb)a5chp^{|Z#)-b(2m5yk6CZ*f4d(Aj%#&u==gg-s^$B63z zoyQ%URxv!~6|?TCs{_p=OE**&^DM$Mh4}27>g|x~ISjezFg6Zn4_l<8f~u~BstN;b z6(M%ic`Yv8biS_+nK8ul7p?q74f;uzI9w%1Y^)o%uN2;Drghm#EFOy zV3?LLj}*Qe@AJ5y{wCkSDOmZ^cJxfb>S5b$bRjqikk&oJfSEQ1CCfyV#=p-Gw>!$`VI|CJQBi44rq zg7QQgMgM`yX)aqPDL}op5-=5_R1T*86=gvTE$v7o1V-ZMf7~nu<LG@S}O!gH7P|wNDG85djI4 zTVTSPTOl&sbaZFSy;ZZvO+!Q00S25^zvF|PeLaNq>sCUUsq#cNxEhuH@~jB-QCpH3 z(b0>KVpP3%?iT5%RiAPluT#0V-l8?WO&YX0y3;{_J#>RHxE;m)@+^W0;H36!iVX3L ziiGs63T&&;q657d1&1McI=rSC@C=LeIM9E%+;;Yi!`rzW6&GZvC?EPf`T~B_2>2sb zju~kU|0YnmXOckomFhP~zjP8G)^EQU4Lc5vd%IVLBuvU9OpD4>x|jB?gvlGRMB^jj z7NjMX{=pMq3}Y;RBk3(Zn0$*2tgBp$t%IJrSle8{00=hLmHoL*n7PThmhAL+b$7c( z`7Ne!R`y)lo{ML7(NLr1Yy=GIThd_7XnZd2F^nsN4^SF^X?@vAt(Ef8MJQvKY_v4g z^l^ygsq@!qtS~X9!*1e)O%B0*fqm1N_LHfK7)l(ebv;Noe!dtz2vu8%zPSJHL{EC8 zo3}(9Q2~=BDP^ByGdllvDmsrYL4?QFPz__{-q^FPTYp+QTE& z&6sKnO_MmR(g#?lky>!rMGeE(Y9MM;U|y#uB%*;1v&eVROYB4v|Mwt#X{Fa$vJu!= zv!g=uuQSGMU*92>vshCoqDt9o>2?>K>P>K~@R>Di|lD?w4 z-w@H9rQ6!_VsL7?VR~ow&c45g?UCB4^|0axGNNlPWSmnBl!kVc@MbouMc!FTLuA*! zoGeO~=+ls;2kkf7+M;X%olh{4d}he)zHMg+9)cfG?9$e8R)MM&9EWcltT|T>ZFHkQ z75uFP{2i)<&dvt?oDdozl(E(kmAVuy=MuzfK2y!;>|0N{+RnWiQUP+{h~XTMaxC^2 z-#9OYl7pxTD@LRx`&4JEb7Ey8gPiyDAAjGJSi&e=?yzAR6`@5dKGh3!GOnQ3(SKUTHioCUU4lr zN|!`KG>s(s{!JSsgt*{OZ)?;ju~jJko9?aXXa;`q6-wXCpdTJ94E$*K8?t?& z0~hZ+u(yDFnW4Y~oXNqQDOj5XA@%-L;Qp@jt|`n<(Z17{W&siL5Sn;0V1RN0UAX{S z{4GO~lS}w%$y4D>` ztW0}u@ij4ev5XI{@vjhKgy2#vCeuvOo$<`B0KLG&Z}cj%!o#Rp%FA@B16&8BHc%^4 z2UETuJZ*{Wn7J|2mfBTJ%^0aXcpl7Z?rk%?HnR`u^KA^y}m9^A;{dJ-X^d5E+jRZ{~BlB`cdeqlbR z0~d@ucNeXr%#+;TLi-FL-KuH5YB7=(CH+mfH0GW}4FHGK12yZpHU{LY2`*8m>ZNOt zF$D3$c9lukXla^FJf$Z?9;FLI&MG=>o_O2onkOf6oefV6-Q=`p>m`%CsTSCxPWn5T z)oAalVU+0lj8h-ub|sqDBNYlYZt`XGK#obaOYyApUJ#5}>O2#vn)E-l73r3PTIanC zVV4B&22Qu)z8A^)v2`d8FV8Dl@8M|U`s6N{@@<}y9B3GizhTquc*J|cRSkG8PkF06 z=2&LOe2Nb7v3qmR^7r*NV^jhB3Q*Hyryif{p`k%$d-IBCa9h^|JAy|IqqNSfK)x3l z`K{O#vy<_WD?E-$6%HdVx4(16%&?Cc(D+m(DuB(Y-rj7vl;VS4RTaZ?%$J-7f};xN z?S}a%0psf#w+iy4!=tFcNXxt}1I(7#z*Ws5HpS8~&mL$(+qJkX;(uuncf{ep-9?Nr zEmf6RHY zK;haczzfv(q$HE3!U?-318)D91$#M1tckE;y)#;%>0YUj6&7cliLv3FV6<*%gB4m7 zbTe7dhf`A_4r2UR$_}pL=f8SrmB)Da&sTWvy zoYD4sM}!8Ma3t%K1*Q^HPZCD(Rpf{L?aD==Qs;aDOLCYn7%BdbK({4knH0v}>b{os z=1P;V)Q*1)804$5-A9$qUtl6@+~KKoyL6KKaH)?D;`!9|EJ3h(^|CXqTU<2!}6KX!Ce$@% z21n;Pbe&(!VJ2^d(~;VF=&JmY*J5=A<;GW3u`}a*?H6>}Ml`auW&aS9g0kKqxNXr6 zl7QKaKrJs{G!OKDKaHbwNuUc#BA8ZLI<_v1`!vCWA|lLoC`81;5XCuH2wB8Ute01G z0p3b>HIhA-Dc*Tn;w5XgBJ(4kLN+}P^BOgh{Fj6;s^WhfEI8M<>8P3WW`AZpzIQ%* zUq9t%zE2CnK&uA?PmICo>=U=T<8iaH&^TkGff&W)cnQb@;lV{LX2o94(UNUpcO*B4 zQ?!ixCnZ~WrzZ&5(A{zpoCY(~IggH*2K_}{=G`cDCW)Gpp71x&`z>-Gok#|=jXOk# zF`lS(-5q$Z2lR4p8o9kSc*@;9c+A~FS@TFYhsPcho|rrIrtvjWd;DA7nggFAp1|LP zz~B2p#J*Azr~*^CgvJ0$GGDb3o-M{jXhDkoLlgy>w_u@RP>TBe z!+LMAm@|#wQ(VZ242sgSY>sUVt>mor52KBF`leNm(sa4$h$r_p)J1bSjFu@;Z$7&! zIZCBX~N)<#aW$A;(N83>%U=hl&n*EFUbk5OP5=GAirufqPu(R zMLAn)T}_mlUdw~`64F}TDeIX4~?${v>N4X() z`#8z@3iot9)%x3*srPwddZTWkAu_W1lN_B1`^`VZfLErGlBKf5Ff>3~JJX=C>RLa(jHxP*MlJ6`C&ns-oN%Kb@i zNr8fgjAD9V>A~e1w01+4@{<(`S#6i&)-w6lqlF4=(7~PT?B*HtWY2ZJdw=(DVR8qG z`xXGVZe{Y4idL#3>zb~?Iaf!=P2{uy#*yg0l%_z5y$@NrdA&JcQ%Tf>yD^W_e1?IQ z2OEuEYR+S#jdE^Us(~JL0f)6s<>5(fUua@Vt65C8a-J`{9@*(AyJgx$*~^1ap;ubw zTx65u2Zzx*MM}a*CW^VohziEZ5SBBYgY@1;ri%IBcE7aCidMWiJ(mi5$me9sQ;|lO zOQQ*vh1gbEx6m;lvo%{~$yuR}w5Gc6jHc2)^NCVMlXaH1-Jq>CEcZJb_m29ME>CKS zSCnZ+*j276wPaD%R@u7y4`f4>!pYn_8+(G~v-L{1*GwiXaYK19f{03@<*$7&C+cD) z{~%@tDzqtc@+HMxO_W{ro>ql6C;5Hwg4Q>?WZsobOE)XvIhKdkeLL(5n4=|Q`g$LJ zZ#h$Bu<>x2yzXSFU1k^Hnmf$4TPi2ZU5ko?y}NVFG^B5zD46P%diPc9sgX`*(l@-; z$Gs^k-BN%+LCW8&i)_Si{-7QFaY-Pi{p)c?{55+|eDzwaznRwL z2}Y~4wS5ZH3^5SE+OA-PmgVKzlYeSnPOep49GV;) zgCsv>Fyd*bS%nRKOch0a=s`p4Y)&>$l-oC7rwJaXoJe>O331{R^Q{N78)vQ4zcA&X z<=|WP;Ifvp0>sZhY`A3IqtLsg!4HSQx4h8Xl+e3n(A#k+y@aGw{LM11ga&;510we*h?m4td*dR>tvLPzJnv!BHOiSHL%smed$gA*;DLX;zplORIwb`ya4wO_9FdT`poYa?8&Z_JGaQd9wc2VQVNpK zubMD&wr^OdpP1ju!lWF6&^&4lDGLlYCHN%vIDyVwLC<9}7Ig>6W{OPYp&P_$2BXGp zgrWO4!3#*&Vv5NBkPzq8j0ZLI$uv#8kN21vo3j?A?bl&Hq;gi2|7BAxZv+5E|GJ_Iz@Ak-kU-ehrEw?s= zkU!N6op-HzH=hHRYf}blrxWmX){qp{hy)HCA(kP@AqF_h^Q}0aYG&0}OT(A$c8Z=3 z@3~ca?6x-=?WbdW-Q}w@a+iLat<=VAW4X7E=@<8u3flmF^Yq&wIIBt#poMN2f;wJl z;Qkx>J~veCnq|0!%Pp1)FTFjX!=uGoa=$tyY}GJ{D6x-uyP3EEMwfpgge@HCnIMj3 zK0EXV9**k|1o`ZS${#lt^My4)Gr#Uw<2HCJ2{DEJL287u9;-2l`GC;E5ZcX!mZer_ zAt@?qJN=+&sO#!xRu8E$unmjiUz~E5T0dNmGPL(S$(R%r+oTjP8wC}mM?R{|IuF}_ zv6AADhxQNd)2s};0yD#HJ+xj~2X^%U{4kSQQqXTN4u2-MSfuudv0tJC4Ccx>qvc;x ze(z{Hy^fJ*X$Tae}2IL$`)e8ybk+A|PDC5o;*pN&5 zUm^C%PG&{3v7w`zW^HFv)3uG+^=HG+uSEXe zX>Z0jnb+^P&$p1zovzm{m)Qrw(_Ej^uzwx5SjhDIf1v=3r?RR<)7ZcTxr9`UQ%z0#>^WD~5LzmK=r)H2@sU4Nk&;@oSenx9&d_==twfQ?K{rZ%0*4H4yrKzsDfNBdALL57NOqN?!ebG<5S{QH*yQB5}j4- z^%u!c=x~y)mrp8n&R~Er8JZBqToA9Aa|m94p}Swx>I*rh&TIkqKzcJ=;<9t9mH&d( zUdK(DG=nWkt^nxvd}-6lY2W3jFZ$S81K+aQ#%^oh=_oth3NHuwUqhXSqpnQ4qrGga zo8WnBT?*|6y^q=Efi<6uz4u4%$EvVtu{qs>jiP#{Qeo06E>oR9b$;7UM?J`A`Ws0Bv+?h#Dbz)gpIG?FHMjqmz^rrQaDeuA64XnL)uWabTfJvtq5Y z2?rMmg<>b+8fGkh#Jhdo(nWH&CAXS_Zz%dlWe}0 zAm-~7I zFDBv`!nYaK)1!xYxj^LVvLwn@=sqzG9se?_Vh1kI6W32|>Q5e>v-r#t1~~981$r?< zjMIe)FWAazBX`wVqs~EF)ke8yO5=9(;e!PYTvfu~-z3MR$dOXauWUfw;l7l_>?HeHypmUA<82 zDGfYyHL$l2{4L&Lq-9C5SuNFeXzJiI6xf=C|AQ5$!7h=E9f)uVAAtqO>1NjVC-jMv1zMG8c-f2U5?{Es#<#?APg%-c?~3xuAC>;D&}6m=DbP{8OF>BUd6Hjp^{## zy@h^Er-@Nsou-&!BxmIA2-?)xQ&Ka)K@%}RqiQQWB zyIYw|@b27RoMe{y1cR0s(7}No!INDdv4Cv~Z8n65jF8o&+{JSVWG^C)C9hHAj0Wb) zvh;(h=#yZu9)t_xi$T-kyWtqZCP^gnk~-O%&Mm8|bi!JbC(4I{IOug@vy*S@FuZ^k zq{>xXL#iCm&aFVY(V_MUn6~NW-htfUeB&@E48Ka`-)Gc1rCx&OVYk)vR-+~dFt@;H zR~tx>)%1g};}Q2K8aHy68L|}BM*@F0Mmnc2kE$}WW^M4x&^dHbe@3-(CecKV)ZPds z3KrZ`f-bj;c^E2`n&@a9WcF{FyX60B@UQdD*C zh%eBkT8!7DO^U(1!gG>_T+*!bP!C#lFzL8_L_1%W!_}cvp1LjAChfPVcmmaP5$f~@ zlV(%LZ}~#T%d!BE-f0MRjWQbAe>SAX;LGj-8c~=zt)1!kwi?(w5fO5by-H)Q-q8fB z=N?Z!IRyDoZr&LN^XEQK^$MDElCB>}f3(W{aOTc&dJ$mrG?r#;VWPZQ%8?i`1G-2#!?(D2t2c=i-3U)SCk z!MOZc{Ap+tNB>%1UZC(P<3pP34~t>hqwo)mbDR%$;k~BY4-QcORMa|G+~wh%Lefk$y1cZME)05&oScHkPC6+4#liit*JGkj=Jq8rf+1K7G6T-+40|M6si z+lso6VD?bI9%aY9foYIka;Y0mW2)6YU4vPqd)oC*kQ3+lcLv-SY7vqWjikpNMV4%! zp^Y+NM)KVt=62C5{_k+}{Si*oC7f)!g&Shm6xiwKd%8ki*`}MHKjG5*CKI*Cb$lb` zLaDP4*Ze*Q`<8KE2k_b@@^JVb!+$e{!s02UD_VBiu?jFU*ou6aNSqS<;I#C-+`-vQdZ_m| zj$-Y927~xj0)5O!(VW-2bC`30Ly9m(WySCJze^fZ@@5pH#Jf!tBO3F}#w0WxdH$)5 zS^RZFF1w#~^$P{oJ~k!tMva`LH$zWcpj23OMdBrQg!jwk;NB20xDMh&SMkujpJL(& zmOVZmV9_V&VMz{`-NW+y9b*K1HO|!CRq^~w1cvMr0LuiH-dDeHbAXe7MWruvkk6ki ze|lzsNRmGZ|B@T-(i`PZSIM5gwYT_Ono&98%1=v=OZT^k8~zy%;RBb?37-^m{*4%d z!6;^bWng5R;0HHyv(oiPCpwLv6OOP7Xx2`6Cjmxu@%&w!5_J_$95+_X$-T1=cZU*| z;jEvSf%nV<%G@msi)d{sYB?aqR}28F__D12Eqie#Fpta^scXu#u(?VmJ zh|6V>H#2w3&O6uIfSr{bnSQE_O;`W{3o(iUW5E<{kxW~RyF%* zmF??L`W8$cys`I!8V}~a`;-TheN()fCshh zWTpu(H_R+m1+Hvf4_75ugS-6^C~rc7XcXC{={AQ#%@0BgX?fBjmPOa7&YJp{?R2je z(E7ziZkpeP}-h^n3g z=PcvekCqjWT>DRI@oaL@n4&)BUoYqj z4IuWK3P~>>lh4>3%+7@G%9MdaU5KE1pTtk8A^~ zpLm#~8PHouXV8LN%>{wy?#SAaN4PlyQ9|)gW?y{-1OvX)6!^LJMq!v%wCLF~$zM`} zBB&D+aq$`&PQhR1DUwzP#w`P*^9q!L8y)G`rPQt%X6cem6<7|B=Q`2AWNyB5)F)|@ zX9|`EZ`6v1rL3`I;afLXes8hNA~Z7i*_PvPHUMhoxV-u6oLKjfrjO{3A5Hno_+hlQ z;0I;!R}foC3EdImJIw?wRC#?~seqKYM{Hsf8c%9~3#3_1F?KUjBBP44y`#qDL`SN9 zzM?P*VNK!p6TlIAfu$3O{PJvMkNqdqKGcFW|J3r_B|lS^^&F%7{aj<}#pJ{;J?OB# z2yr*27ewL(Q;t`rKt71Ar^MIag3U|A^O16~4Q#nr#9^m~t%>F6vMhtPJ7h;U^hXEz z!3(C~s5(gWZ{=0lI+S$X%cShsKOu3!9PMlTV#i64>53OGjW<}}H}P%5CO}yLKBPP3 zr-YD8#-*wyenrvJ%c1C=c=t&Bo6W&;j8cD?x9;w-zkt%L>hqk!MTy2mkv&Lr z+zzZzFJb*i}oikZz`{GAnGv``kV`_UGZu8 zAr%>XV2l9W%(bz*2u)48q>s22jALTu1r;8RdXO!=KzO(;-icGuB$w3%-AJ&QaRf^=Nk$;hBeo>z?LEhmV1SD)=?hd=hhMNW)5cOy#!Oc z5=KbRO8e%BJEaROsBC^-w+ns3i5!6OSI>_o=H!tX)U7o{KYE8-O}wR z6>Ybb^#aw2RR3V%T-d(|__N)7b0ha-f|_BJHR%C>XHcD>(M$@fo@SM7FWV5*=#bt` zKrF-o3gz!vZXs`Rsjjq@lLS;>MvZ9qo7Mh*NN|2oXy3o&!TmA82dap4=yy0lg0W8- zN_~Oozi_D1=3mc|sX)dXBkkpQ`TR2uPP(gIa&kZCNnz5GEPqSOO+t(?$N2dRhDKPx z&hf8<9-By)#U1=JeQIyA>+|;`{}5f{nBf=N84OZeQZcxT1+82kk`GebJouY;p~DAx zx%V9C{f1X-xXUMW%I)O~{=ns|Gzh9AD1{5u{dN9@@otZoFAu(?7Sr8}-MRyY`|)|6 zL%$plRzX#aZtG@3j%IaJc9Js|XL4XC4+SV4bcZr%P{k|pcvg*ay+T0@Y0$I^YR1Le z)pryY?2$AImlV`rQ^dp%bhPvxde9Gb^@@0tkQ?@jo{a7}K*^_J=lx0b1KZaH-YTykugF1=?-P!{s_#b$ z101oUUP2fa?pq|F2dUhf0rtfk1|k$#Z%=e+`n3#DtD1krv0I8)I^&v_$OGxmk`?>! ztjTyOgZp0~y_xmJT=!*7ti1!Al67tiTm7sZ=opzkD@bpnjvfgf@Fqsw15jSE!+%`T z|4aA2xy_?15W<>qc;t)I)TScGKt;TBDNv5aZ)wlf;|7otZ#7b@m~_J>Pbq-28`|h= z2&c)^LK;&#U}6aInyo8YW2fA%AyC9`C{k(gQEZySud?-DlUi@+Ln~Ex!;A?Qz>u07 zbR7}kn1m5?xxsKcCfJPXW-^q^NzlWpOhAgZV0G~>m0&m+>e!=0X8F8>*GiUABIRGq z64~Ig`gXB}=b$C>Vc{X+hhbpj;8g-XMPM2gxH~hY#Q{{gRB}~(9K_OPnSJoUVvsBJ z!I(8RGl$SPXrnTHNcTbt%sbSXN<>9&5L^+P6h}aNMtNW7$up~tP5GuMS(0>8`m5$@|#B%W}sN)QW#%r|TWl#R@ zWcLnIqQmn_8aamVzw8ClMZhRy;{=}i7Fn1?ObImNn|y&&IKCjSTSL|kzppui%O97Y zS+rK)?Z~0h(x~^WdN`i3gSc=ss?OdSl*?!##OIBw3S6=!Q5NWA$USliTFx@g=oDhT z^onX1VyVyASkHP`(;*V+iv26Qs2!vkd5D#@bDedfIWjJwgxUkv90Ce1nPur}Qc_GE z5aBde{91|<9bMz+Wr8EeQcMD{Fc zhLjIcxGFm_QWRy0EGb(=X-u~4k$vq`;|uv;^DX6n=b7i8bMEh)`_6mc_j%4U_kE9= zn!8C<42C>=E)wfBHS-=zaUc7Xlh+j>{fMG#9-{k?OwUA2?{V zsObP4UtD$rP-dwb8D!R$sS`rwEYcG9= zi!LP@3_VC76gja)uF-Bw1YEXjEfIX#-JOwfG{}Rpyz_-ilcvj-D4!D@!|Ute&|Poi z9xH1-IU|r z;}+Kms#AB874av^zOH3UJqnPU7qPBrllSlzJy+~jl+$tCWeriN|InqwPDV>+_=`dvjd?CDp}fe;@B6Tihr=M;4W&!eqg~nh&bYAvPmO9D0Y<}PnYMjMU!ivloY_X zUy9mg&I-lo9-{SECr;McxJYa}@-r!5wE6pD2vdeS798|y#f|}A%DrrgOpQrB=hmv} z6C!2e8d&sxRJbuPIhg%nf*09kzB@km>aE{pCk#ui_F5UtnOBI$C_QE z;y1!P(2XAo2H%AFnP{FLF#k*3=1^V>S?p1_X8G_3z7F|ysiB`|UhdxTaqbm$UonNL}{Qnxx1bT4#vXKPty9PmH5o9eI8uO@%cq}kFQ)W@uw&1msBuYISN%Vl#m zOn&-ajlgYm`7okqKs2uQ^>7x}Cw{Z`dQ6yAv-p|;Q=ZF>vsj+?Vo~hv9{rBVO6gI~ z^%J=oa?Xr$Yik8@a&en3haFS>IUNyGQjfPS`Jtsfju+5PTJGwD8d|IOy+zc#uz zK7IYX(fq0H#nQ&7&%N3n->*e=bX$I@OT|b=C{*jiBGw;Al}YN<_~_PHr==DioXd4g zzY-kb{Rm%D?}@pQTIcgi;QOJ}~AK?)A( zmz=dA1Q~SRv3TAyZOw_Bs&3+aI-umDuIQTHmUaOX(qf^kY;jbnd~k8LvsBS5(``=E zr`+p#Kh9#gaoy%-b#uf>R_CzPIg$cbzq;CwGOOE7!V{>uWqMY~vgb)wnKr##QZC zlMV!Bqi+RcB(XIX3@w`mry$#ki0C_$j*g!+-u7{@*>r>~@>Yg*IG8{%wN3hyiI*cx zznwRmYh9SvYulzB3^Q!U%(!C{q(_%WI~4KPtlg&1=G%>RyY!L_>H5;vas7Ys;?HOq zbZ*j>i>(>)Ho44OHlQayZplToZ&t^RcMvvZ@Zz5-iZk-2JSVg8W~~Bl+iYCC<&&n0 zKNAP&HeM{fy`GWzF@bl~>Zb8Bi&eVj9k0FvD4(-506kwl{MMz5?jZwtI=EOI=2S?_ ztuoweJ*BHH&kE0XCLaCn8<=&yX~Kkec(XLI>DlnGkL-_2m*8{0D>;f|%|}W&nk0J( zIUx@Z-QqchYaMsrj^R*>CYTQe-pGzO=h9P>T9_LMo}yo0_C0>|UVX{Tvbx<({G+_E z+mv*~8@(*}HzyozcJRe=x&DfYtUE&lbkBp}l9FeX;GG@wDGv`mwUele3-4s2tQzW1 zEq71N$zc(cDr{3dd3+RY=#>!IX%<&U@RYRv=#Jy?Cw_b9cglFIQEX(F{Gy>OHJ~Xu zydW}g7q&e7qj^N5VTC6yDR38lf{(6Etk!Ait4KXJX0u=WSz--YpsUK*siE&e{%$dvp$8>=!Vt;m z3Our_qOXFdHJqHEIApiQYI|Shy@dK$)Me(0nBy!o0tJ?QpR0bZm4&3@KOQwekol|4 zkUgXYd?sCH%sTS=fPLTX4-zD1X5lE|uym0jTywZ_Z#N8g@|7^LvyPp%mBTOpNp!DC zRg}bilrDpwJy9q3`R!JAvJ1Nx!FVg(4-`$A$yz_fwXAfYV%?49_0XnGLF|W&SIVuH zKc(z--Y66Mkg8sy)+Da^2hM_;I$Y{X8@Ws?bNqvad_#qchMn#AKPV&05C<5vl{PFF z#+cEkT$#7cR_?&R{qHQLyv8A}GZVax^H=L43)dr*4%h3Vw$<0O2QW3O4XkS;2 zu;OLDDJ3E%TfONq7A=q;7w3XEzo@`vd*Gl>5eiJSLRZTepio@06(-_PMuYg0q%DmxihFyzy)3AfWCX!g+OOFA9op7A1@FK=V$)%i-#p65u*Ev$8Z5qdjwgj z@`-n^>=P379_9gG`f-A5Gm3yPElBF%f>b29HOdOog|IS1-qeE_oy173+9Dsq2yV`> zqNtUkpkgkAP&T0TF9n^6(f&+SaCuf4wZH1iFNzp_5b;k97)jLr!XCdvA*>K&0^AU! zvMlqOK!^~vh4U$bpf1$Hw0R^DDM2-@r6a-dc~;cE?LfbRLG99%`Ul+G9V7&0BUcwsTEYKCYBurL+|tBRi;Az5cK=95Se{Y`AI7DY5{}#@AW)H2=0R}YyA_6 z0Yop#1J+hl717AN_hV4hyq5+BTZPu09N*_hYYSyshN%6?*Drz?J#&tlib*cX1I{i` zP(?6l2?GRN04Lwcfy0Ze;N}t%@bQLJ6ez#U3Z5QCp#B?WP$1!7!Jv+M69h`bMZx)D z3?Lp1X2YdG%@GXX9}N18r~!GwU><@Olwn4KBajsn3aRRl$O>AHB7xy6V8EyY5O5Vd ztBwMfMo$1FJXM^XMuCoFNPy=CqzZ%OD2BaJ1bQ9$eTV_EAZhFZ@E{&KNq{cnDnLde z6@NLd3#=wnMTrShARnZP*%Nv|S0+_tnA8Q{=0Kt(=&8gFCQV|1o_wn6dz=+CnP3I= z)PBFycp>%TRW&w2JtA3~xW7^(TU>#->$@TL?p34@M0e> zCcZMNi9DqbSU!V9G^jBx50tln0n_F{Ll>yvDhry9L7naK`<*A=g92YXfT|Cmq^H_m zze)W6eE#=Y5TpO--8Z{G4lAJcrt`lCF?wh}@EFd=ZDvFVtyzM>l%dZ8G?w|vd)og1 D86?w= diff --git a/js/react_native/android/gradle/wrapper/gradle-wrapper.properties b/js/react_native/android/gradle/wrapper/gradle-wrapper.properties index 51d930a381f3a..012d6d90445b4 100644 --- a/js/react_native/android/gradle/wrapper/gradle-wrapper.properties +++ b/js/react_native/android/gradle/wrapper/gradle-wrapper.properties @@ -1,6 +1,6 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionSha256Sum=7faa7198769f872826c8ef4f1450f839ec27f0b4d5d1e51bade63667cbccd205 -distributionUrl=https\://services.gradle.org/distributions/gradle-6.8.3-bin.zip +distributionSha256Sum=cb87f222c5585bd46838ad4db78463a5c5f3d336e5e2b98dc7c0c586527351c2 +distributionUrl=https\://services.gradle.org/distributions/gradle-7.5-bin.zip zipStoreBase=GRADLE_USER_HOME zipStorePath=wrapper/dists diff --git a/js/react_native/android/gradlew b/js/react_native/android/gradlew index fbd7c515832da..a69d9cb6c2065 100755 --- a/js/react_native/android/gradlew +++ b/js/react_native/android/gradlew @@ -1,7 +1,7 @@ -#!/usr/bin/env sh +#!/bin/sh # -# Copyright 2015 the original author or authors. +# Copyright © 2015-2021 the original authors. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -17,67 +17,101 @@ # ############################################################################## -## -## Gradle start up script for UN*X -## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/master/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# ############################################################################## # Attempt to set APP_HOME + # Resolve links: $0 may be a link -PRG="$0" -# Need this for relative symlinks. -while [ -h "$PRG" ] ; do - ls=`ls -ld "$PRG"` - link=`expr "$ls" : '.*-> \(.*\)$'` - if expr "$link" : '/.*' > /dev/null; then - PRG="$link" - else - PRG=`dirname "$PRG"`"/$link" - fi +app_path=$0 + +# Need this for daisy-chained symlinks. +while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac done -SAVED="`pwd`" -cd "`dirname \"$PRG\"`/" >/dev/null -APP_HOME="`pwd -P`" -cd "$SAVED" >/dev/null + +APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit APP_NAME="Gradle" -APP_BASE_NAME=`basename "$0"` +APP_BASE_NAME=${0##*/} # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' # Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD="maximum" +MAX_FD=maximum warn () { echo "$*" -} +} >&2 die () { echo echo "$*" echo exit 1 -} +} >&2 # OS specific support (must be 'true' or 'false'). cygwin=false msys=false darwin=false nonstop=false -case "`uname`" in - CYGWIN* ) - cygwin=true - ;; - Darwin* ) - darwin=true - ;; - MINGW* ) - msys=true - ;; - NONSTOP* ) - nonstop=true - ;; +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; esac CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar @@ -87,9 +121,9 @@ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables - JAVACMD="$JAVA_HOME/jre/sh/java" + JAVACMD=$JAVA_HOME/jre/sh/java else - JAVACMD="$JAVA_HOME/bin/java" + JAVACMD=$JAVA_HOME/bin/java fi if [ ! -x "$JAVACMD" ] ; then die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME @@ -98,7 +132,7 @@ Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi else - JAVACMD="java" + JAVACMD=java which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the @@ -106,80 +140,101 @@ location of your Java installation." fi # Increase the maximum file descriptors if we can. -if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then - MAX_FD_LIMIT=`ulimit -H -n` - if [ $? -eq 0 ] ; then - if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then - MAX_FD="$MAX_FD_LIMIT" - fi - ulimit -n $MAX_FD - if [ $? -ne 0 ] ; then - warn "Could not set maximum file descriptor limit: $MAX_FD" - fi - else - warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" - fi +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac fi -# For Darwin, add options to specify how the application appears in the dock -if $darwin; then - GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" -fi +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. # For Cygwin or MSYS, switch paths to Windows format before running java -if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then - APP_HOME=`cygpath --path --mixed "$APP_HOME"` - CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` - - JAVACMD=`cygpath --unix "$JAVACMD"` - - # We build the pattern for arguments to be converted via cygpath - ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` - SEP="" - for dir in $ROOTDIRSRAW ; do - ROOTDIRS="$ROOTDIRS$SEP$dir" - SEP="|" - done - OURCYGPATTERN="(^($ROOTDIRS))" - # Add a user-defined pattern to the cygpath arguments - if [ "$GRADLE_CYGPATTERN" != "" ] ; then - OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" - fi +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + # Now convert the arguments - kludge to limit ourselves to /bin/sh - i=0 - for arg in "$@" ; do - CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` - CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option - - if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition - eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` - else - eval `echo args$i`="\"$arg\"" + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) fi - i=`expr $i + 1` + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg done - case $i in - 0) set -- ;; - 1) set -- "$args0" ;; - 2) set -- "$args0" "$args1" ;; - 3) set -- "$args0" "$args1" "$args2" ;; - 4) set -- "$args0" "$args1" "$args2" "$args3" ;; - 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; - 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; - 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; - 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; - 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; - esac fi -# Escape application args -save () { - for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done - echo " " -} -APP_ARGS=`save "$@"` +# Collect all arguments for the java command; +# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of +# shell script including quotes and variable substitutions, so put them in +# double quotes to make sure that they get re-expanded; and +# * put everything else in single quotes, so that it's not re-expanded. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Stop when "xargs" is not available. +if ! command -v xargs >/dev/null 2>&1 +then + die "xargs is not available" +fi + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# -# Collect all arguments for the java command, following the shell quoting and substitution rules -eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' exec "$JAVACMD" "$@" diff --git a/js/react_native/android/gradlew.bat b/js/react_native/android/gradlew.bat index 5093609d512a9..f127cfd49d402 100644 --- a/js/react_native/android/gradlew.bat +++ b/js/react_native/android/gradlew.bat @@ -14,7 +14,7 @@ @rem limitations under the License. @rem -@if "%DEBUG%" == "" @echo off +@if "%DEBUG%"=="" @echo off @rem ########################################################################## @rem @rem Gradle startup script for Windows @@ -25,7 +25,7 @@ if "%OS%"=="Windows_NT" setlocal set DIRNAME=%~dp0 -if "%DIRNAME%" == "" set DIRNAME=. +if "%DIRNAME%"=="" set DIRNAME=. set APP_BASE_NAME=%~n0 set APP_HOME=%DIRNAME% @@ -40,7 +40,7 @@ if defined JAVA_HOME goto findJavaFromJavaHome set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 -if "%ERRORLEVEL%" == "0" goto init +if %ERRORLEVEL% equ 0 goto execute echo. echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. @@ -54,7 +54,7 @@ goto fail set JAVA_HOME=%JAVA_HOME:"=% set JAVA_EXE=%JAVA_HOME%/bin/java.exe -if exist "%JAVA_EXE%" goto init +if exist "%JAVA_EXE%" goto execute echo. echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% @@ -64,21 +64,6 @@ echo location of your Java installation. goto fail -:init -@rem Get command-line arguments, handling Windows variants - -if not "%OS%" == "Windows_NT" goto win9xME_args - -:win9xME_args -@rem Slurp the command line arguments. -set CMD_LINE_ARGS= -set _SKIP=2 - -:win9xME_args_slurp -if "x%~1" == "x" goto execute - -set CMD_LINE_ARGS=%* - :execute @rem Setup the command line @@ -86,17 +71,19 @@ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar @rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* :end @rem End local scope for the variables with windows NT shell -if "%ERRORLEVEL%"=="0" goto mainEnd +if %ERRORLEVEL% equ 0 goto mainEnd :fail rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of rem the _cmd.exe /c_ return code! -if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 -exit /b 1 +set EXIT_CODE=%ERRORLEVEL% +if %EXIT_CODE% equ 0 set EXIT_CODE=1 +if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% +exit /b %EXIT_CODE% :mainEnd if "%OS%"=="Windows_NT" endlocal diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index 19f66245a45e2..1b34b3d302e57 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -23,11 +23,11 @@ # Onnx Runtime native library is built against NDK API 21 by default # It is possible to build from source for Android API levels below 21, but it is not guaranteed -DEFAULT_ANDROID_MIN_SDK_VER = 21 +DEFAULT_ANDROID_MIN_SDK_VER = 24 # Android API 24 is the default target API version for Android builds, based on Microsoft 1CS requirements # It is possible to build from source using API level 21 and higher as the target SDK version -DEFAULT_ANDROID_TARGET_SDK_VER = 24 +DEFAULT_ANDROID_TARGET_SDK_VER = 34 def _parse_build_settings(args): diff --git a/tools/ci_build/github/android/default_full_aar_build_settings.json b/tools/ci_build/github/android/default_full_aar_build_settings.json index b0eff75812673..1c7769c623d41 100644 --- a/tools/ci_build/github/android/default_full_aar_build_settings.json +++ b/tools/ci_build/github/android/default_full_aar_build_settings.json @@ -5,8 +5,8 @@ "x86", "x86_64" ], - "android_min_sdk_version": 21, - "android_target_sdk_version": 24, + "android_min_sdk_version": 24, + "android_target_sdk_version": 34, "build_params": [ "--enable_lto", "--android", diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index d8ea1c35c89c4..29c5f6bb34d7a 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -261,8 +261,6 @@ stages: publishJUnitResults: true testResultsFiles: '**/TEST-*.xml' testRunTitle: 'React Native Android Instrumented Test results' - javaHomeOption: 'path' - jdkDirectory: '$(JAVA_HOME_11_X64)' sonarQubeRunAnalysis: false spotBugsAnalysis: false displayName: Run React Native Android Instrumented Tests From ac9c135b9543ad0374fe335bc3dc5feb0f24f010 Mon Sep 17 00:00:00 2001 From: Preetha Veeramalai Date: Thu, 14 Nov 2024 20:10:07 -0800 Subject: [PATCH 05/28] Ovep develop 1.21 (#22824) ### Description OVEP development changes for ORT 1.21 Release ### Motivation and Context Has critical bug fixes Support for concurrency execution of models is enabled Support for OV 2024.5 Memory optimizations for NPU platform --------- Co-authored-by: jatinwadhwa921 Co-authored-by: Ankit Maheshkar Co-authored-by: sfatimar Co-authored-by: saurabhkale17 Co-authored-by: TejalKhade28 Co-authored-by: Javier E. Martinez --- cmake/onnxruntime_providers_openvino.cmake | 21 ++++++++------ .../core/session/onnxruntime_c_api.h | 11 ++++++-- .../openvino/backends/basic_backend.cc | 5 ++-- .../openvino/openvino_execution_provider.cc | 28 +++++++++++++------ .../openvino/openvino_execution_provider.h | 2 +- .../openvino/ov_versions/capability.cc | 10 +++---- .../openvino/ov_versions/data_ops.cc | 27 +++++++++++++++--- .../providers/openvino/ov_versions/data_ops.h | 3 +- .../linux-openvino-ci-pipeline.yml | 2 +- .../linux/docker/Dockerfile.ubuntu_openvino | 8 +++--- 10 files changed, 77 insertions(+), 40 deletions(-) diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index 5dcee285a5b13..e500957f864f8 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -11,22 +11,22 @@ "${ONNXRUNTIME_ROOT}/core/providers/shared_library/*.cc" ) - if (WIN32) - set(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release) - endif() - # Header paths find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX) - if(OpenVINO_VERSION VERSION_LESS 2024.0) - message(FATAL_ERROR "OpenVINO 2024.0 and newer are supported. Please, use latest OpenVINO release") + if(OpenVINO_VERSION VERSION_LESS 2024.3) + message(FATAL_ERROR "OpenVINO 2024.3 and newer are supported. Please, use latest OpenVINO release") endif() if(OpenVINO_VERSION VERSION_GREATER_EQUAL 2024.4) add_definitions(-DUSE_OVEP_NPU_MEMORY=1) endif() - if (WIN32) - unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO) + # If building RelWithDebInfo and OV package does not have that configuration map to Release + get_target_property(ov_rt_implib_rwdi openvino::runtime IMPORTED_IMPLIB_RELWITHDEBINFO) + if ((CMAKE_BUILD_TYPE STREQUAL RelWithDebInfo) AND NOT ov_rt_implib_rwdi) + set_target_properties(openvino::runtime PROPERTIES + MAP_IMPORTED_CONFIG_RELWITHDEBINFO Release + ) endif() list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES}) @@ -82,3 +82,8 @@ LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) endif() + +set_target_properties(onnxruntime_providers_openvino PROPERTIES + MAP_IMPORTED_CONFIG_RELEASE RelWithDebInfo + MAP_IMPORTED_CONFIG_DEBUG RelWithDebInfo + ) \ No newline at end of file diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index b0c5d2329c428..b1a79f5921328 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -626,8 +626,13 @@ typedef struct OrtMIGraphXProviderOptions { } OrtMIGraphXProviderOptions; /** \brief OpenVINO Provider Options - * - * \see OrtApi::SessionOptionsAppendExecutionProvider_OpenVINO + * \brief This Struct is frozen since ORT 1.13.0. Its maintained part of Legacy API for compatibility. + * \brief For latest OpenVINO Provider Options update to the ProviderOptions map. + * \brief Latest OpenVINO Provider Options are listed in the + * \htmlonly + * onnxruntime document. + * \endhtmlonly + * \see OrtApi::SessionOptionsAppendExecutionProvider() */ typedef struct OrtOpenVINOProviderOptions { #ifdef __cplusplus @@ -645,7 +650,7 @@ typedef struct OrtOpenVINOProviderOptions { * Valid settings are one of: "CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16" */ const char* device_type; - unsigned char enable_npu_fast_compile; + unsigned char enable_npu_fast_compile; ///< 0 = disabled, nonzero = enabled const char* device_id; size_t num_of_threads; ///< 0 = Use default number of threads const char* cache_dir; // path is set to empty by default diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 8a1844544328c..56cceb8cf2a19 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -120,8 +120,8 @@ BasicBackend::BasicBackend(std::unique_ptr& model_pr } catch (const char* msg) { ORT_THROW(msg); } - - inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, 1)); + int num_infer_req = (global_context_.num_of_threads > 0) ? global_context_.num_of_threads : 1; + inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, num_infer_req)); } bool BasicBackend::ValidateSubgraph(std::map>& const_outputs_map) { @@ -663,7 +663,6 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { // Requesting for an idle infer_request from a pool of infer_requests_ OVInferRequestPtr infer_request; infer_request = inferRequestsQueue_->getIdleRequest(); - #ifdef IO_BUFFER_ENABLED if ((global_context_.device_type.find("GPU") != std::string::npos) && (global_context_.context != nullptr) && global_context_.is_wholly_supported_graph) { diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index 6e39f5832226c..72a188108adef 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -3,6 +3,8 @@ #include #include #include +#include +#include #include "core/providers/shared_library/provider_api.h" #include "core/providers/openvino/openvino_execution_provider.h" #include "core/providers/openvino/contexts.h" @@ -187,15 +189,23 @@ common::Status OpenVINOExecutionProvider::Compile( #ifdef USE_OVEP_NPU_MEMORY std::vector OpenVINOExecutionProvider::CreatePreferredAllocators() { - AllocatorCreationInfo npu_allocator_info{ - [this](OrtDevice::DeviceId device_id) { - return std::make_unique(global_context_->ie_core.Get(), OrtDevice::NPU, device_id, OpenVINO_RT_NPU); - }, - 0, - }; - - // fill in allocator - return std::vector{CreateAllocator(npu_allocator_info)}; + if (global_context_->device_type.find("NPU") != std::string::npos) { + AllocatorCreationInfo npu_allocator_info{ + [this](OrtDevice::DeviceId device_id) { + return std::make_unique( + global_context_->ie_core.Get(), + OrtDevice::NPU, + device_id, + OpenVINO_RT_NPU); + }, + 0, + }; + + // fill in allocator + return std::vector{CreateAllocator(npu_allocator_info)}; + } else { + return std::vector{}; + } } #endif diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index 16f06ad9dd1da..bea9badea475a 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -199,8 +199,8 @@ class OpenVINOExecutionProvider : public IExecutionProvider { #endif private: std::unique_ptr global_context_; - openvino_ep::EPCtxHandler ep_ctx_handle_{}; std::shared_ptr backend_manager_; + openvino_ep::EPCtxHandler ep_ctx_handle_{}; }; } // namespace onnxruntime diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 0d7ac64d86e68..95c7466e02f2f 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -35,16 +35,14 @@ GetCapability::GetCapability(const GraphViewer& graph_viewer_param, device_type_ = "CPU"; if (enable_qdq_optimizer) npu_qdq_optimizer_enabled = true; } -#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 1 - data_ops_ = new DataOps(graph_viewer_, V_2024_1, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 2 - data_ops_ = new DataOps(graph_viewer_, V_2024_2, device_type_, npu_qdq_optimizer_enabled); -#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 3 +#if OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 3 data_ops_ = new DataOps(graph_viewer_, V_2024_3, device_type_, npu_qdq_optimizer_enabled); #elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 4 data_ops_ = new DataOps(graph_viewer_, V_2024_4, device_type_, npu_qdq_optimizer_enabled); +#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 5 + data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled); #else - data_ops_ = new DataOps(graph_viewer_, V_2024_4, device_type_, npu_qdq_optimizer_enabled); + data_ops_ = new DataOps(graph_viewer_, V_2024_5, device_type_, npu_qdq_optimizer_enabled); #endif } diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index e8f6ae0a43734..b2c5fd6f83167 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -118,6 +118,7 @@ std::vector supported_op_mode = { {"CumSum", V_2022_1, {"CPU", "GPU"}}, {"DepthToSpace", V_2020_4, {"CPU", "GPU"}}, {"DequantizeLinear", V_2021_4, {"CPU", "GPU"}}, + {"DequantizeLinear", V_2024_4, {"NPU"}}, {"Div", V_2020_4, {"CPU", "GPU"}}, {"Dropout", V_2020_4, {"CPU", "GPU"}}, {"Elu", V_2020_4, {"CPU", "GPU"}}, @@ -254,6 +255,8 @@ void DataOps::populate_types_supported() { std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32)); supported_types_initializer_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64)); + supported_types_initializer_.insert( + std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16)); supported_types_initializer_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)); supported_types_initializer_.insert( @@ -262,6 +265,10 @@ void DataOps::populate_types_supported() { std::make_pair(V_2021_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8)); supported_types_initializer_.insert( std::make_pair(V_2021_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT8)); + supported_types_initializer_.insert( + std::make_pair(V_2024_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT4)); + supported_types_initializer_.insert( + std::make_pair(V_2024_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT4)); supported_types_npu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BOOL)); @@ -285,6 +292,10 @@ void DataOps::populate_types_supported() { std::make_pair(V_2024_3, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FN)); supported_types_npu_.insert( std::make_pair(V_2024_3, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT8E4M3FNUZ)); + supported_types_npu_.insert( + std::make_pair(V_2024_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT4)); + supported_types_npu_.insert( + std::make_pair(V_2024_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT4)); supported_types_cpu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BOOL)); @@ -304,6 +315,10 @@ void DataOps::populate_types_supported() { std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64)); supported_types_cpu_.insert( std::make_pair(V_2022_2, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16)); + supported_types_cpu_.insert( + std::make_pair(V_2024_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT4)); + supported_types_cpu_.insert( + std::make_pair(V_2024_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT4)); supported_types_gpu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT)); @@ -319,6 +334,10 @@ void DataOps::populate_types_supported() { std::make_pair(V_2021_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT8)); supported_types_gpu_.insert( std::make_pair(V_2022_1, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_BOOL)); + supported_types_gpu_.insert( + std::make_pair(V_2024_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT4)); + supported_types_gpu_.insert( + std::make_pair(V_2024_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT4)); } void DataOps::populate_op_mode_supported() { @@ -368,7 +387,7 @@ void DataOps::populate_op_mode_supported() { // populate unsupportedmode_t { - UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4}, + UnsupportedOpMode obj = {{V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, [this](const Node* node, const InitializedTensorSet&) { // If the Input of ReduceMax op is UINT8, it is rejected (Due to output mismatch) for (size_t i = 0; i < node->InputDefs().size(); i++) { @@ -383,7 +402,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"ReduceMax", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, [this](const Node* node, const InitializedTensorSet&) { const auto& input_arg = node->InputDefs()[1]; auto shape = input_arg->Shape(); @@ -400,7 +419,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Reshape", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, [this](const Node* node, const InitializedTensorSet&) { // If the operator is unsqueeze // If axes is an input, then we cannot produce a static graph. @@ -415,7 +434,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Unsqueeze", obj}); } { - UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0, V_2024_1, V_2024_2, V_2024_3, V_2024_4, V_2024_5}, [this](const Node* node, const InitializedTensorSet&) { // check for attributes auto& upsample_attr = node->GetAttributes(); diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index 5cd4c8658fb77..a2db56deca7cd 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -31,7 +31,8 @@ enum versionNum { V_2024_1, V_2024_2, V_2024_3, - V_2024_4 + V_2024_4, + V_2024_5 }; using VersionNum = enum versionNum; diff --git a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml index d8c0120fc9ee5..9ee589a3d6ef3 100644 --- a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml @@ -33,5 +33,5 @@ jobs: parameters: AgentPool : 'Linux-CPU-2019' JobName: 'Linux_CI_Dev' - RunDockerBuildArgs: '-o ubuntu22.04 -p 3.10 -d openvino -v 2024.3.0 -x "--use_openvino CPU --build_wheel"' + RunDockerBuildArgs: '-o ubuntu22.04 -p 3.10 -d openvino -v 2024.4.0 -x "--use_openvino CPU --build_wheel"' TimeoutInMinutes: 120 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino index 4c80e7a907630..8f3dcb69d6c56 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino @@ -1,7 +1,7 @@ ARG UBUNTU_VERSION=22.04 FROM ubuntu:${UBUNTU_VERSION} -ARG OPENVINO_VERSION=2024.3.0 +ARG OPENVINO_VERSION=2024.4.0 ARG PYTHON_VERSION=3.10 ADD scripts /tmp/scripts @@ -19,9 +19,9 @@ ENV IE_PLUGINS_PATH=$INTEL_OPENVINO_DIR/runtime/lib/intel64 ENV DEBIAN_FRONTEND=noninteractive RUN cd /opt && mkdir -p intel && cd intel && \ - wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.3/linux/l_openvino_toolkit_ubuntu22_2024.3.0.16041.1e3b88e4e3f_x86_64.tgz && \ - tar xzf l_openvino_toolkit_ubuntu22_2024.3.0.16041.1e3b88e4e3f_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu22_2024.3.0.16041.1e3b88e4e3f_x86_64.tgz && \ - mv l_openvino_toolkit_ubuntu22_2024.3.0.16041.1e3b88e4e3f_x86_64 openvino_2024.3.0 && \ + wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.4/linux/l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64.tgz && \ + tar xzf l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64.tgz && \ + mv l_openvino_toolkit_ubuntu22_2024.4.0.16579.c3152d32c9c_x86_64 openvino_2024.4.0 && \ cd $INTEL_OPENVINO_DIR/install_dependencies && ./install_openvino_dependencies.sh -y WORKDIR /root From bbe7c8773837aa7573e202aefd2c633a06be2c23 Mon Sep 17 00:00:00 2001 From: "Po-Wei (Vincent)" Date: Fri, 15 Nov 2024 10:50:55 -0800 Subject: [PATCH 06/28] Fix 1.20 cuda minimal build failure (#22751) ### Description Fixes build failure for the cuda minimal build ### Motivation and Context [This change](https://github.com/microsoft/onnxruntime/pull/19470) in 1.20 is causing build failures for the cuda minimal build. Essentially, some cudnn logic was not guarded by the `USE_CUDA_MINIMAL`. Also the build is looking for cudnn while in the cuda minimal build it shouldn't depend on it, resulting in linking error. cc @gedoensmax @chilo-ms --- cmake/onnxruntime_unittests.cmake | 5 ++++- onnxruntime/core/providers/cuda/cudnn_fe_call.cc | 4 ++-- onnxruntime/core/providers/cuda/shared_inc/cudnn_fe_call.h | 4 +++- 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 561f65a33b89c..3b73933f3ff1d 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -67,7 +67,10 @@ function(AddTest) if(onnxruntime_USE_CUDA) #XXX: we should not need to do this. onnxruntime_test_all.exe should not have direct dependency on CUDA DLLs, # otherwise it will impact when CUDA DLLs can be unloaded. - target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart cudnn_frontend) + target_link_libraries(${_UT_TARGET} PRIVATE CUDA::cudart) + if(NOT onnxruntime_CUDA_MINIMAL) + target_link_libraries(${_UT_TARGET} PRIVATE cudnn_frontend) + endif() endif() target_link_libraries(${_UT_TARGET} PRIVATE ${_UT_LIBS} GTest::gtest GTest::gmock ${onnxruntime_EXTERNAL_LIBRARIES}) endif() diff --git a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc index 640025c248187..7cd320a26d973 100644 --- a/onnxruntime/core/providers/cuda/cudnn_fe_call.cc +++ b/onnxruntime/core/providers/cuda/cudnn_fe_call.cc @@ -4,7 +4,7 @@ #include "core/providers/cuda/shared_inc/cudnn_fe_call.h" #include "core/providers/shared_library/provider_api.h" #include -#if !defined(__CUDACC__) +#if !defined(__CUDACC__) && !defined(USE_CUDA_MINIMAL) #include #endif #ifdef _WIN32 @@ -22,7 +22,7 @@ const char* CudaErrString(ERRTYPE) { ORT_NOT_IMPLEMENTED(); } -#if !defined(__CUDACC__) +#if !defined(__CUDACC__) && !defined(USE_CUDA_MINIMAL) #define CASE_ENUM_TO_STR_CUDNN_FE(x) \ case cudnn_frontend::error_code_t::x: \ return #x diff --git a/onnxruntime/core/providers/cuda/shared_inc/cudnn_fe_call.h b/onnxruntime/core/providers/cuda/shared_inc/cudnn_fe_call.h index a51d84a7efa59..2ce7bc0bf51fd 100644 --- a/onnxruntime/core/providers/cuda/shared_inc/cudnn_fe_call.h +++ b/onnxruntime/core/providers/cuda/shared_inc/cudnn_fe_call.h @@ -5,7 +5,7 @@ #include "core/common/common.h" #include "core/providers/cuda/cuda_pch.h" #include "core/providers/cuda/shared_inc/cuda_call.h" -#if !defined(__CUDACC__) +#if !defined(__CUDACC__) && !defined(USE_CUDA_MINIMAL) #include #endif namespace onnxruntime { @@ -14,10 +14,12 @@ namespace onnxruntime { // Error handling // ----------------------------------------------------------------------- +#ifndef USE_CUDA_MINIMAL #define CUDNN_FE_CALL(expr) (CudaCall((cudnn_frontend::error_t)(expr), #expr, "CUDNN_FE", \ cudnn_frontend::error_code_t::OK, "", __FILE__, __LINE__)) #define CUDNN_FE_CALL_THROW(expr) (CudaCall((cudnn_frontend::error_t)(expr), #expr, "CUDNN_FE", \ cudnn_frontend::error_code_t::OK, "", __FILE__, __LINE__)) +#endif } // namespace onnxruntime From c73a3d1804ac494c612100f1d7b912b1e585da00 Mon Sep 17 00:00:00 2001 From: Jing Fang <126209182+fajin-corp@users.noreply.github.com> Date: Fri, 15 Nov 2024 22:59:11 +0000 Subject: [PATCH 07/28] [ARM] MatMulNBits fp16 support - connect kernels (#22856) ### Description A breakdown PR of https://github.com/microsoft/onnxruntime/pull/22651 ### Motivation and Context --- cmake/onnxruntime_mlas.cmake | 8 +- .../mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp | 2 +- onnxruntime/core/mlas/lib/platform.cpp | 4 +- onnxruntime/core/mlas/lib/qnbitgemm.cpp | 94 +++- onnxruntime/core/mlas/lib/qnbitgemm.h | 81 ++- ...nel_neon.cpp => qnbitgemm_kernel_neon.cpp} | 17 +- ..._kernel_neon.h => qnbitgemm_kernel_neon.h} | 45 +- .../core/mlas/lib/sqnbitgemm_kernel_avx2.cpp | 4 +- .../mlas/lib/sqnbitgemm_kernel_avx512.cpp | 2 +- .../mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp | 2 +- .../mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp | 4 +- .../mlas/lib/sqnbitgemm_kernel_neon_int8.cpp | 2 +- .../test/contrib_ops/matmul_4bits_test.cc | 74 +-- ...nch_sqnbitgemm.cpp => bench_qnbitgemm.cpp} | 91 ++-- onnxruntime/test/mlas/bench/bench_util.h | 25 +- .../mlas/unittest/test_hqnbitgemm_neon.cpp | 504 ++++++++++++++++++ onnxruntime/test/mlas/unittest/test_util.h | 12 +- 17 files changed, 850 insertions(+), 121 deletions(-) rename onnxruntime/core/mlas/lib/{sqnbitgemm_kernel_neon.cpp => qnbitgemm_kernel_neon.cpp} (87%) rename onnxruntime/core/mlas/lib/{sqnbitgemm_kernel_neon.h => qnbitgemm_kernel_neon.h} (74%) rename onnxruntime/test/mlas/bench/{bench_sqnbitgemm.cpp => bench_qnbitgemm.cpp} (65%) create mode 100644 onnxruntime/test/mlas/unittest/test_hqnbitgemm_neon.cpp diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index a85ea942c42a3..22971f3313a60 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -84,8 +84,8 @@ function(setup_mlas_source_for_windows) ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp - ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h - ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp + ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.h + ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp ${MLAS_SRC_DIR}/fp16_neon_common.cpp @@ -363,8 +363,8 @@ else() ${MLAS_SRC_DIR}/qgemm_kernel_neon.cpp ${MLAS_SRC_DIR}/qgemm_kernel_udot.cpp ${MLAS_SRC_DIR}/qgemm_kernel_sdot.cpp - ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.h - ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon.cpp + ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.h + ${MLAS_SRC_DIR}/qnbitgemm_kernel_neon.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_fp32.cpp ${MLAS_SRC_DIR}/sqnbitgemm_kernel_neon_int8.cpp ) diff --git a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp index f1bc013a469d9..69e37d2b916d1 100644 --- a/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp +++ b/onnxruntime/core/mlas/lib/hqnbitgemm_kernel_neon_fp16.cpp @@ -24,7 +24,7 @@ Module Name: #include "fp16_common.h" #include "qnbitgemm.h" -#include "sqnbitgemm_kernel_neon.h" +#include "qnbitgemm_kernel_neon.h" namespace sqnbitgemm_neon { diff --git a/onnxruntime/core/mlas/lib/platform.cpp b/onnxruntime/core/mlas/lib/platform.cpp index 12f7dd3e74dbc..81bef3b9f194c 100644 --- a/onnxruntime/core/mlas/lib/platform.cpp +++ b/onnxruntime/core/mlas/lib/platform.cpp @@ -531,6 +531,7 @@ Return Value: this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchNeon; this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchNeon; this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchNeon; + this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; // // Check if the processor supports ASIMD dot product instructions. @@ -560,9 +561,6 @@ Return Value: this->SymmQgemmDispatch = &MlasSymmQgemmS8DispatchSdot; this->ConvSymU8S8Dispatch = &MlasConvSymU8DispatchDot; this->ConvSymS8S8Dispatch = &MlasConvSymS8DispatchDot; - - // MlasSQNBitGemmDispatchNeon has a dependency on dot product instructions - this->QNBitGemmDispatch = &MlasSQNBitGemmDispatchNeon; } #if defined(__linux__) diff --git a/onnxruntime/core/mlas/lib/qnbitgemm.cpp b/onnxruntime/core/mlas/lib/qnbitgemm.cpp index 635a3b47a23fa..f064a8e1d6a78 100644 --- a/onnxruntime/core/mlas/lib/qnbitgemm.cpp +++ b/onnxruntime/core/mlas/lib/qnbitgemm.cpp @@ -82,7 +82,12 @@ MlasIsQNBitGemmAvailable( switch (Variant) { case SQNBitGemmVariant_BitWidth4_CompFp32: { return Dispatch->SQ4BitGemmM1Kernel_CompFp32 != nullptr && - Dispatch->Q4BitBlkDequantBForSgemm_CompFp32 != nullptr; + Dispatch->SQ4BitBlkDequantBForSgemm_CompFp32 != nullptr; + } + case HQNBitGemmVariant_BitWidth4_CompFp16: { + return Dispatch->HQ4BitGemmPackQuantBData != nullptr && + Dispatch->HQ4BitGemmKernel_CompFp16 != nullptr && + Dispatch->HQ4BitBlkDequantBForHgemm_CompFp16 != nullptr; } case SQNBitGemmVariant_BitWidth4_CompInt8: { // SQ4BitGemmKernel_BlkSum_CompInt8 return @@ -253,6 +258,16 @@ MlasQNBitGemmPackQuantBData( packed_quant_b, ThreadPool ); + } else if (ComputeType == HQNBIT_CompFp16 && Dispatch->HQ4BitGemmPackQuantBData != nullptr) { + Dispatch->HQ4BitGemmPackQuantBData( + N, + K, + BlkLen, + ComputeType, + static_cast(QuantBData), + static_cast(PackedQuantBDataAndOrBlkSumWorkspace), + ThreadPool + ); } else if (Dispatch->SQ4BitGemmPackQuantBData != nullptr) { // TODO: these assertions are true if called from matmul_nbits kernel but not from mlas tests. //assert(QuantBScale == nullptr); @@ -387,7 +402,7 @@ SQ4BitGemm_CompFp32( float* c_blk = C + n; const float* bias = (Bias == nullptr) ? nullptr : Bias + n; - GetMlasPlatform().QNBitGemmDispatch->Q4BitBlkDequantBForSgemm_CompFp32( + GetMlasPlatform().QNBitGemmDispatch->SQ4BitBlkDequantBForSgemm_CompFp32( BlkLen, dequant_b, b_col, b_col_scale, b_col_zp, CountN, K, k_blks ); @@ -419,6 +434,79 @@ SQ4BitGemm_CompFp32( } } +void +HQ4BitGemm_CompFp16( + const size_t BlkLen, + const size_t K, + const MLAS_QNBIT_GEMM_DATA_PARAMS* const DataParams, + void* const PerGemmWorkspace, + const size_t RangeStartM, + const size_t RangeCountM, + const size_t RangeStartN, + const size_t RangeCountN +) +{ + constexpr size_t BlkBitWidth = 4; + MLAS_UNREFERENCED_PARAMETER(PerGemmWorkspace); + + const size_t lda = DataParams->lda; + const size_t ldc = DataParams->ldc; + const size_t k_blk_num = MlasDivRoundup(K, BlkLen); + const size_t qldb = k_blk_num * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); + const size_t ldb = k_blk_num * BlkLen; + const size_t k_zp_bytes = MlasQNBitZeroPointsForBlksSizeInBytes(k_blk_num); + + const MLAS_FP16* A = DataParams->A + RangeStartM * lda; + MLAS_FP16* C = DataParams->C + RangeStartM * ldc + RangeStartN; + const std::byte* QuantBData = static_cast(DataParams->PackedQuantBData) + RangeStartN * qldb; + const MLAS_FP16* QuantBScale = DataParams->QuantBScale + RangeStartN * k_blk_num; + const std::byte* QuantBZeroPoint = + (DataParams->QuantBZeroPoint == nullptr) + ? nullptr + : static_cast(DataParams->QuantBZeroPoint) + RangeStartN * k_zp_bytes; + const MLAS_FP16* Bias = (DataParams->Bias == nullptr) ? nullptr : DataParams->Bias; + + // 32N is the sweet spot of cache utilization. It is machine dependent though. + constexpr size_t StrideM = 2; + constexpr size_t StrideN = 32; + + // TODO(fajin): move allocation up to the op. + size_t bufsize = ldb * StrideN * sizeof(MLAS_FP16); + MlasThreadedBufAlloc(bufsize); + auto* dequant_b = reinterpret_cast(ThreadedBufHolder.get()); + + for (size_t n = 0, countN; n < RangeCountN; n += countN) { + countN = std::min(StrideN, RangeCountN - n); + GetMlasPlatform().QNBitGemmDispatch->HQ4BitBlkDequantBForHgemm_CompFp16( + BlkLen, dequant_b, QuantBData, QuantBScale, QuantBZeroPoint, countN, K, k_blk_num + ); + + const MLAS_FP16* a = A; + MLAS_FP16* c = C; + for (size_t m = 0, countM; m < RangeCountM; m += countM) { + countM = std::min(StrideM, RangeCountM - m); + GetMlasPlatform().QNBitGemmDispatch->HQ4BitGemmKernel_CompFp16( + a, dequant_b, Bias, c, countM, countN, K, lda, ldb, ldc + ); + + if (DataParams->PostProcessor != nullptr) { + DataParams->PostProcessor->Process( + DataParams->C, RangeStartM + m, RangeStartN + n, countM, countN, ldc + ); + } + + a += countM * lda; + c += countM * ldc; + } + + QuantBData += countN * qldb; + QuantBScale += countN * k_blk_num; + QuantBZeroPoint = QuantBZeroPoint ? QuantBZeroPoint + countN * k_zp_bytes : nullptr; + Bias = Bias ? Bias + countN : nullptr; + C += countN; + } +} + void SQ4BitGemm_CompInt8( const size_t BlkLen, @@ -720,7 +808,7 @@ GetQNBitGemm(QNBitGemmVariant variant) { switch (variant) { case HQNBitGemmVariant_BitWidth4_CompFp16: - return nullptr; + return HQ4BitGemm_CompFp16; default: return nullptr; } diff --git a/onnxruntime/core/mlas/lib/qnbitgemm.h b/onnxruntime/core/mlas/lib/qnbitgemm.h index 28e17f14b02c9..eb3d0b44ae3de 100644 --- a/onnxruntime/core/mlas/lib/qnbitgemm.h +++ b/onnxruntime/core/mlas/lib/qnbitgemm.h @@ -91,17 +91,17 @@ struct MLAS_QNBIT_GEMM_DISPATCH { // /** Gets size of packed quantized B data containing 4-bit integers. See MlasQNBitGemmPackQuantBDataSize(). */ - typedef size_t(SQ4BitGemmPackQuantBDataSize_Fn)( + typedef size_t(Q4BitGemmPackQuantBDataSize_Fn)( size_t N, size_t K, size_t BlkLen, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); - SQ4BitGemmPackQuantBDataSize_Fn* Q4BitGemmPackQuantBDataSize = nullptr; + Q4BitGemmPackQuantBDataSize_Fn* Q4BitGemmPackQuantBDataSize = nullptr; /** Packs quantized B data containing 4-bit integers. See MlasQNBitGemmPackQuantBData(). */ - typedef void(SQ4BitGemmPackQuantBData_Fn)( + typedef void(Q4BitGemmPackQuantBData_Fn)( size_t N, size_t K, size_t BlkLen, @@ -111,7 +111,8 @@ struct MLAS_QNBIT_GEMM_DISPATCH { MLAS_THREADPOOL* ThreadPool ); - SQ4BitGemmPackQuantBData_Fn* SQ4BitGemmPackQuantBData = nullptr; + Q4BitGemmPackQuantBData_Fn* SQ4BitGemmPackQuantBData = nullptr; + Q4BitGemmPackQuantBData_Fn* HQ4BitGemmPackQuantBData = nullptr; typedef void(SQ4BitGemmPackQuantBDataAndSumBlk_Fn)( size_t N, @@ -142,7 +143,7 @@ struct MLAS_QNBIT_GEMM_DISPATCH { * @param[in] BlkLen number of quantized values per block * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ - typedef size_t(SQ4BitGemmPerGemmWorkspaceSize_Fn)( + typedef size_t(Q4BitGemmPerGemmWorkspaceSize_Fn)( size_t M, size_t N, size_t K, @@ -150,7 +151,7 @@ struct MLAS_QNBIT_GEMM_DISPATCH { MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); - SQ4BitGemmPerGemmWorkspaceSize_Fn* Q4BitGemmPerGemmWorkspaceSize = nullptr; + Q4BitGemmPerGemmWorkspaceSize_Fn* Q4BitGemmPerGemmWorkspaceSize = nullptr; /** * @brief Gets the required byte alignment of the per-GEMM intermediate workspace. @@ -158,12 +159,12 @@ struct MLAS_QNBIT_GEMM_DISPATCH { * @param[in] BlkLen number of quantized values per block * @param[in] ComputeType GEMM compute type (e.g., multiplying float or int8 values) */ - typedef size_t(SQ4BitGemmPerGemmWorkspaceAlignment_Fn)( + typedef size_t(Q4BitGemmPerGemmWorkspaceAlignment_Fn)( size_t BlkLen, MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType ); - SQ4BitGemmPerGemmWorkspaceAlignment_Fn* Q4BitGemmPerGemmWorkspaceAlignment = nullptr; + Q4BitGemmPerGemmWorkspaceAlignment_Fn* Q4BitGemmPerGemmWorkspaceAlignment = nullptr; // // SQNBIT_CompFp32 kernel function prototypes. @@ -229,7 +230,38 @@ struct MLAS_QNBIT_GEMM_DISPATCH { size_t BlockStrideQuantB ); - Q4BitBlkDequantBForSgemm_CompFp32_Fn* Q4BitBlkDequantBForSgemm_CompFp32 = nullptr; + Q4BitBlkDequantBForSgemm_CompFp32_Fn* SQ4BitBlkDequantBForSgemm_CompFp32 = nullptr; + + /** + * @brief Dequantize B into the format expected by the Sgemm kernel. + * B is a quantized 4-bit integer matrix that is block quantized and column major. + * This is equivalent to dequantizing B and then running MlasSgemmCopyPackB. + * + * @param BlkLen Number of values in a block. + * @param[out] FpData Supplies the output buffer for the dequantized B float data. + * It should have enough space for + * (CountN + 16 - 1) / 16 * 16 * (CountK + BlkLen - 1) / BlkLen * BlkLen + * elements. Only the first (CountN + 16 - 1) / 16 * 16 * CountK elements are + * useful, but the kernel implementation can be simplified with the extra space. + * @param QuantBData Supplies the quantized B matrix block data. + * @param QuantBScale Supplies the quantized B matrix block scale values. + * @param QuantBZeroPoint Supplies the quantized B matrix block zero point values. Optional. + * @param CountN Number of columns of B. + * @param CountK Number of rows of B. + * @param BlockStrideQuantB Number of blocks between adjacent columns of the quantized B matrix. + */ + typedef void(Q4BitBlkDequantBForSgemm_CompFp16_Fn)( + size_t BlkLen, + MLAS_FP16* FpData, + const std::byte* QuantBData, + const MLAS_FP16* QuantBScale, + const std::byte* QuantBZeroPoint, + size_t CountN, + size_t CountK, + size_t BlockStrideQuantB + ); + + Q4BitBlkDequantBForSgemm_CompFp16_Fn* HQ4BitBlkDequantBForHgemm_CompFp16 = nullptr; // // SQNBIT_CompInt8 kernel function prototypes. @@ -338,4 +370,35 @@ struct MLAS_QNBIT_GEMM_DISPATCH { float* AScaledGroupSum // scale_k * Sum_blklen(a_i) ); QuantizeARowComputeBlkSum_CompInt8_Fn* QuantizeARowComputeBlkSum_CompInt8 = nullptr; + + /** + * @brief Multiply fp16 matrix A rows with fp16 matrix B columns. + * Results are written to fp16 matrix C. + * If bias is provided, the bias are added to the result. + * + * @param A first row of the A matrix segment. Row major. + * @param B first column of the B matrix segment. Column major. + * @param Bias the bias at the target column. Optional. + * @param[out] C first element of the output matrix segment. Row major. + * @param CountM the number of rows of A chunk. + * @param CountN the number of columns of B chunk. + * @param K the number of columns of A matrix and rows of B matrix. + * @param lda the leading dimension of A. + * @param ldb the leading dimension of B. + * @param ldc the leading dimension of C. + */ + typedef void(HQ4BitGemmKernel_CompFp16_Fn)( + const MLAS_FP16* A, + const MLAS_FP16* B, + const MLAS_FP16* Bias, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t K, + size_t lda, + size_t ldb, + size_t ldc + ); + + HQ4BitGemmKernel_CompFp16_Fn* HQ4BitGemmKernel_CompFp16 = nullptr; }; diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp similarity index 87% rename from onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp rename to onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp index 03c8ce264c846..d05de64e68ec8 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.cpp @@ -6,7 +6,7 @@ Licensed under the MIT License. Module Name: - sqnbitgemm_kernel_neon.cpp + qnbitgemm_kernel_neon.cpp Abstract: @@ -20,7 +20,7 @@ Module Name: #include #include "qnbitgemm.h" -#include "sqnbitgemm_kernel_neon.h" +#include "qnbitgemm_kernel_neon.h" #include "sqnbitgemm_q8_block.h" namespace sqnbitgemm_neon @@ -185,10 +185,17 @@ const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchNeon = []() { d.Q4BitGemmPerGemmWorkspaceAlignment = sqnbitgemm_neon::Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = sqnbitgemm_neon::SQ4BitGemmM1Kernel_CompFp32; - d.Q4BitBlkDequantBForSgemm_CompFp32 = sqnbitgemm_neon::Q4BitBlkDequantBForSgemm_CompFp32; - - d.SQ4BitGemmKernel_CompInt8 = sqnbitgemm_neon::SQ4BitGemmKernel_CompInt8; + d.SQ4BitBlkDequantBForSgemm_CompFp32 = sqnbitgemm_neon::SQ4BitBlkDequantBForSgemm_CompFp32; + if (MLAS_CPUIDINFO::GetCPUIDInfo().HasArmNeonDot()) { + d.SQ4BitGemmKernel_CompInt8 = sqnbitgemm_neon::SQ4BitGemmKernel_CompInt8; + } d.QuantizeARow_CompInt8 = sqnbitgemm_neon::QuantizeARow_CompInt8; +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + d.HQ4BitGemmPackQuantBData = sqnbitgemm_neon::HQ4BitGemmPackQuantBData_CompFp16; + d.HQ4BitBlkDequantBForHgemm_CompFp16 = sqnbitgemm_neon::HQ4BitBlkDequantBForHgemm_CompFp16; + d.HQ4BitGemmKernel_CompFp16 = sqnbitgemm_neon::HQ4BitGemmKernel_CompFp16; +#endif // MLAS_F16VEC_INTRINSICS_SUPPORTED && MLAS_TARGET_ARM64 + return d; }(); diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.h b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.h similarity index 74% rename from onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.h rename to onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.h index 247d885615393..ccadd24ac1991 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.h +++ b/onnxruntime/core/mlas/lib/qnbitgemm_kernel_neon.h @@ -6,7 +6,7 @@ Licensed under the MIT License. Module Name: - sqnbitgemm_kernel_neon.h + qnbitgemm_kernel_neon.h Abstract: @@ -53,7 +53,7 @@ SQ4BitGemmM1Kernel_CompFp32( ); void -Q4BitBlkDequantBForSgemm_CompFp32( +SQ4BitBlkDequantBForSgemm_CompFp32( size_t BlkLen, float* FpData, const std::byte* QuantBData, @@ -64,6 +64,47 @@ Q4BitBlkDequantBForSgemm_CompFp32( size_t BlockCountK ); +// HQNBIT_CompFp16 declarations +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) +void +HQ4BitGemmPackQuantBData_CompFp16( + size_t N, + size_t K, + size_t BlkLen, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, + const std::byte* QuantBDataBegin, + std::byte* PackedQuantBDataBegin, + MLAS_THREADPOOL* ThreadPool +); + +void +HQ4BitBlkDequantBForHgemm_CompFp16( + size_t BlkLen, + MLAS_FP16* FpData, + const std::byte* QuantBData, + const MLAS_FP16* QuantBScale, + const std::byte* QuantBZeroPoint, + size_t CountN, + size_t K, + size_t BlockCountK +); + +void +HQ4BitGemmKernel_CompFp16( + const MLAS_FP16* A, + const MLAS_FP16* B, + const MLAS_FP16* Bias, + MLAS_FP16* C, + size_t CountM, + size_t CountN, + size_t K, + size_t lda, + size_t ldb, + size_t ldc +); + +#endif // !(defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64)) + // SQNBIT_CompInt8 declarations void diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp index 01443e2ff077f..81615da46aa2e 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx2.cpp @@ -1341,7 +1341,7 @@ const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2 = []() { d.Q4BitGemmPerGemmWorkspaceAlignment = Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = SQ4BitGemmM1Kernel_CompFp32_avx2; - d.Q4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; + d.SQ4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; d.SQ4BitGemmKernel_BlkSum_CompInt8 = SQ4BitGemmKernel_BlkSum_CompInt8_avx2; d.QuantizeARowComputeBlkSum_CompInt8 = QuantizeARow_CompInt8_avx2; @@ -1360,7 +1360,7 @@ const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx2vnni = []() { d.Q4BitGemmPerGemmWorkspaceAlignment = Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = SQ4BitGemmM1Kernel_CompFp32_avx2; - d.Q4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; + d.SQ4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; d.SQ4BitGemmKernel_BlkSum_CompInt8 = SQ4BitGemmKernel_BlkSum_CompInt8_avx2vnni; d.QuantizeARowComputeBlkSum_CompInt8 = QuantizeARow_CompInt8_avx2; diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp index 425dfbe87c982..b4e25d4e4040a 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512.cpp @@ -363,7 +363,7 @@ const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512 = []() { d.Q4BitGemmPerGemmWorkspaceAlignment = Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = SQ4BitGemmM1Kernel_CompFp32_avx512; - d.Q4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; + d.SQ4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; d.SQ4BitGemmKernel_BlkSum_CompInt8 = SQ4BitGemmKernel_BlkSum_CompInt8_avx512; d.QuantizeARowComputeBlkSum_CompInt8 = QuantizeARow_CompInt8_avx512; diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp index 777d4609ef5d4..a4468bb906bbc 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_avx512vnni.cpp @@ -348,7 +348,7 @@ const MLAS_QNBIT_GEMM_DISPATCH MlasSQNBitGemmDispatchAvx512vnni = []() { d.Q4BitGemmPerGemmWorkspaceAlignment = Q4BitGemmPerGemmWorkspaceAlignment; d.SQ4BitGemmM1Kernel_CompFp32 = SQ4BitGemmM1Kernel_CompFp32; - d.Q4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; + d.SQ4BitBlkDequantBForSgemm_CompFp32 = Q4BitBlkDequantBForSgemm_CompFp32_avx2; d.SQ4BitGemmKernel_BlkSum_CompInt8 = SQ4BitGemmKernel_BlkSum_CompInt8_avx512vnni; d.QuantizeARowComputeBlkSum_CompInt8 = QuantizeARow_CompInt8_avx512; diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp index 7b9f05a9c385d..31a499b8243af 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_fp32.cpp @@ -22,7 +22,7 @@ Module Name: #include #include "qnbitgemm.h" -#include "sqnbitgemm_kernel_neon.h" +#include "qnbitgemm_kernel_neon.h" namespace sqnbitgemm_neon { @@ -608,7 +608,7 @@ Q4BitBlkDequantBForSgemm_CompFp32_Impl( } // namespace void -Q4BitBlkDequantBForSgemm_CompFp32( +SQ4BitBlkDequantBForSgemm_CompFp32( size_t BlkLen, float* FpData, const std::byte* QuantBData, diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp index f1acd99c7b693..73beb06a3cfad 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon_int8.cpp @@ -22,7 +22,7 @@ Module Name: #include #include "qnbitgemm.h" -#include "sqnbitgemm_kernel_neon.h" +#include "qnbitgemm_kernel_neon.h" #include "sqnbitgemm_q8_block.h" namespace sqnbitgemm_neon diff --git a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc index 7d99ffab9a88f..87a9ef762b9ac 100644 --- a/onnxruntime/test/contrib_ops/matmul_4bits_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_4bits_test.cc @@ -82,6 +82,7 @@ struct TestOptions { bool has_bias{false}; std::optional output_abs_error{}; + std::optional output_rel_error{}; }; std::ostream& operator<<(std::ostream& os, const TestOptions& opts) { @@ -253,6 +254,10 @@ void RunTest(const TestOptions& opts, test.SetOutputAbsErr("Y", *opts.output_abs_error); } + if (opts.output_rel_error.has_value()) { + test.SetOutputRelErr("Y", *opts.output_rel_error); + } + if (!explicit_eps.empty()) { test.ConfigEps(std::move(explicit_eps)); } @@ -269,16 +274,11 @@ void TestMatMulNBitsTyped() { base_opts.block_size = block_size; base_opts.accuracy_level = accuracy_level; - if (base_opts.accuracy_level == 4) { + if constexpr (std::is_same::value) { + base_opts.output_abs_error = 0.055f; + base_opts.output_rel_error = 0.02f; + } else if (base_opts.accuracy_level == 4) { base_opts.output_abs_error = 0.1f; - } else { - if constexpr (std::is_same::value) { -#ifdef USE_WEBGPU - base_opts.output_abs_error = 0.03f; -#else - base_opts.output_abs_error = 0.01f; -#endif - } } { @@ -391,48 +391,48 @@ TEST(MatMulNBits, Float32_Accuracy4) { TestMatMulNBitsTyped(); } -#ifdef MLAS_TARGET_AMD64_IX86 +#if defined(MLAS_TARGET_AMD64_IX86) || defined(MLAS_TARGET_ARM64) #if !defined(USE_DML) // Actual and expected difference is over 0.01 with DmlExecutionProvider. // Skip the tests instead of raising the tolerance to make is pass. +TEST(MatMulNBits, Float16_Accuracy2) { + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); + TestMatMulNBitsTyped(); +} + TEST(MatMulNBits, Float16_Accuracy0) { TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); } -TEST(MatMulNBits, Float16_Accuracy1) { - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); - TestMatMulNBitsTyped(); -} - TEST(MatMulNBits, Float16_Accuracy4) { TestMatMulNBitsTyped(); TestMatMulNBitsTyped(); diff --git a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp b/onnxruntime/test/mlas/bench/bench_qnbitgemm.cpp similarity index 65% rename from onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp rename to onnxruntime/test/mlas/bench/bench_qnbitgemm.cpp index df543d8eca1fc..64d229889214b 100644 --- a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp +++ b/onnxruntime/test/mlas/bench/bench_qnbitgemm.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include "benchmark/benchmark.h" @@ -16,16 +17,16 @@ #include "core/util/thread_utils.h" #include "core/platform/env_var_utils.h" -template -void RunSQNBitGemmBenchmark(size_t BlkLen, - size_t M, size_t N, size_t K, - size_t Threads, - bool Symmetric, - bool HasBias, - MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, - benchmark::State& state) { +template +void RunQNBitGemmBenchmark(size_t BlkLen, + size_t M, size_t N, size_t K, + size_t Threads, + bool Symmetric, + bool HasBias, + MLAS_QNBIT_GEMM_COMPUTE_TYPE ComputeType, + benchmark::State& state) { if (!MlasIsQNBitGemmAvailable(BlkBitWidth, BlkLen, ComputeType)) { - state.SkipWithMessage("SQNBitGemm is not available with the given configuration on the current machine."); + state.SkipWithMessage("QNBitGemm is not available with the given configuration on the current machine."); return; } @@ -43,19 +44,19 @@ void RunSQNBitGemmBenchmark(size_t BlkLen, onnxruntime::concurrency::CreateThreadPool(&onnxruntime::Env::Default(), tpo, onnxruntime::concurrency::ThreadPoolType::INTRA_OP)); - const auto A = RandomVectorUniform(M * K, -1.0f, 1.0f); - const auto B = RandomVectorUniform(K * N, -1.0f, 1.0f); + const auto A = RandomVectorUniform(M * K, AType(-1.0f), AType(1.0f)); + const auto B = RandomVectorUniform(K * N, AType(-1.0f), AType(1.0f)); - const auto Bias = HasBias ? RandomVectorUniform(N, -1.0f, 1.0f) : std::vector(); + const auto Bias = HasBias ? RandomVectorUniform(N, AType(-1.0f), AType(1.0f)) : std::vector(); - std::vector C(static_cast(M * N)); + std::vector C(static_cast(M * N)); std::vector QuantBData(QuantBDataSizeInBytes); - std::vector QuantBScale(QuantBScaleSize); + std::vector QuantBScale(QuantBScaleSize); std::vector QuantBZeroPoint(Symmetric ? 0 : QuantBZeroPointSizeInBytes); bool has_zp_input = !Symmetric; - MlasQuantizeBlockwise(QuantBData.data(), QuantBScale.data(), + MlasQuantizeBlockwise(QuantBData.data(), QuantBScale.data(), Symmetric ? nullptr : QuantBZeroPoint.data(), B.data(), static_cast(BlkLen), /* columnwise */ true, static_cast(K), static_cast(N), static_cast(N), @@ -76,7 +77,7 @@ void RunSQNBitGemmBenchmark(size_t BlkLen, tp.get()); } - MLAS_QNBIT_GEMM_DATA_PARAMS params{}; + MLAS_QNBIT_GEMM_DATA_PARAMS params{}; params.A = A.data(); params.lda = K; if (PackedQuantBData != nullptr) @@ -99,8 +100,8 @@ void RunSQNBitGemmBenchmark(size_t BlkLen, } } -template -void SQNBITGEMM(benchmark::State& state) { +template +void QNBITGEMM(benchmark::State& state) { using onnxruntime::narrow; const auto BlkLen = narrow(state.range(0)); @@ -112,44 +113,48 @@ void SQNBITGEMM(benchmark::State& state) { const bool HasBias = narrow(state.range(6)); const auto ComputeType = static_cast(state.range(7)); - RunSQNBitGemmBenchmark(BlkLen, M, N, K, Threads, Symmetric, HasBias, ComputeType, state); + RunQNBitGemmBenchmark(BlkLen, M, N, K, Threads, Symmetric, HasBias, ComputeType, state); } -static void SQNBitGemmArgs(benchmark::internal::Benchmark* b) { +template +static void QNBitGemmArgs(benchmark::internal::Benchmark* b) { b->ArgNames({"BlkLen", "M", "N", "K", "Threads", "Symmetric", "HasBias", "ComputeType"}); b->ArgsProduct({ - {128}, // BlkLen - {1}, // M - {4096, 11008}, // N - {4096, 11008}, // K - {1, 8}, // Threads - {int64_t{false}, int64_t{true}}, // Symmetric - {int64_t{false}, int64_t{true}}, // HasBias - {int64_t{SQNBIT_CompFp32}, int64_t{SQNBIT_CompInt8}}, // ComputeType + {128}, // BlkLen + {1, 4096}, // M + {4096, 11008}, // N + {4096, 11008}, // K + {1, 8}, // Threads + {int64_t{false}, int64_t{true}}, // Symmetric + {int64_t{false}, int64_t{true}}, // HasBias + std::is_same_v + ? std::vector{int64_t{HQNBIT_CompFp16}} + : std::vector{int64_t{SQNBIT_CompFp32}, int64_t{SQNBIT_CompInt8}}, // ComputeType }); } -BENCHMARK(SQNBITGEMM<4>)->Apply(SQNBitGemmArgs)->UseRealTime(); +BENCHMARK(QNBITGEMM)->Apply(QNBitGemmArgs)->UseRealTime(); +BENCHMARK(QNBITGEMM)->Apply(QNBitGemmArgs)->UseRealTime(); // This test gets benchmark arguments from environment variables. -template -void SQNBITGEMM_ENV(benchmark::State& state) { +template +void QNBITGEMM_ENV(benchmark::State& state) { using onnxruntime::ParseEnvironmentVariableWithDefault; - const auto BlkLen = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_BLKLEN", 32); - const auto M = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_M", 1); - const auto N = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_N", 4096); - const auto K = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_K", 4096); - const auto Threads = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_THREADS", 1); - const auto Symmetric = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_SYMMETRIC", true); - const auto HasBias = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_HAS_BIAS", false); - const auto ComputeType = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_COMPUTE_TYPE", + const auto BlkLen = ParseEnvironmentVariableWithDefault("ORT_QNBITGEMM_BLKLEN", 32); + const auto M = ParseEnvironmentVariableWithDefault("ORT_QNBITGEMM_M", 1); + const auto N = ParseEnvironmentVariableWithDefault("ORT_QNBITGEMM_N", 4096); + const auto K = ParseEnvironmentVariableWithDefault("ORT_QNBITGEMM_K", 4096); + const auto Threads = ParseEnvironmentVariableWithDefault("ORT_QNBITGEMM_THREADS", 1); + const auto Symmetric = ParseEnvironmentVariableWithDefault("ORT_QNBITGEMM_SYMMETRIC", true); + const auto HasBias = ParseEnvironmentVariableWithDefault("ORT_QNBITGEMM_HAS_BIAS", false); + const auto ComputeType = ParseEnvironmentVariableWithDefault("ORT_QNBITGEMM_COMPUTE_TYPE", static_cast(SQNBIT_CompFp32)); - RunSQNBitGemmBenchmark(BlkLen, M, N, K, Threads, Symmetric, HasBias, - static_cast(ComputeType), - state); + RunQNBitGemmBenchmark(BlkLen, M, N, K, Threads, Symmetric, HasBias, + static_cast(ComputeType), + state); std::ostringstream s; s << "BlkBitWidth:" << BlkBitWidth << "/BlkLen:" << BlkLen @@ -159,4 +164,4 @@ void SQNBITGEMM_ENV(benchmark::State& state) { state.SetLabel(s.str()); } -BENCHMARK(SQNBITGEMM_ENV<4>)->UseRealTime(); +BENCHMARK(QNBITGEMM_ENV)->UseRealTime(); diff --git a/onnxruntime/test/mlas/bench/bench_util.h b/onnxruntime/test/mlas/bench/bench_util.h index f96dd5c673b3d..78789ef1cbc1a 100644 --- a/onnxruntime/test/mlas/bench/bench_util.h +++ b/onnxruntime/test/mlas/bench/bench_util.h @@ -8,8 +8,12 @@ #include #include +#include "core/framework/float16.h" +#include "core/mlas/inc/mlas.h" + template -std::vector RandomVectorUniform( +typename std::enable_if_t, std::vector> +RandomVectorUniform( size_t N, ElementType min_value = std::numeric_limits::lowest(), ElementType max_value = std::numeric_limits::max()) { @@ -26,6 +30,25 @@ std::vector RandomVectorUniform( return r; } +template +typename std::enable_if_t, std::vector> +RandomVectorUniform( + size_t N, + ElementType min_value, + ElementType max_value) { + if (min_value.ToFloat() >= max_value.ToFloat()) { + return std::vector(N, min_value); + } + std::default_random_engine generator(static_cast(N)); + std::uniform_real_distribution distribution(min_value.ToFloat(), max_value.ToFloat()); + + std::vector r(N); + for (size_t i = 0; i < N; i++) { + r[i] = ElementType(distribution(generator)); + } + return r; +} + std::vector RandomVectorUniform(std::vector shape, float min_value, float max_value); std::vector BenchArgsVector(benchmark::State& state, size_t& start, size_t count); diff --git a/onnxruntime/test/mlas/unittest/test_hqnbitgemm_neon.cpp b/onnxruntime/test/mlas/unittest/test_hqnbitgemm_neon.cpp new file mode 100644 index 0000000000000..a455007c2f6ae --- /dev/null +++ b/onnxruntime/test/mlas/unittest/test_hqnbitgemm_neon.cpp @@ -0,0 +1,504 @@ +/*++ + +Copyright (c) Microsoft Corporation. All rights reserved. + +Licensed under the MIT License. + +Module Name: + + test_hqnbitgemm_neon.cpp + +Abstract: + + Tests for MLAS n-bit int block quantized GEMM on ARM CPU with input A type T1 fp16. + +--*/ + +#include +#include + +#include "test_util.h" +#include "core/mlas/lib/mlasi.h" +#include "core/mlas/lib/qnbitgemm.h" +#include "mlas_qnbit.h" + +#if defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) + +class MlasNeonFp16CastTest : public MlasTestBase { + private: + MatrixGuardBuffer fp32Buffer_; + MatrixGuardBuffer fp16Buffer_; + + template + void TestFp16ToFp32() { + const auto* src = fp16Buffer_.GetFilledBuffer(count, [](unsigned short* start, size_t size) { + for (size_t i = 0; i < size; i++) { + start[i] = static_cast(i); + } + }); + auto* dest = fp32Buffer_.GetBuffer(count, true); + + MlasCastF16ToF32KernelNeon(src, dest, count); + + for (size_t i = 0; i < count; i++) { + if ((src[i] & 0x1c00) == 0x1c00) continue; // skip inf and nan + ASSERT_EQ(dest[i], MLAS_FP16::FromBits(src[i]).ToFloat()); + } + } + + template + void TestFp32ToFp16() { + const auto* src = fp32Buffer_.GetFilledBuffer(count, [](float* p, size_t size) { + for (size_t i = 0; i < size; i++) { + p[i] = static_cast(i) + 0.125f; + } + }); + auto* dest = fp16Buffer_.GetBuffer(count, true); + + MlasCastF32ToF16KernelNeon(src, dest, count); + + for (size_t i = 0; i < count; i++) { + ASSERT_EQ(dest[i], MLAS_FP16(src[i]).val); + } + } + + public: + static const char* GetTestSuiteName() { + return "NeonFp16Cast"; + } + + void ExecuteShort(void) override { + TestFp16ToFp32<(1 << 16)>(); + TestFp16ToFp32<1>(); + TestFp16ToFp32<4>(); + TestFp16ToFp32<7>(); + TestFp32ToFp16<(1 << 16)>(); + TestFp32ToFp16<3>(); + TestFp32ToFp16<4>(); + TestFp32ToFp16<6>(); + } +}; + +class MlasNeonFp16PrepackTest : public MlasTestBase { + private: + std::random_device rd_; // a seed source for the random number engine + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_int_distribution<> distrib_; + MatrixGuardBuffer input_, ref_, packed_; + + template + MLAS_FORCEINLINE void Transpose8x8(const uint8_t* src, size_t n, size_t k, uint8_t* dst) { + for (size_t c = 0; c < 8; c++) { + for (size_t r = 0; r < 8; r++) { + size_t i = (n + c) * Ldb + r + k; + size_t j = n * Ldb + (r + k) * 8 + c; + dst[j] = src[i]; + } + } + } + + MLAS_FORCEINLINE + uint8_t GetInt4(uint8_t v, size_t i) { + return (i & 1) ? (v >> 4) : (v & 0x0f); + } + + MLAS_FORCEINLINE + void PrepackSlice(const uint8_t* src, size_t j, uint8_t* dst) { + for (size_t i = 0; i < 8; i++) { + uint8_t v0 = GetInt4(src[j + (i >> 1)], i); + uint8_t v1 = GetInt4(src[j + ((8 + i) >> 1)], i + 8); + dst[j + i] = v0 | (v1 << 4); + } + } + + template + MLAS_FORCEINLINE void Prepack(const uint8_t* src, uint8_t* dst) { + size_t n = 0; + for (; n + 8 <= N; n += 8) { + for (size_t k = 0; k < Ldb; k += 8) { + Transpose8x8(src, n, k, dst); + } + } + + for (; n < N; ++n) { + for (size_t k = 0; k < Ldb; k += 8) { + PrepackSlice(src, n * Ldb + k, dst); + } + } + } + + template + MLAS_FORCEINLINE void Check(const uint8_t* packed, const uint8_t* ref) { + size_t n = 0; + for (; n + 8 <= N; n += 8) { + for (size_t i = 0; i < K; i += 2) { + for (size_t j = 0; j < 8; ++j) { + ASSERT_EQ(packed[n * Ldb + (i >> 1) * 8 + j], ref[n * Ldb + (i >> 1) * 8 + j]) + << " seed " << seed_ + << " n " << n << " i " << i << " j " << j; + } + } + } + + for (; n < N; ++n) { + for (size_t i = 0; i < K; i += 2) { + ASSERT_EQ(packed[n * Ldb + (i >> 1)], ref[n * Ldb + (i >> 1)]) + << " seed " << seed_ + << " n " << n << " i " << i; + } + } + } + + template + void TestPrepack() { + constexpr size_t Bits = 4; + constexpr size_t Ldb = (((K + BlkLen - 1) & (~(BlkLen - 1))) * Bits + 7) / 8; + constexpr size_t BufferSize = N * Ldb; + auto InitializeBuffer = [this](uint8_t* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = static_cast(distrib_(gen_)); + } + }; + + const auto* input = input_.GetFilledBuffer(BufferSize, InitializeBuffer); + auto* packed = packed_.GetBuffer(BufferSize, true); + auto* ref = ref_.GetBuffer(BufferSize, true); + MlasQNBitGemmPackQuantBData( + N, K, Bits, BlkLen, MLAS_QNBIT_GEMM_COMPUTE_TYPE::HQNBIT_CompFp16, input, packed, + nullptr, false, nullptr, nullptr); + Prepack(input, ref); + Check(packed, ref); + } + + public: + MlasNeonFp16PrepackTest() + : seed_(rd_()), gen_(seed_), distrib_(0, 255) { + } + + static const char* GetTestSuiteName() { + return "NeonFp16Prepack"; + } + + void ExecuteShort(void) override { + TestPrepack<1, 1, 16>(); + TestPrepack<1, 15, 16>(); + TestPrepack<1, 31, 16>(); + TestPrepack<8, 1, 16>(); + TestPrepack<8, 16, 16>(); + TestPrepack<9, 31, 16>(); + TestPrepack<9, 33, 32>(); + TestPrepack<15, 33, 16>(); + TestPrepack<17, 67, 16>(); + TestPrepack<17, 96, 128>(); + TestPrepack<263, 263, 16>(); + } +}; + +class MlasNeonFp16DequantBTest : public MlasTestBase { + private: + std::random_device rd_; // a seed source for the random number engine + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + std::uniform_int_distribution<> distrib_; + std::uniform_real_distribution _distribFp; + MatrixGuardBuffer input_, zero_points_; + MatrixGuardBuffer dequant_, ref_, scales_; + + MLAS_FORCEINLINE + uint8_t GetInt4(uint8_t v, size_t i) { + return (i & 1) ? (v >> 4) : (v & 0x0f); + } + + template + void DequantB(const uint8_t* src, MLAS_FP16* dst, const MLAS_FP16* scales, const uint8_t* zero_points) { + constexpr size_t blkNum = (K + BlkLen - 1) / BlkLen; + constexpr size_t ld_src = (blkNum * BlkLen + 1) / 2; + constexpr size_t ld_dst = blkNum * BlkLen; + constexpr size_t ld_zp = (blkNum + 1) / 2; + size_t n = 0; + for (; n + 8 <= N; n += 8) { + size_t i_src = n * ld_src, i_dst = n * ld_dst, i_scale = n * blkNum, i_zp = n * ld_zp; + for (size_t blk = 0; blk < blkNum; i_zp += (blk & 1), ++blk, ++i_scale) { + for (size_t i = 0; i < BlkLen; i += 2, i_dst += 8) { + for (size_t j = 0; j < 8; ++j, ++i_src, ++i_dst) { + uint8_t v = src[i_src]; + float v0 = static_cast(GetInt4(v, 0)); + float v1 = static_cast(GetInt4(v, 1)); + float zp = static_cast(UseZeroPoints ? GetInt4(zero_points[i_zp + ld_zp * j], blk) : 8); + float scale = scales[i_scale + blkNum * j]; + dst[i_dst] = MLAS_FP16(v0 * scale - zp * scale); + dst[i_dst + 8] = MLAS_FP16(v1 * scale - zp * scale); + } + } + } + } + + for (; n < N; ++n) { + size_t i_src = n * ld_src, i_dst = n * ld_dst, i_scale = n * blkNum, i_zp = n * ld_zp; + for (size_t blk = 0; blk < blkNum; i_zp += (blk & 1), ++blk, ++i_scale) { + float zp = static_cast(UseZeroPoints ? GetInt4(zero_points[i_zp], blk) : 8); + float scale = scales[i_scale]; + for (size_t i = 0; i < BlkLen; i += 16, i_dst += 8) { + for (size_t j = 0; j < 16; j += 2, ++i_src, ++i_dst) { + uint8_t v = src[i_src]; + float v0 = static_cast(GetInt4(v, 0)); + float v1 = static_cast(GetInt4(v, 1)); + dst[i_dst] = MLAS_FP16(v0 * scale - zp * scale); + dst[i_dst + 8] = MLAS_FP16(v1 * scale - zp * scale); + } + } + } + } + } + + MLAS_FORCEINLINE + bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) { + float f0 = std::abs(v0.ToFloat()), f1 = std::abs(v1.ToFloat()); + return std::abs(f0 - f1) <= f1 * rtol + atol; + } + + template + MLAS_FORCEINLINE void Check(const MLAS_FP16* target, const MLAS_FP16* ref) { + size_t n = 0; + for (; n + 8 <= N; n += 8) { + for (size_t i = 0; i < K; ++i) { + for (size_t j = 0; j < 8; ++j) { + size_t idx = n * Ldb + i * 8 + j; + ASSERT_TRUE(FloatEqual(target[idx], ref[idx], 0.01f, 0.01f)) + << " seed " << seed_ + << " v0 " << target[idx] << " v1 " << ref[idx] + << " n " << n << " i " << i << " j " << j; + } + } + } + + for (; n < N; ++n) { + for (size_t i = 0; i < K; ++i) { + size_t idx = n * Ldb + i; + ASSERT_TRUE(FloatEqual(target[idx], ref[idx], 0.01f, 0.01f)) + << " seed " << seed_ + << " v0 " << target[idx] << " v1 " << ref[idx] + << " n " << n << " i " << i; + } + } + } + + template + void TestDequant() { + constexpr size_t BlkNum = (K + BlkLen - 1) / BlkLen; + constexpr size_t BCount = BlkNum * BlkLen * N; + constexpr size_t ScaleCount = N * BlkNum; + constexpr size_t ZpSize = N * ((BlkNum + 1) / 2); + + auto InitializeBuffer_i8 = [this](uint8_t* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = static_cast(distrib_(gen_)); + } + }; + + auto InitializeBuffer_fp16 = [this](MLAS_FP16* buffer, size_t count) { + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(_distribFp(gen_)); + } + }; + + const auto* input = input_.GetFilledBuffer(BCount / 2, InitializeBuffer_i8); + const auto* zero_points = zero_points_.GetFilledBuffer(ZpSize, InitializeBuffer_i8); + auto* dequant = dequant_.GetBuffer(BCount); + auto* ref = ref_.GetBuffer(BCount); + const auto* scales = scales_.GetFilledBuffer(ScaleCount, InitializeBuffer_fp16); + GetMlasPlatform().QNBitGemmDispatch->HQ4BitBlkDequantBForHgemm_CompFp16( + BlkLen, dequant, reinterpret_cast(input), scales, + UseZeroPoints ? reinterpret_cast(zero_points) : nullptr, + N, K, BlkNum); + DequantB(input, ref, scales, zero_points); + Check(dequant, ref); + } + + public: + MlasNeonFp16DequantBTest() + : seed_(rd_()), gen_(seed_), distrib_(0, 255), _distribFp(0.5f, 2.0f) { + } + + static const char* GetTestSuiteName() { + return "NeonFp16DequantB"; + } + + void ExecuteShort(void) override { + TestDequant<1, 1, 16, false>(); + TestDequant<1, 1, 16, true>(); + TestDequant<1, 15, 16, false>(); + TestDequant<1, 15, 16, true>(); + TestDequant<1, 31, 16, false>(); + TestDequant<1, 31, 16, true>(); + TestDequant<8, 1, 16, false>(); + TestDequant<8, 1, 16, true>(); + TestDequant<8, 16, 16, false>(); + TestDequant<8, 16, 16, true>(); + TestDequant<9, 31, 16, false>(); + TestDequant<9, 31, 16, true>(); + TestDequant<9, 33, 32, false>(); + TestDequant<9, 33, 32, true>(); + TestDequant<15, 33, 16, false>(); + TestDequant<15, 33, 16, true>(); + TestDequant<17, 67, 16, false>(); + TestDequant<17, 67, 16, true>(); + TestDequant<17, 96, 128, false>(); + TestDequant<17, 96, 128, true>(); + TestDequant<263, 263, 16, false>(); + TestDequant<263, 263, 16, true>(); + } +}; + +class MlasNeonFp16HQ4BitGemmKernelTest : public MlasTestBase { + private: + std::random_device rd_; // a seed source for the random number engine + unsigned int seed_; + std::mt19937 gen_; // mersenne_twister_engine seeded with rd() + MatrixGuardBuffer A_, B_, C_, ref_, bias_; + + MLAS_FORCEINLINE + void InitializeBuffer(MLAS_FP16* buffer, float min, float max, size_t count) { + std::uniform_real_distribution distrib(min, max); + for (size_t i = 0; i < count; i++) { + buffer[i] = MLAS_FP16(distrib(gen_)); + } + } + + MLAS_FORCEINLINE + bool FloatEqual(MLAS_FP16 v0, MLAS_FP16 v1, float rtol, float atol) { + float f0 = v0.ToFloat(), f1 = v1.ToFloat(); + return std::abs(f0 - f1) <= std::abs(f1 * rtol) + atol; + } + + template + float GetBVal(const MLAS_FP16* B, size_t n, size_t k) { + size_t i; + if ((N & (~7)) > n) { + size_t full8 = n & (~7); + i = full8 * ldb + 8 * k + (n - full8); + } else { + i = n * ldb + k; + } + return B[i].ToFloat(); + } + + template + void MatMul(const MLAS_FP16* A, const MLAS_FP16* B, const MLAS_FP16* bias, MLAS_FP16* C) { + for (size_t m = 0; m < M; ++m) { + for (size_t n = 0; n < N; ++n) { + float accu = UseBias ? bias[n] : 0.0f; + for (size_t k = 0; k < K; ++k) { + float a = A[m * K + k].ToFloat(); + float b = GetBVal(B, n, k); + accu = accu + a * b; + } + C[m * N + n] = MLAS_FP16(accu); + } + } + } + + template + MLAS_FORCEINLINE void Check(const MLAS_FP16* target, const MLAS_FP16* ref) { + for (size_t m = 0; m < M; ++m) { + for (size_t n = 0; n < N; ++n) { + size_t i = m * Ldc + n; + ASSERT_TRUE(FloatEqual(target[i], ref[i], 0.015f, 0.03f)) + << " seed " << seed_ + << " v0 " << target[i] << " v1 " << ref[i] + << " m " << m << " n " << n; + } + } + } + + template + void TestHQ4BitGemmKernel() { + static_assert(M <= 2); + constexpr size_t BlkNum = (K + BlkLen - 1) / BlkLen; + constexpr size_t ldb = BlkNum * BlkLen; + + const auto* A = A_.GetFilledBuffer(M * K, [this](MLAS_FP16* p, size_t t) { + InitializeBuffer(p, -0.25f, 0.25f, t); + }); + const auto* B = B_.GetFilledBuffer(ldb * N, [this](MLAS_FP16* p, size_t t) { + InitializeBuffer(p, -0.25f, 0.25f, t); + }); + auto* C = C_.GetBuffer(M * N, true); + auto* ref = ref_.GetBuffer(M * N, true); + auto* bias = bias_.GetFilledBuffer(N, [this](MLAS_FP16* p, size_t t) { + InitializeBuffer(p, -5.0f, 5.0f, t); + }); + + GetMlasPlatform().QNBitGemmDispatch->HQ4BitGemmKernel_CompFp16( + A, B, UseBias ? bias : nullptr, C, M, N, K, K, ldb, N); + + MatMul(A, B, bias, ref); + Check(C, ref); + } + + public: + MlasNeonFp16HQ4BitGemmKernelTest() + : seed_(rd_()), gen_(seed_) { + } + + static const char* GetTestSuiteName() { + return "NeonFp16HQ4BitGemmKernel"; + } + + template + void ExecuteShort_T(void) { + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + TestHQ4BitGemmKernel(); + } + + void ExecuteShort(void) override { + ExecuteShort_T<1>(); + ExecuteShort_T<2>(); + } +}; + +static UNUSED_VARIABLE bool added_to_main = AddTestRegister([](bool is_short_execute) { + size_t count = 0; + if (is_short_execute) { + count += MlasDirectShortExecuteTests::RegisterShortExecute(); + if (GetMlasPlatform().QNBitGemmDispatch) { + if (GetMlasPlatform().QNBitGemmDispatch->HQ4BitGemmPackQuantBData) { + count += MlasDirectShortExecuteTests::RegisterShortExecute(); + } + if (GetMlasPlatform().QNBitGemmDispatch->HQ4BitBlkDequantBForHgemm_CompFp16) { + count += MlasDirectShortExecuteTests::RegisterShortExecute(); + } + if (GetMlasPlatform().QNBitGemmDispatch->HQ4BitGemmKernel_CompFp16) { + count += MlasDirectShortExecuteTests::RegisterShortExecute(); + } + } + } + return count; +}); + +#endif // defined(MLAS_F16VEC_INTRINSICS_SUPPORTED) && defined(MLAS_TARGET_ARM64) diff --git a/onnxruntime/test/mlas/unittest/test_util.h b/onnxruntime/test/mlas/unittest/test_util.h index 8eecda900ff27..a000e353f370d 100644 --- a/onnxruntime/test/mlas/unittest/test_util.h +++ b/onnxruntime/test/mlas/unittest/test_util.h @@ -115,24 +115,24 @@ class MatrixGuardBuffer { return GetFilledBuffer( Elements, [](T* start, size_t size) { - std::fill_n(start, size, T(0)); + std::fill_n(start, size, T(0.0f)); }); } return GetFilledBuffer( Elements, [](T* start, size_t size) { - constexpr int offset = -21; - constexpr int range = 43; + constexpr float offset = -21.f; + constexpr float range = 43.f; - int FillValue = 11; + float FillValue = 11.f; T* FillAddress = start; for (size_t i = 0; i < size; i++) { auto itemv = FillValue - offset; *FillAddress++ = (T)(itemv); - FillValue += 7; - FillValue %= range; + FillValue += 7.f; + FillValue = FillValue >= range ? FillValue - range : FillValue; } }); } From 59280095539aa721096cb85045a4a4b267de33a1 Mon Sep 17 00:00:00 2001 From: Peishen Yan Date: Sat, 16 Nov 2024 09:58:35 +0800 Subject: [PATCH 08/28] [WebNN EP] Support Einsum op (#19558) Adds support for einsum via WebNN matmul, transpose, reshape, reducesum, identity and element-wise binary ops. --- js/web/docs/webnn-operators.md | 1 + .../core/providers/webnn/builders/helper.h | 1 + .../webnn/builders/impl/einsum_op_builder.cc | 793 ++++++++++++++++++ .../webnn/builders/op_builder_factory.cc | 4 + .../webnn/builders/op_builder_factory.h | 1 + 5 files changed, 800 insertions(+) create mode 100644 onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc diff --git a/js/web/docs/webnn-operators.md b/js/web/docs/webnn-operators.md index 6b13658874b4d..2ff127e9a0bf2 100644 --- a/js/web/docs/webnn-operators.md +++ b/js/web/docs/webnn-operators.md @@ -29,6 +29,7 @@ operators and the supported opset domain/versions in **WebNN EP** by ONNX Runtim | Div | ai.onnx(7-12, 13, 14+) | div | ✓ | ✓ | | | DequantizeLinear | ai.onnx(10-12, 13-18, 19-20, 21-22, 23+) | dequantizeLinear | ✗ | ✓ | | | Dropout | ai.onnx(7-9, 10-11, 12, 13-21, 22+) | identity | ✓ | ✓ | Only supports test mode | +| Einsum | ai.onnx(12+) | reshape, transpose, matmul, reduceSum, mul, triangular | ✓ | ✓ | | | Elu | ai.onnx(7+) | elu | ✓ | ✓ | WebNN CPU backend only supports 'alpha' value is 1.0 | | Equal | ai.onnx(7-10, 11-12, 13-18, 19+) | equal | ✓ | ✓ | | | Erf | ai.onnx(7-9, 10-12, 13+) | erf | ✓ | ✓ | | diff --git a/onnxruntime/core/providers/webnn/builders/helper.h b/onnxruntime/core/providers/webnn/builders/helper.h index 218677a1945d5..aa84fb0bd43af 100644 --- a/onnxruntime/core/providers/webnn/builders/helper.h +++ b/onnxruntime/core/providers/webnn/builders/helper.h @@ -210,6 +210,7 @@ static const InlinedHashMap op_map = { {"DequantizeLinear", "dequantizeLinear"}, {"Dropout", "identity"}, {"DynamicQuantizeLinear", "dynamicQuantizeLinear"}, + {"Einsum", "matmul"}, {"Elu", "elu"}, {"Equal", "equal"}, {"Erf", "erf"}, diff --git a/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc new file mode 100644 index 0000000000000..931854d0f33c1 --- /dev/null +++ b/onnxruntime/core/providers/webnn/builders/impl/einsum_op_builder.cc @@ -0,0 +1,793 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Copyright (c) Intel Corporation. All rights reserved. +// Licensed under the MIT License. + +#include "core/common/safeint.h" +#include "core/framework/tensorprotoutils.h" +#include "core/optimizer/initializer.h" +#include "core/providers/common.h" +#include "core/providers/cpu/tensor/reshape_helper.h" +#include "core/providers/shared/utils/utils.h" +#include "core/providers/webnn/builders/helper.h" +#include "core/providers/webnn/builders/model_builder.h" +#include "core/providers/webnn/builders/op_builder_factory.h" + +#include "base_op_builder.h" + +namespace onnxruntime { +namespace webnn { + +class EinsumOpBuilder : public BaseOpBuilder { + // Add operator related. + + private: + Status AddToModelBuilderImpl(ModelBuilder& model_builder, const Node& node, + const logging::Logger& logger) const override ORT_MUST_USE_RESULT; + + // Operator support related. + bool IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, const Node& node, + const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + bool HasSupportedInputsImpl(const Node& node, const emscripten::val& wnn_limits, + const logging::Logger& logger) const override; +}; + +// Helper functions, thanks for DML EP's OperatorHelper. +enum class RecognizedOperatorType { + None, + Identity, + ReduceSum, + Transpose, + Diagonal, + Multiply, + Pairwise, + Total, +}; + +struct RecognizedOperatorInfo { + RecognizedOperatorType recognized_operator_type; + std::initializer_list component_ranks; + std::initializer_list label_indices; +}; + +struct Component { + uint32_t label_index_begin; + uint32_t label_index_end; + + uint32_t GetDimensionCount() const noexcept { + return label_index_end - label_index_begin; + } + gsl::span GetLabels(gsl::span labels) const { + return labels.subspan(label_index_begin, label_index_end - label_index_begin); + } +}; + +bool ParseEquationComponents(const Node& node, + const std::string_view equation, + std::vector& label_indices, + std::vector& components, + std::vector& output_dimensions, + uint32_t& num_labels, + const logging::Logger& logger) { + // Parse an equation like 'ij,jk->ik' into components {ij, jk, ik} mapping letters to + // numeric indices {(0,1}, {1,2}, {0,2}}. The last component is the output. + // Read first to last character in equation, looking for letters, commas, and one arrow. + // The ellipsis is not supported. + std::map label_maps; + std::set repeated_labels; + + num_labels = 0; + Component current_component = {}; + bool at_output = false; + bool end_flag = false; + + for (const char* it = equation.data(); !end_flag; ++it) { + // std::string.data() promises the end of the string is '\0' + char ch = *it; + + if ((ch >= 'a' && ch <= 'z') || (ch >= 'A' && ch <= 'Z')) { + const auto [i, inserted] = label_maps.insert({ch, num_labels}); + if (inserted) { + if (at_output) { + LOGS(logger, VERBOSE) << "Found label in equation output not matching any label from inputs."; + return false; + } + ++num_labels; + } else if (!at_output) { + repeated_labels.insert(ch); + } + label_indices.push_back(i->second); + } else if (ch == ' ') { + continue; + } else { + current_component.label_index_end = static_cast(label_indices.size()); + components.push_back(current_component); + current_component.label_index_begin = current_component.label_index_end; + + switch (ch) { + case ',': + break; + + case '-': + ++it; + if (*it != '>') { + LOGS(logger, VERBOSE) << "Expected '->' for output."; + return false; + } + if (at_output) { + LOGS(logger, VERBOSE) << "Only one output arrow '->' is valid."; + return false; + } + at_output = true; + break; + + case '.': + // Ellipsis is unsupported + LOGS(logger, VERBOSE) << "Ellipsis is unsupported."; + return false; + + case '\0': + end_flag = true; + break; // End of string. + + default: + LOGS(logger, VERBOSE) << "Unsupported character in equation string."; + return false; + } + } + } + + // If no explicit output was given, generate an implicit output by ordering all the + // labels in alphabetic order (by ASCII value consistent with numpy, so Z < a). + // Exclude any labels that occurred more than once, as these cancel out. + if (!at_output) { + for (auto i : label_maps) { + if (repeated_labels.count(i.first) == 0) { + label_indices.push_back(i.second); + } + } + + current_component.label_index_end = static_cast(label_indices.size()); + components.push_back(current_component); + } + return true; +} + +// For two inputs A,B and one output C +Status PairwiseOperandProcess(ModelBuilder& model_builder, + const Node& node, + const std::vector& label_indices, + const std::vector& components, + const std::vector& output_dimensions, + uint32_t num_labels, + emscripten::val& output, + const logging::Logger& logger) { + auto input_a_labels = components[0].GetLabels(label_indices); + auto input_b_labels = components[1].GetLabels(label_indices); + auto output_labels = components[2].GetLabels(label_indices); + + /* + Step 1. Transpose and Reshape + + (0/1,0/1,0/1) means dim i whether appears in (A,B,C) + For new A, it has three segments [...a_1..., a_2, a_3], a_1 has multiple dims, a_2 and a_3 only have one dim respectively + For new B, it has three segments [...b_1..., b_2, b_3], b_1 has multiple dims, b_2 and b_3 only have one dim respectively + a_1 and b_1 are batch dims, and [a_2,a_3], [b_2,b_3] are for matmul + + case (1,0,0) and (0,1,0): reduce, here we treat it as batch dimension, and reduceSum at the end. + add additional dim for B/A + case (1,1,1): batch dimension, put it in the front. + case (1,0,1): gemm dim for A, put it in a_2 + case (0,1,1): gemm dim for B, put it in b_3 + case (1,1,0): summation dim / gemm dim for both A and B, put it in a_3 and b_2 + + Notes: + # of (1,1,0) maybe > 1, flatten / reshape a_3 and b_2 + # of (1,1,0) maybe = 0, add one additional dim for a_3 and b_2 + */ + + // The index in input/output of the dim index + std::map input_a_axes_map, input_b_axes_map, output_axes_map; + + for (uint32_t i = 0; i <= num_labels + 1; ++i) { + input_a_axes_map[i] = input_b_axes_map[i] = output_axes_map[i] = -1; + } + int32_t index = 0; + for (auto axis : input_a_labels) { + input_a_axes_map[axis] = index++; + } + index = 0; + for (auto axis : input_b_labels) { + input_b_axes_map[axis] = index++; + } + index = 0; + for (auto axis : output_labels) { + output_axes_map[axis] = index++; + } + + // Inputs Reshape + // a_0 = [a_1,a_2,a_3], b_0 = [b_1,b_2,b_3] + std::vector a_0, a_1, a_2, a_3, b_0, b_1, b_2, b_3; + uint32_t a_idx = input_a_labels.size(); + uint32_t b_idx = input_b_labels.size(); + bool a_flag = false; // whether a_2 has element + bool b_flag = false; // whether b_3 has element + + for (uint32_t i = 0; i < num_labels; ++i) { + if (input_a_axes_map[i] != -1) { + if (input_b_axes_map[i] != -1) { + if (output_axes_map[i] != -1) { + // The index in input/output of the dim index + a_1.push_back(i); + b_1.push_back(i); + } else { + // (1,1,0) push back in the middle for b and end for a + a_3.push_back(i); + b_2.push_back(i); + } + } else { + // (1,0,x) push back in the middle for a. If more than one, push back in the front for a, b. + if (a_flag) { + a_1.push_back(i); + b_1.push_back(i); + input_b_axes_map[i] = b_idx++; + } else { + a_2.push_back(i); + a_flag = true; + } + } + } else { + // (0,1,x) push back in the end for b. If more than one, push back in the front for a, b. + if (input_b_axes_map[i] != -1) { + if (b_flag) { + a_1.push_back(i); + b_1.push_back(i); + input_a_axes_map[i] = a_idx++; + } else { + b_3.push_back(i); + b_flag = true; + } + } + } + } + + // Matrix multiplication can be formatted in (...,i,j) * (...,j,k) ==> (...,i,k) + // Even inner and outer product can be reformatted as this. + // Inner product (1,i) * (i,1) ==> (1,1) + // Outer product (i,1) * (1,j) ==> (i,j) + // i.e., in our expression, (a_2,a_3) * (b_2,b_3) ==> (a_2,b_3) + + if (!a_flag) { + // Lack of a_2 element, add a new a_2, whose dim value = 1 + a_2.push_back(num_labels + 1); + input_a_axes_map[num_labels + 1] = a_idx++; + } + if (!b_flag) { + // Lack of b_3 element, add a new b_3, whose dim value = 1 + b_3.push_back(num_labels + 2); + input_b_axes_map[num_labels + 2] = b_idx++; + b_idx++; + } + + if (a_3.empty()) { + // Lack of a_3 and b_2 elements, add a new a_3 for A and a new b_2 for B, whose dim value = 1 + a_3.push_back(num_labels); + b_2.push_back(num_labels); + input_a_axes_map[num_labels] = a_idx; + input_b_axes_map[num_labels] = b_idx; + } + + a_0 = a_1; + b_0 = b_1; + a_0.insert(a_0.end(), a_2.begin(), a_2.end()); + a_0.insert(a_0.end(), a_3.begin(), a_3.end()); + b_0.insert(b_0.end(), b_2.begin(), b_2.end()); + b_0.insert(b_0.end(), b_3.begin(), b_3.end()); + + std::vector permutation_a, permutation_b; + for (uint32_t i = 0; i < a_0.size(); ++i) { + permutation_a.push_back(static_cast(input_a_axes_map[a_0[i]])); + permutation_b.push_back(static_cast(input_b_axes_map[b_0[i]])); + } + + const auto& input_defs = node.InputDefs(); + emscripten::val input_a = model_builder.GetOperand(input_defs[0]->Name()); + emscripten::val input_b = model_builder.GetOperand(input_defs[1]->Name()); + std::vector new_a_shape, new_b_shape; + if (input_a_labels.size() < a_0.size()) { + std::vector input_a_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_a_shape, logger), "Cannot get shape"); + std::transform(input_a_shape.begin(), input_a_shape.end(), std::back_inserter(new_a_shape), + [](int64_t i) { return static_cast(i); }); + for (uint32_t i = 0; i < a_0.size() - input_a_labels.size(); ++i) { + new_a_shape.push_back(SafeInt(1)); + } + + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name() + "_reshape"); + input_a = model_builder.GetBuilder().call("reshape", + input_a, + emscripten::val::array(new_a_shape), + options); + } + if (input_b_labels.size() < b_0.size()) { + std::vector input_b_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[1], input_b_shape, logger), "Cannot get shape"); + std::transform(input_b_shape.begin(), input_b_shape.end(), std::back_inserter(new_b_shape), + [](int64_t i) { return static_cast(i); }); + for (uint32_t i = 0; i < b_0.size() - input_b_labels.size(); ++i) { + new_b_shape.push_back(SafeInt(1)); + } + + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name() + "_reshape"); + input_b = model_builder.GetBuilder().call("reshape", + input_b, + emscripten::val::array(new_b_shape), + options); + } + + // Inputs Transpose + std::vector sequence(permutation_a.size()); + std::iota(sequence.begin(), sequence.end(), 0); + if (permutation_a != sequence) { + emscripten::val options = emscripten::val::object(); + options.set("permutation", emscripten::val::array(permutation_a)); + options.set("label", node.Name() + "_transpose"); + input_a = model_builder.GetBuilder().call("transpose", input_a, options); + } + if (permutation_b != sequence) { + emscripten::val options = emscripten::val::object(); + options.set("permutation", emscripten::val::array(permutation_b)); + options.set("label", node.Name() + "_transpose"); + input_b = model_builder.GetBuilder().call("transpose", input_b, options); + } + + // Input Reshape: if the number of (1,1,0) > 1, flatten the b_2 and a_3 dims. + if (a_3.size() > 1) { + if (new_a_shape.empty()) { + std::vector input_a_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_a_shape, logger), "Cannot get shape"); + std::transform(input_a_shape.begin(), input_a_shape.end(), std::back_inserter(new_a_shape), + [](int64_t i) { return static_cast(i); }); + } + if (new_b_shape.empty()) { + std::vector input_b_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[1], input_b_shape, logger), "Cannot get shape"); + std::transform(input_b_shape.begin(), input_b_shape.end(), std::back_inserter(new_b_shape), + [](int64_t i) { return static_cast(i); }); + } + std::vector new_new_a_shape, new_new_b_shape; + uint32_t a_dim = 1, b_dim = 1; + for (auto idx : a_1) { + new_new_a_shape.push_back(new_a_shape[idx]); + } + for (auto idx : a_2) { + new_new_a_shape.push_back(new_a_shape[idx]); + } + for (auto idx : a_3) { + a_dim *= new_a_shape[idx]; + } + new_new_a_shape.push_back(a_dim); + for (auto idx : b_1) { + new_new_b_shape.push_back(new_b_shape[idx]); + } + for (auto idx : b_2) { + b_dim *= new_b_shape[idx]; + } + new_new_b_shape.push_back(b_dim); + for (auto idx : b_3) { + new_new_b_shape.push_back(new_b_shape[idx]); + } + + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name() + "_reshape"); + input_a = model_builder.GetBuilder().call("reshape", + input_a, + emscripten::val::array(new_new_a_shape), + options); + input_b = model_builder.GetBuilder().call("reshape", + input_b, + emscripten::val::array(new_b_shape), + options); + } + + // Step 2. Matmul + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name() + "_matmul"); + output = model_builder.GetBuilder().call("matmul", input_a, input_b, options); + std::vector output_indices = a_1; + output_indices.push_back(a_2.back()); + output_indices.push_back(b_3.back()); + + /* + Step 3. Output Transpose: + Use the following fast permutation calculation algorithm + to calculate the permutation of transpose. + sequence x[] -> sequence y[] : permutation p[] + x[s[i]] = i, y[t[i]] = i, p[t[i]] = s[i] + output_indices is x and target_output_indices is y + */ + std::vector target_output_indices(output_labels.begin(), output_labels.end()); + + // map output dim labels to 0 ~ n-1 + std::vector output_indices_sorted(output_indices.begin(), output_indices.end()); + std::map mapping; + std::sort(output_indices_sorted.begin(), output_indices_sorted.end()); + for (size_t i = 0; i < output_indices_sorted.size(); i++) { + mapping[output_indices_sorted[i]] = i; + } + + for (size_t i = 0; i < output_indices.size(); i++) { + output_indices[i] = mapping[output_indices[i]]; + if (i < target_output_indices.size()) { + target_output_indices[i] = mapping[target_output_indices[i]]; + } + } + + uint32_t pad = target_output_indices.size(); + std::vector s(output_indices.size(), -1); + std::vector t(output_indices.size(), -1); + std::vector p(output_indices.size(), 0); + for (uint32_t i = 0; i < output_indices.size(); ++i) { + s[output_indices[i]] = i; + if (i < target_output_indices.size()) { + t[target_output_indices[i]] = i; + } + } + for (uint32_t i = 0; i < output_indices.size(); ++i) { + if (t[i] == -1) { + t[i] = pad++; + } + p[static_cast(t[i])] = static_cast(s[i]); + } + + std::vector sequence_o(output_indices.size()); + std::iota(sequence_o.begin(), sequence_o.end(), 0); + if (p != sequence_o) { + emscripten::val options = emscripten::val::object(); + options.set("permutation", emscripten::val::array(p)); + options.set("label", node.Name() + "_transpose"); + output = model_builder.GetBuilder().call("transpose", output, options); + } + + // Step 4. Output ReduceSum + if (output_labels.size() < output_indices.size()) { + std::vector axes_data; + for (uint32_t i = output_labels.size(); i < output_indices.size(); ++i) { + axes_data.push_back(SafeInt(i)); + } + emscripten::val options_reduce = emscripten::val::object(); + options_reduce.set("axes", emscripten::val::array(axes_data)); + options_reduce.set("label", node.Name() + "_reduceSum"); + output = model_builder.GetBuilder().call("reduceSum", output, options_reduce); + } + return Status::OK(); +} + +RecognizedOperatorType DetermineRecognizedOperatorType(const std::vector& label_indices, + const std::vector& components, + const std::vector& output_dimensions) { + if (components.empty()) return RecognizedOperatorType::None; + + auto equals = [](gsl::span a, gsl::span b) { + return std::equal(a.begin(), a.end(), b.begin(), b.end()); + }; + + std::array component_ranks; + if (components.size() > component_ranks.size()) { + // So far, not support for more than two inputs and one output. + return RecognizedOperatorType::None; + } else if (components.size() == 2) { // one input + auto input_labels = components[0].GetLabels(label_indices); + auto output_labels = components[1].GetLabels(label_indices); + if (input_labels.size() == output_labels.size()) { + if (equals(input_labels, output_labels)) { + // Identity: input labels = output labels + return RecognizedOperatorType::Identity; + } else { + return RecognizedOperatorType::Transpose; + } + } else if (input_labels.size() == input_labels.back() + 1) { + // ReduceSum: There is no repeated character in input. + return RecognizedOperatorType::ReduceSum; + } else if (input_labels.size() == input_labels.back() + 2) { + // Diagonal: One repeated character in input, ii->i / iij->ij / iijk -> ijk. + return RecognizedOperatorType::Diagonal; + } else { + return RecognizedOperatorType::None; + } + } else if (components.size() == 3) { // two inputs + auto input_A_labels = components[0].GetLabels(label_indices); + auto input_B_labels = components[1].GetLabels(label_indices); + auto output_labels = components[2].GetLabels(label_indices); + if (equals(input_A_labels, output_labels) && equals(input_B_labels, output_labels)) { // element-wise product + return RecognizedOperatorType::Multiply; + } + } + + return RecognizedOperatorType::Pairwise; +} + +// Add operator related. + +Status EinsumOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, + const Node& node, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + emscripten::val output = emscripten::val::object(); + + NodeAttrHelper helper(node); + const auto equation = helper.Get("equation", std::string(" ")); + + std::vector label_indices; + std::vector components; + std::vector output_dimensions; + uint32_t num_labels; + ORT_RETURN_IF_NOT(ParseEquationComponents(node, equation, label_indices, components, output_dimensions, + num_labels, logger), + "Error parsing equation components."); + + RecognizedOperatorType recognized_operator_type = DetermineRecognizedOperatorType(label_indices, components, + output_dimensions); + + switch (recognized_operator_type) { + case RecognizedOperatorType::Multiply: { + emscripten::val a = model_builder.GetOperand(node.InputDefs()[0]->Name()); + emscripten::val b = model_builder.GetOperand(node.InputDefs()[1]->Name()); + emscripten::val options = emscripten::val::object(); + options.set("label", node.Name() + "_mul"); + output = model_builder.GetBuilder().call("mul", a, b, options); + } break; + case RecognizedOperatorType::ReduceSum: { + auto kept_axes = components.back().GetLabels(label_indices); + std::vector reduced_axes; + uint32_t kept_axes_mask = 0; + for (auto axis : kept_axes) { + kept_axes_mask |= (1 << axis); + } + std::vector input_shape; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); + for (uint32_t axis = 0, axis_count = static_cast(input_shape.size()); axis < axis_count; ++axis) { + if (~kept_axes_mask & (1 << axis)) { + reduced_axes.push_back(axis); + } + } + + emscripten::val input = model_builder.GetOperand(node.InputDefs()[0]->Name()); + emscripten::val options = emscripten::val::object(); + options.set("keepDimensions", false); + options.set("axes", emscripten::val::array(reduced_axes)); + options.set("label", node.Name() + "_reduceSum"); + + output = model_builder.GetBuilder().call("reduceSum", input, options); + + // transpose output + std::vector output_labels_sorted(kept_axes.begin(), kept_axes.end()); + std::map mapping; + std::sort(output_labels_sorted.begin(), output_labels_sorted.end()); + + auto equals = [](std::vector a, gsl::span b) { + return std::equal(a.begin(), a.end(), b.begin(), b.end()); + }; + if (equals(output_labels_sorted, kept_axes)) { + break; + } + + for (size_t i = 0; i < output_labels_sorted.size(); i++) { + mapping[output_labels_sorted[i]] = i; + } + std::vector permutation; + for (auto idx : kept_axes) { + permutation.push_back(mapping[idx]); + } + emscripten::val options_transpose = emscripten::val::object(); + options.set("permutation", emscripten::val::array(permutation)); + options.set("label", node.Name() + "_transpose"); + output = model_builder.GetBuilder().call("transpose", output, options); + } break; + case RecognizedOperatorType::Diagonal: { + emscripten::val input = model_builder.GetOperand(node.InputDefs()[0]->Name()); + auto input_labels = components[0].GetLabels(label_indices); + auto output_labels = components[1].GetLabels(label_indices); + uint32_t diagonal_idx_1, diagonal_idx_2; + uint32_t permutation_idx = 0; + for (uint32_t idx = 0; idx < input_labels.size(); idx++) { + if (idx != input_labels[idx]) { + diagonal_idx_1 = input_labels[idx]; + diagonal_idx_2 = idx; + break; + } + } + + // tranpose input + std::vector permutation(input_labels.size()); + for (uint32_t idx = 0; idx < input_labels.size(); idx++) { + if (idx != diagonal_idx_1 && idx != diagonal_idx_2) { + permutation[permutation_idx++] = idx; + } + } + permutation[permutation_idx++] = diagonal_idx_1; + permutation[permutation_idx] = diagonal_idx_2; + + emscripten::val options = emscripten::val::object(); + options.set("permutation", emscripten::val::array(permutation)); + options.set("label", node.Name() + "_transpose"); + output = model_builder.GetBuilder().call("transpose", input, options); + + // triu + tril = diagonal + emscripten::val options_trilu = emscripten::val::object(); + options_trilu.set("label", node.Name() + "_triangular"); + output = model_builder.GetBuilder().call("triangular", output, options_trilu); // triu + options_trilu.set("upper", false); + output = model_builder.GetBuilder().call("triangular", output, options_trilu); // tril + + // reducesum to achieve the diagonal values + std::vector input_shape; + std::vector reduced_axes; + ORT_RETURN_IF_NOT(GetShape(*input_defs[0], input_shape, logger), "Cannot get shape"); + if (input_shape[diagonal_idx_1] > input_shape[diagonal_idx_2]) { + reduced_axes.push_back(input_labels.size() - 2); + } else { + reduced_axes.push_back(input_labels.size() - 1); + } + emscripten::val options_reduce = emscripten::val::object(); + options_reduce.set("keepDimensions", false); + options_reduce.set("axes", emscripten::val::array(reduced_axes)); + options_reduce.set("label", node.Name() + "_reduceSum"); + output = model_builder.GetBuilder().call("reduceSum", output, options_reduce); // triu + + // transpose output + std::vector target_output_indices(output_labels.begin(), output_labels.end()); + std::vector output_indices(permutation.begin(), permutation.end() - 1); + + // Use the fast permutation calculation algorithm mentioned above + std::vector s(output_indices.size(), -1); + std::vector t(output_indices.size(), -1); + std::vector p(output_indices.size(), 0); + for (uint32_t i = 0; i < output_indices.size(); ++i) { + s[output_indices[i]] = i; + t[target_output_indices[i]] = i; + } + for (uint32_t i = 0; i < output_indices.size(); ++i) { + p[static_cast(t[i])] = static_cast(s[i]); + } + + std::vector sequence_o(output_indices.size()); + std::iota(sequence_o.begin(), sequence_o.end(), 0); + if (p != sequence_o) { + emscripten::val options_transpose = emscripten::val::object(); + options.set("permutation", emscripten::val::array(p)); + options.set("label", node.Name() + "_transpose"); + output = model_builder.GetBuilder().call("transpose", output, options); + } + } break; + + case RecognizedOperatorType::Transpose: { + emscripten::val input = model_builder.GetOperand(node.InputDefs()[0]->Name()); + assert(components.front().GetDimensionCount() == components.back().GetDimensionCount()); + // Remap transposed strides using the component labels from input to output. + auto output_labels = components.back().GetLabels(label_indices); + + std::vector permutation{output_labels.begin(), output_labels.end()}; + emscripten::val options = emscripten::val::object(); + options.set("permutation", emscripten::val::array(permutation)); + options.set("label", node.Name() + "_transpose"); + output = model_builder.GetBuilder().call("transpose", input, options); + } break; + + case RecognizedOperatorType::Identity: { + emscripten::val input = model_builder.GetOperand(node.InputDefs()[0]->Name()); + output = input; + } break; + + case RecognizedOperatorType::Pairwise: { + ORT_RETURN_IF_ERROR(PairwiseOperandProcess(model_builder, node, label_indices, components, + output_dimensions, num_labels, output, logger)); + } break; + + default: + break; + } + + model_builder.AddOperand(node.OutputDefs()[0]->Name(), std::move(output)); + return Status::OK(); +} + +// Operator support related. + +bool EinsumOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& /* initializers */, + const Node& node, + const WebnnDeviceType device_type, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + + if (input_defs.size() > 2) { + // TODO: Support more than two inputs. + LOGS(logger, VERBOSE) << "EinSum only supports up to two inputs."; + return false; + } + + NodeAttrHelper helper(node); + const auto equation = helper.Get("equation", std::string(" ")); + std::vector label_indices; + std::vector components; + std::vector output_dimensions; + uint32_t num_labels; + + if (!ParseEquationComponents(node, equation, label_indices, components, + output_dimensions, num_labels, logger)) { + LOGS(logger, VERBOSE) << "EinSum input equation is illegal."; + return false; + } + + if (static_cast(input_defs.size()) + 1 != components.size()) { + LOGS(logger, VERBOSE) << "EinSum input tensor count is inconsistent with the equation component count."; + return false; + } + + RecognizedOperatorType recognized_operator_type = DetermineRecognizedOperatorType(label_indices, components, + output_dimensions); + if (recognized_operator_type == RecognizedOperatorType::None) { + LOGS(logger, VERBOSE) << "The equation is not supported in Einsum."; + return false; + } + + return true; +} + +bool EinsumOpBuilder::HasSupportedInputsImpl(const Node& node, const emscripten::val& wnn_limits, + const logging::Logger& logger) const { + const auto& input_defs = node.InputDefs(); + + const auto& op_type = node.OpType(); + int32_t input0_type; + int32_t input1_type; + bool has_input1 = input_defs.size() > 1 && input_defs[1]->Exists(); + + if (!GetType(*input_defs[0], input0_type, logger) || + (has_input1 && !GetType(*input_defs[1], input1_type, logger))) { + return false; + } + + if (has_input1 && input0_type != input1_type) { + LOGS(logger, VERBOSE) << "[" << op_type + << "] Input data types should be the same."; + return false; + } + + NodeAttrHelper helper(node); + const auto equation = helper.Get("equation", std::string(" ")); + std::vector label_indices; + std::vector components; + std::vector output_dimensions; + uint32_t num_labels; + + if (!ParseEquationComponents(node, equation, label_indices, + components, output_dimensions, num_labels, logger)) { + LOGS(logger, VERBOSE) << "EinSum input equation is illegal."; + return false; + } + + RecognizedOperatorType recognized_operator_type = DetermineRecognizedOperatorType(label_indices, components, + output_dimensions); + + if (recognized_operator_type == RecognizedOperatorType::None) { + LOGS(logger, VERBOSE) << "The equation is not supported in Einsum."; + return false; + } else if (recognized_operator_type == RecognizedOperatorType::Pairwise) { + // Map to WebNN's gemm or matmul + return IsDataTypeSupportedByOp("MatMul", input0_type, wnn_limits, "a", "inputs", logger); + } else if (recognized_operator_type == RecognizedOperatorType::ReduceSum) { + return IsDataTypeSupportedByOp("ReduceSum", input0_type, wnn_limits, "input", "inputs", logger); + } else { + return IsDataTypeSupportedByOp("Identity", input0_type, wnn_limits, "input", "inputs", logger); + } +} + +void CreateEinsumOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { + op_registrations.builders.push_back(std::make_unique()); + op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); +} + +} // namespace webnn +} // namespace onnxruntime diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc index ae095d24b8bf0..4b26ca65f109e 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc +++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.cc @@ -95,6 +95,10 @@ static OpBuilderRegistrations CreateOpBuilderRegistrations() { CreateDynamicQuantizeLinearOpBuilder("DynamicQuantizeLinear", op_registrations); } + { // Einsum + CreateEinsumOpBuilder("Einsum", op_registrations); + } + { // Expand CreateExpandOpBuilder("Expand", op_registrations); } diff --git a/onnxruntime/core/providers/webnn/builders/op_builder_factory.h b/onnxruntime/core/providers/webnn/builders/op_builder_factory.h index a492d3e60f6cb..22bd6cd0cfa9f 100644 --- a/onnxruntime/core/providers/webnn/builders/op_builder_factory.h +++ b/onnxruntime/core/providers/webnn/builders/op_builder_factory.h @@ -30,6 +30,7 @@ void CreateCumSumOpBuilder(const std::string& op_type, OpBuilderRegistrations& o void CreateDropoutOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateDynamicQuantizeLinearOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateExpandOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); +void CreateEinsumOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateFlattenOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGatherOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); void CreateGatherElementsOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations); From 101ed10e5e316a002bdba4e3667a3c323e2d299c Mon Sep 17 00:00:00 2001 From: liqun Fu Date: Sun, 17 Nov 2024 14:51:16 -0800 Subject: [PATCH 09/28] Refactor SkipLayerNorm and handle beta properly (#22862) Signed-off-by: Liqun Fu Signed-off-by: Liqun Fu Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- .../contrib_ops/cpu/skip_layer_norm.cc | 186 ++++++++---------- 1 file changed, 78 insertions(+), 108 deletions(-) diff --git a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc index 3e70f848675cb..d5b8961cf8c5a 100644 --- a/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc +++ b/onnxruntime/contrib_ops/cpu/skip_layer_norm.cc @@ -96,79 +96,6 @@ void ComputeJob( } } -void ComputeJob( - const MLFloat16* input_data, - const MLFloat16* skip_data, - const float* prepacked_skip_fp32_data, - const float* gamma_float_ptr, - const float* beta_float_ptr, - const float* bias_float_ptr, - float* output_float_ptr, - ptrdiff_t task_idx, - int hidden_size, - int64_t skip_size, - float epsilon, - bool simplified, - MLFloat16* output_data, - MLFloat16* skip_input_bias_add_output_data, - AllocatorPtr alloc) { - auto offset = task_idx * hidden_size; - const MLFloat16* p_input = input_data + offset; - MLFloat16* p_output = output_data + offset; - MLFloat16* p_skip_input_bias_add_output = skip_input_bias_add_output_data == nullptr ? nullptr : skip_input_bias_add_output_data + offset; - - float mean(0.0f); - float mean_square(0.0f); - const size_t num_elems = static_cast(hidden_size); - - IAllocatorUniquePtr input_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(p_input, input_float_uptr.get(), num_elems); - - IAllocatorUniquePtr skip_float_uptr = nullptr; - if (prepacked_skip_fp32_data == nullptr && skip_data) { - const MLFloat16* p_skip = skip_data + (offset % skip_size); - skip_float_uptr = IAllocator::MakeUniquePtr(alloc, num_elems); - MlasConvertHalfToFloatBuffer(p_skip, skip_float_uptr.get(), num_elems); - } - - const float* input_float_ptr = input_float_uptr.get(); - const float* skip_float_ptr = prepacked_skip_fp32_data ? prepacked_skip_fp32_data : skip_float_uptr.get(); - for (size_t h = 0; h < num_elems; h++) { - float val = input_float_ptr[h] + skip_float_ptr[h]; - - if (bias_float_ptr) { - val += bias_float_ptr[h]; - } - - output_float_ptr[h] = val; - mean += val; - mean_square += val * val; - } - - if (nullptr != p_skip_input_bias_add_output) { - MlasConvertFloatToHalfBuffer(output_float_ptr, p_skip_input_bias_add_output, num_elems); - } - - mean = mean / hidden_size; - if (simplified) { - mean_square = sqrt(mean_square / hidden_size + epsilon); - } else { - mean_square = sqrt(mean_square / hidden_size - mean * mean + epsilon); - } - - for (size_t h = 0; h < num_elems; h++) { - if (simplified) { - output_float_ptr[h] = output_float_ptr[h] / mean_square * gamma_float_ptr[h]; - } else if (nullptr == beta_float_ptr) { - output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h]; - } else { - output_float_ptr[h] = (output_float_ptr[h] - mean) / mean_square * gamma_float_ptr[h] + beta_float_ptr[h]; - } - } - - MlasConvertFloatToHalfBuffer(output_float_ptr, p_output, num_elems); -} - void ConvertMLFloat16ToFloatIfNeeded(const Tensor& tensor, AllocatorPtr alloc, IAllocatorUniquePtr& dest, bool& is_packed) { if (tensor.GetElementType() == utils::ToTensorProtoElementType()) { auto tensor_data_ptr = tensor.Data(); @@ -200,8 +127,8 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { const Tensor* input = p_ctx->Input(0); const Tensor* skip = prepacked_skip_fp32_data_ ? nullptr : p_ctx->Input(1); const Tensor* gamma = prepacked_gamma_fp32_data_ ? nullptr : p_ctx->Input(2); - const Tensor* beta = prepacked_beta_fp32_data_ ? nullptr : p_ctx->Input(3); - const Tensor* bias = prepacked_bias_fp32_data_ ? nullptr : p_ctx->Input(4); + const Tensor* beta = simplified ? nullptr : (prepacked_beta_fp32_data_ ? nullptr : p_ctx->Input(3)); + const Tensor* bias = prepacked_bias_fp32_data_ ? nullptr : p_ctx->Input(simplified ? 3 : 4); Tensor* output = p_ctx->Output(0, input->Shape()); // For inferencing, we support one more optional output which is the sum of the input and skip tensors Tensor* skip_input_bias_add_output = p_ctx->Output(3, input->Shape()); @@ -232,56 +159,93 @@ Status SkipLayerNorm::Compute(OpKernelContext* p_ctx) const { // For inferencing, we support one more optional output which is the sum of the input and skip tensors T* skip_input_bias_add_output_data = skip_input_bias_add_output == nullptr ? nullptr : skip_input_bias_add_output->MutableData(); - const int64_t skip_size = skip ? skip->Shape().Size() : prepacked_skip_fp32_size_; - AllocatorPtr alloc; - ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); - - IAllocatorUniquePtr output_fp32; - IAllocatorUniquePtr gamma_fp32; - IAllocatorUniquePtr beta_fp32; - IAllocatorUniquePtr bias_fp32; - if constexpr (std::is_same_v) { + const size_t total_data_size = static_cast(input->Shape().Size()); + + AllocatorPtr alloc; + ORT_RETURN_IF_ERROR(p_ctx->GetTempSpaceAllocator(&alloc)); + + IAllocatorUniquePtr input_fp32; + IAllocatorUniquePtr output_fp32; + IAllocatorUniquePtr skip_input_bias_add_output_fp32; + IAllocatorUniquePtr skip_fp32; + IAllocatorUniquePtr gamma_fp32; + IAllocatorUniquePtr beta_fp32; + IAllocatorUniquePtr bias_fp32; + + const float* input_data_f = nullptr; + const float* skip_data_f = nullptr; + const float* gamma_data_f = nullptr; + const float* beta_data_f = nullptr; + const float* bias_data_f = nullptr; + float* output_data_f = nullptr; + float* skip_input_bias_add_output_data_f = nullptr; + const size_t num_elems = static_cast(hidden_size); - output_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); + input_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); + MlasConvertHalfToFloatBuffer(input_data, input_fp32.get(), total_data_size); + input_data_f = input_fp32.get(); + + output_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); + output_data_f = output_fp32.get(); + + skip_input_bias_add_output_fp32 = IAllocator::MakeUniquePtr(alloc, total_data_size); + skip_input_bias_add_output_data_f = skip_input_bias_add_output_fp32.get(); - if (prepacked_gamma_fp32_data_ == nullptr && gamma_data) { + if (skip_data) { + skip_fp32 = IAllocator::MakeUniquePtr(alloc, static_cast(skip_size)); + MlasConvertHalfToFloatBuffer(skip_data, skip_fp32.get(), static_cast(skip_size)); + skip_data_f = skip_fp32.get(); + } else if (prepacked_skip_fp32_data_) { + skip_data_f = prepacked_skip_fp32_data_.get(); + } + + if (gamma_data) { gamma_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(gamma_data, gamma_fp32.get(), num_elems); + gamma_data_f = gamma_fp32.get(); + } else if (prepacked_gamma_fp32_data_) { + gamma_data_f = prepacked_gamma_fp32_data_.get(); } - if (prepacked_beta_fp32_data_ == nullptr && beta_data) { + if (beta_data) { beta_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(beta_data, beta_fp32.get(), num_elems); + beta_data_f = beta_fp32.get(); + } else if (prepacked_beta_fp32_data_) { + beta_data_f = prepacked_beta_fp32_data_.get(); } - if (prepacked_bias_fp32_data_ == nullptr && bias_data) { + if (bias_data) { bias_fp32 = IAllocator::MakeUniquePtr(alloc, num_elems); MlasConvertHalfToFloatBuffer(bias_data, bias_fp32.get(), num_elems); + bias_data_f = bias_fp32.get(); + } else if (prepacked_bias_fp32_data_) { + bias_data_f = prepacked_bias_fp32_data_.get(); } - } - concurrency::ThreadPool::TryBatchParallelFor( - p_ctx->GetOperatorThreadPool(), static_cast(task_count), - [&](ptrdiff_t task_idx) { - if constexpr (std::is_same_v) { - ComputeJob(input_data, skip_data, - prepacked_skip_fp32_data_.get(), - prepacked_gamma_fp32_data_ ? prepacked_gamma_fp32_data_.get() : gamma_fp32.get(), - prepacked_beta_fp32_data_ ? prepacked_beta_fp32_data_.get() : beta_fp32.get(), - prepacked_bias_fp32_data_ ? prepacked_bias_fp32_data_.get() : bias_fp32.get(), - output_fp32.get(), - task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, - skip_input_bias_add_output_data, alloc); - } else { + concurrency::ThreadPool::TryBatchParallelFor( + p_ctx->GetOperatorThreadPool(), static_cast(task_count), + [&](ptrdiff_t task_idx) { + ComputeJob(input_data_f, skip_data_f, gamma_data_f, beta_data_f, bias_data_f, task_idx, hidden_size, skip_size, + epsilon_, simplified, output_data_f, skip_input_bias_add_output_data_f); + }, + 0); + MlasConvertFloatToHalfBuffer(output_data_f, output_data, total_data_size); + if (skip_input_bias_add_output_data != nullptr) + MlasConvertFloatToHalfBuffer(skip_input_bias_add_output_data_f, skip_input_bias_add_output_data, total_data_size); + } else { + concurrency::ThreadPool::TryBatchParallelFor( + p_ctx->GetOperatorThreadPool(), static_cast(task_count), + [&](ptrdiff_t task_idx) { ComputeJob(input_data, skip_data, gamma_data, beta_data, bias_data, task_idx, hidden_size, skip_size, epsilon_, simplified, output_data, skip_input_bias_add_output_data); - } - }, - 0); + }, + 0); + } return Status::OK(); } @@ -290,16 +254,22 @@ template Status SkipLayerNorm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, bool& is_packed, PrePackedWeights* prepacked_weights) { ORT_UNUSED_PARAMETER(prepacked_weights); - is_packed = false; if (input_idx == 1) { // skip prepacked_skip_fp32_size_ = tensor.Shape().Size(); ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_skip_fp32_data_, is_packed); } else if (input_idx == 2) { // gamma ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_gamma_fp32_data_, is_packed); - } else if (input_idx == 3) { // beta - ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_beta_fp32_data_, is_packed); + } else if (input_idx == 3) { + if constexpr (simplified) { + // bias + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); + } else { + // beta + ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_beta_fp32_data_, is_packed); + } } else if (input_idx == 4) { // bias + ORT_ENFORCE(!simplified, "SkipSimplifiedLayerNormalization should only has 4 inputs (input, skip, gamma, and beta). Got 5."); ConvertMLFloat16ToFloatIfNeeded(tensor, alloc, prepacked_bias_fp32_data_, is_packed); } From 135d8b2beb1232acec600f79e184fb8774577e7e Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Mon, 18 Nov 2024 10:46:23 +0800 Subject: [PATCH 10/28] Fix CUDA/DML package exception caused by ENABLE_CUDA_NHWC_OPS (#22851) ### Description Now, ENABLE_CUDA_NHWC_OPS is enabled by default. It adds a new chance to create cuda provider while both cuda/dml are enabled ### Motivation and Context --- onnxruntime/test/lora/lora_test.cc | 10 ++++++++++ onnxruntime/test/util/default_providers.cc | 6 ++++++ .../github/azure-pipelines/templates/win-ci.yml | 1 + 3 files changed, 17 insertions(+) diff --git a/onnxruntime/test/lora/lora_test.cc b/onnxruntime/test/lora/lora_test.cc index e8291a36447ca..9d8febb453739 100644 --- a/onnxruntime/test/lora/lora_test.cc +++ b/onnxruntime/test/lora/lora_test.cc @@ -201,6 +201,16 @@ TEST(LoraAdapterTest, Load) { #ifdef USE_CUDA TEST(LoraAdapterTest, VerifyDeviceCopy) { + // These checks for CUDA/DML combined Package, Be careful when you want to remove it! + if (DefaultCudaExecutionProvider() == nullptr) { + GTEST_SKIP() << "Skip This Test Due to this EP is null"; + } +#ifdef USE_DML + if (DefaultDmlExecutionProvider() != nullptr) { + GTEST_FAIL() << "It should not run with DML EP"; + } +#endif + auto cpu_ep = DefaultCpuExecutionProvider(); auto cpu_allocator = cpu_ep->CreatePreferredAllocators()[0]; auto cuda_ep = DefaultCudaExecutionProvider(); diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index 62bdedd833025..cb53db42304be 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -140,6 +140,12 @@ std::unique_ptr DefaultCudaExecutionProvider() { #ifdef ENABLE_CUDA_NHWC_OPS std::unique_ptr DefaultCudaNHWCExecutionProvider() { #if defined(USE_CUDA) +#ifdef USE_DML + const std::string no_cuda_ep_test = Env::Default().GetEnvironmentVar("NO_CUDA_TEST"); + if (no_cuda_ep_test == "1") { + return nullptr; + } +#endif OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; provider_options.use_tf32 = false; diff --git a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml index b03055d5192a8..2816352b1f189 100644 --- a/tools/ci_build/github/azure-pipelines/templates/win-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/win-ci.yml @@ -413,6 +413,7 @@ stages: workingDirectory: '$(Build.BinariesDirectory)' env: NO_CUDA_TEST: '1' + GTEST_FILTER: -*CudaNhwcTypedTest* - task: PythonScript@0 displayName: 'test excludes DML' condition: and(succeeded(), eq('${{ parameters.runTests}}', true)) From 02a0be3599fd5a517169fca10a0f105e21070dab Mon Sep 17 00:00:00 2001 From: Yi-Hong Lyu Date: Mon, 18 Nov 2024 06:58:21 -0800 Subject: [PATCH 11/28] Optimize Transpose around QLinearSoftmax (#22849) ### Description - Improved Transpose around QLinearSoftmax in Level 3 NHWC Transformer. - Removed redundant code HandleQLinearConcat, HandleQLinearBinaryOp. ### Motivation and Context By merging and eliminating redundant transpose , the Image Segmentation i8 model (MobileNetv2 + DeepLabv3) achieves a 2.34X speedup. --- .../onnx_transpose_optimization.cc | 4 +- .../onnx_transpose_optimization.h | 3 ++ .../ort_transpose_optimization.cc | 14 ++----- .../optimizer/transpose_optimizer_test.cc | 41 +++++++++++++++++++ 4 files changed, 50 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc index 81a842eb87db1..10cb6eb97bdd6 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.cc @@ -1654,14 +1654,14 @@ static bool HandleSplit(HandlerArgs& args) { constexpr HandlerInfo split_handler = {&FirstInput, &HandleSplit}; -static bool HandleConcat(HandlerArgs& args) { +bool HandleConcat(HandlerArgs& args) { return HandleSimpleNodeWithAxis(args); } constexpr HandlerInfo concat_handler = {&AllInputs, &HandleConcat}; // Handles Softmax, Hardmax, and LogSoftmax -static bool HandleSoftHardMax(HandlerArgs& args) { +bool HandleSoftHardMax(HandlerArgs& args) { if (args.ctx.opset >= 13) { return HandleSimpleNodeWithAxis(args, /*default_axis*/ -1); } diff --git a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.h b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.h index 0095ead75f0c8..f65bd6aa82fbb 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.h +++ b/onnxruntime/core/optimizer/transpose_optimization/onnx_transpose_optimization.h @@ -71,6 +71,9 @@ bool HandleSimpleNodeBroadcast(HandlerArgs& args); // Transposes all inputs and all outputs. Updates axis attribute. bool HandleSimpleNodeWithAxis(HandlerArgs& args, std::optional default_axis = std::nullopt); +bool HandleConcat(HandlerArgs& args); +bool HandleSoftHardMax(HandlerArgs& args); + // base handlers that are used by extended handlers. add from transpose_optimizer.cc as needed. bool HandleReduceOps(HandlerArgs& args); bool HandleResize([[maybe_unused]] HandlerArgs& args); diff --git a/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc b/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc index 8eaac3d34c3af..824ab20a84668 100644 --- a/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc +++ b/onnxruntime/core/optimizer/transpose_optimization/ort_transpose_optimization.cc @@ -34,10 +34,6 @@ static bool EPAwareHandleResize(HandlerArgs& args) { constexpr HandlerInfo ep_aware_resize_handler = {&FirstInput, &EPAwareHandleResize}; -static bool HandleQLinearConcat(HandlerArgs& args) { - return HandleSimpleNodeWithAxis(args); -} - std::vector QLinearConcatInputs(OptimizerCtx& ctx, api::NodeRef& node) { (void)ctx; std::vector indices; @@ -48,11 +44,7 @@ std::vector QLinearConcatInputs(OptimizerCtx& ctx, api::NodeRef& node) { return indices; } -constexpr HandlerInfo q_linear_concat_handler = {&QLinearConcatInputs, &HandleQLinearConcat}; - -static bool HandleQLinearBinaryOp(HandlerArgs& args) { - return HandleSimpleNodeBroadcast(args); -} +constexpr HandlerInfo q_linear_concat_handler = {&QLinearConcatInputs, &HandleConcat}; std::vector QLinearBinaryOpInputs(OptimizerCtx&, api::NodeRef&) { // Inputs are: [A, A_scale, A_zero_point, B, B_scale, B_zero_point, C_scale, C_zero_point], @@ -60,7 +52,7 @@ std::vector QLinearBinaryOpInputs(OptimizerCtx&, api::NodeRef&) { return {0, 3}; } -constexpr HandlerInfo q_linear_binary_op_handler = {&QLinearBinaryOpInputs, &HandleQLinearBinaryOp}; +constexpr HandlerInfo q_linear_binary_op_handler = {&QLinearBinaryOpInputs, &HandleSimpleNodeBroadcast}; static bool HandleQLinearPoolOp(HandlerArgs& args) { // Swap between channel first/last variants. Only works for applicable values of perm. @@ -129,6 +121,7 @@ constexpr HandlerInfo max_pool_op_handler = {&FirstInput, &HandleMaxPool}; constexpr HandlerInfo node_1_inp_handler = {&FirstInput, &HandleSimpleNode}; constexpr HandlerInfo reduce_op_handler = {&FirstInput, &HandleReduceOps}; +constexpr HandlerInfo soft_hard_max_handler = {&FirstInput, &HandleSoftHardMax}; constexpr HandlerInfo contrib_quantize_dequantize_linear_handler = {&FirstInput, &HandleContribQuantizeDequantizeLinear}; @@ -148,6 +141,7 @@ const HandlerMap& OrtExtendedHandlers() { {"com.microsoft.QLinearMul", q_linear_binary_op_handler}, {"com.microsoft.QLinearReduceMean", reduce_op_handler}, {"com.microsoft.QLinearSigmoid", node_1_inp_handler}, + {"com.microsoft.QLinearSoftmax", soft_hard_max_handler}, }; return map; diff --git a/onnxruntime/test/optimizer/transpose_optimizer_test.cc b/onnxruntime/test/optimizer/transpose_optimizer_test.cc index 35ba1a3369597..f6fce37322c10 100644 --- a/onnxruntime/test/optimizer/transpose_optimizer_test.cc +++ b/onnxruntime/test/optimizer/transpose_optimizer_test.cc @@ -22,6 +22,7 @@ #include "test/optimizer/graph_transform_test_builder.h" #include "test/providers/internal_testing/internal_testing_execution_provider.h" #include "test/util/include/asserts.h" +#include "test/util/include/default_providers.h" #include "test/util/include/inference_session_wrapper.h" #include "test/util/include/test_utils.h" @@ -3800,6 +3801,46 @@ TEST(TransposeOptimizerTests, TestCast) { /*opset_version*/ {15, 18}); } +TEST(TransposeOptimizerTests, TestQLinearSoftmax) { + auto build_test_case_1 = [&](ModelTestBuilder& builder) { + auto* input0_arg = MakeInput(builder, std::nullopt, {1, 384, 384, 21}, 0, 255); + auto* transpose_1_out_0 = builder.MakeIntermediate(); + auto* input_x_scale = builder.MakeScalarInitializer(0.5086354613304138); + auto* input_x_zero_point = builder.MakeScalarInitializer(74); + auto* input_y_scale = builder.MakeScalarInitializer(0.003921568859368563); + auto* input_y_zero_point = builder.MakeScalarInitializer(0); + auto* qlinearsoftmax_1_out_0 = builder.MakeIntermediate(); + auto* transpose_2_out_0 = builder.MakeOutput(); + + auto& transpose_1 = builder.AddNode("Transpose", {input0_arg}, {transpose_1_out_0}); + transpose_1.AddAttribute("perm", std::vector{0, 3, 1, 2}); + auto& qlinearsoftmax_1 = builder.AddNode("QLinearSoftmax", + {transpose_1_out_0, input_x_scale, input_x_zero_point, input_y_scale, input_y_zero_point}, + {qlinearsoftmax_1_out_0}, kMSDomain); + qlinearsoftmax_1.AddAttribute("axis", static_cast(1)); + qlinearsoftmax_1.AddAttribute("opset", static_cast(13)); + auto& transpose_2 = builder.AddNode("Transpose", {qlinearsoftmax_1_out_0}, {transpose_2_out_0}); + transpose_2.AddAttribute("perm", std::vector{0, 2, 3, 1}); + }; + + auto check_optimized_graph_1 = [&](InferenceSessionWrapper& session) { + int transpose_cost = EstimateTransposeCost(session.GetGraph()); + EXPECT_EQ(transpose_cost, 0); + }; + + TransformerTester(build_test_case_1, + check_optimized_graph_1, + TransformerLevel::Level2, + TransformerLevel::Level3, + /*opset_version*/ 13, + /*per_sample_tolerance*/ 0.0, + /*relative_per_sample_tolerance*/ 0.0, + /*transformer*/ nullptr, + /*add_session_options*/ {}, + /*disabled_optimizers*/ {}, + /*ep*/ DefaultCpuExecutionProvider()); +} + TEST(TransposeOptimizerTests, TestBroadcastReusedInputs) { auto build_test_case_1 = [&](ModelTestBuilder& builder) { auto* input0_arg = MakeInput(builder, {{-1, -1, 3, 4}}, {1, 2, 3, 4}, 0.0, 1.0); From c4f3742bb456a33ee9c826ce4e6939f8b84ce5b0 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 18 Nov 2024 09:16:41 -0800 Subject: [PATCH 12/28] Replace INFINITY by std::numeric_limits::infinity() (#22868) Replace INFINITY by `std::numeric_limits::infinity()` to avoid build errors with Visual Studio 2022 v17.12 Preview 5 ### Motivation and Context https://github.com/microsoft/onnxruntime/issues/22728 --- .../cuda/bert/flash_attention/flash_fwd_kernel.h | 12 ++++++------ .../contrib_ops/cuda/bert/flash_attention/mask.h | 15 ++++++++------- .../cuda/bert/flash_attention/softmax.h | 13 +++++++++---- .../cuda/bert/lean_attention/lean_fwd_kernel.h | 10 ++++++---- .../contrib_ops/cuda/bert/lean_attention/mask.h | 16 ++++++++-------- .../cuda/bert/lean_attention/softmax.h | 14 +++++++++----- .../cuda/bert/ngram_repeat_block_impl.cu | 4 ++-- .../core/optimizer/attention_fusion_helper.h | 4 +++- .../core/providers/xnnpack/detail/utils.cc | 5 +++-- onnxruntime/core/providers/xnnpack/math/gemm.cc | 7 +++++-- .../core/providers/xnnpack/math/matmul.cc | 5 +++-- .../core/providers/xnnpack/nn/average_pool.cc | 4 ++-- .../core/providers/xnnpack/nn/conv_base.cc | 4 ++-- .../core/providers/xnnpack/nn/max_pool.cc | 6 ++++-- 14 files changed, 70 insertions(+), 49 deletions(-) diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h index e961bab399326..d46d9597a758f 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_kernel.h @@ -98,7 +98,7 @@ inline __device__ void compute_attn_1rowblock(const Params& params, const int bi for (int m = 0; m < size<1>(tOgO); ++m) { const int row = get<0>(tOcO(0, m, 0)); if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) { - gLSE(row) = INFINITY; + gLSE(row) = std::numeric_limits::infinity(); } } return; @@ -499,7 +499,7 @@ inline __device__ void compute_attn_1rowblock_splitkv(const Params& params, cons for (int m = 0; m < size<1>(tOgOaccum); ++m) { const int row = get<0>(tOcO(0, m, 0)); if (row < binfo.actual_seqlen_q - m_block * kBlockM && get<1>(tOcO(0, m, 0)) == 0) { - gLSEaccum(row) = Split ? -INFINITY : INFINITY; + gLSEaccum(row) = Split ? -std::numeric_limits::infinity() : std::numeric_limits::infinity(); } } return; @@ -1061,7 +1061,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) { for (int l = 0; l < kNLsePerThread; ++l) { const int row = l * kRowsPerLoadLSE + tidx / kBlockM; const int col = tidx % kBlockM; - ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -INFINITY; + ElementAccum lse = (row < params.num_splits && col < params.b * params.h * params.seqlen_q - bidx * kBlockM) ? gLSEaccum(row, col) : -std::numeric_limits::infinity(); if (row < kMaxSplits) { sLSE[row][col] = lse; } @@ -1082,7 +1082,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) { for (int l = 0; l < kNLsePerThread; ++l) { const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose; const int col = tidx / kRowsPerLoadTranspose; - lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -INFINITY; + lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE[row][col] : -std::numeric_limits::infinity(); // if (bidx == 0 && tidx < 32) { printf("tidx = %d, row = %d, col = %d, lse = %f\n", tidx, row, col, lse_accum(l)); } } @@ -1094,7 +1094,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) { } MaxOp max_op; lse_max = Allreduce::run(lse_max, max_op); - lse_max = lse_max == -INFINITY ? 0.0f : lse_max; // In case all local LSEs are -inf + lse_max = lse_max == -std::numeric_limits::infinity() ? 0.0f : lse_max; // In case all local LSEs are -inf float lse_sum = expf(lse_accum(0) - lse_max); #pragma unroll for (int l = 1; l < kNLsePerThread; ++l) { @@ -1104,7 +1104,7 @@ inline __device__ void combine_attn_seqk_parallel(const Params& params) { lse_sum = Allreduce::run(lse_sum, sum_op); // For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise // lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum. - ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max; + ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? std::numeric_limits::infinity() : logf(lse_sum) + lse_max; // if (bidx == 0 && tidx < 32) { printf("tidx = %d, lse = %f, lse_max = %f, lse_logsum = %f\n", tidx, lse_accum(0), lse_max, lse_logsum); } if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) { gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum; diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h index 0998155eba635..71434002f8df1 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/mask.h @@ -4,6 +4,7 @@ #pragma once +#include #include namespace onnxruntime { @@ -28,7 +29,7 @@ __forceinline__ __device__ void apply_mask(Tensor& tensor, const // Without the "make_coord" we get wrong results #pragma unroll for (int mi = 0; mi < size<0>(tensor); ++mi) { - tensor(mi, make_coord(j, nj)) = -INFINITY; + tensor(mi, make_coord(j, nj)) = -std::numeric_limits::infinity(); } } } @@ -59,7 +60,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor& tensor, for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) { - tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits::infinity(); } } } @@ -96,7 +97,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx( #pragma unroll for (int ni = 0; ni < size<1, 1>(tensor); ++ni) { if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) { - tensor(mi, ni) = -INFINITY; + tensor(mi, ni) = -std::numeric_limits::infinity(); } } // if (cute::thread0()) { @@ -151,7 +152,7 @@ struct Mask { } if constexpr (!Is_even_MN) { if (col_idx >= max_seqlen_k) { - tensor(mi, make_coord(j, nj)) = -INFINITY; + tensor(mi, make_coord(j, nj)) = -std::numeric_limits::infinity(); } } } @@ -181,18 +182,18 @@ struct Mask { } if constexpr (Causal_mask) { if (col_idx >= col_idx_limit_right) { - tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits::infinity(); } } if constexpr (Is_local) { if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) { - tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits::infinity(); } } if constexpr (!Causal_mask && !Is_local && !Is_even_MN) { // Causal and Local already handles MN masking if (col_idx >= max_seqlen_k) { - tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits::infinity(); } } } diff --git a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h index 7e0095cb39bd9..7fe506e01a9b9 100644 --- a/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h +++ b/onnxruntime/contrib_ops/cuda/bert/flash_attention/softmax.h @@ -4,6 +4,7 @@ #pragma once #include +#include #include @@ -71,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor& tenso // If max is -inf, then all elements must have been -inf (possibly due to masking). // We don't want (-inf - (-inf)) since that would give NaN. // If we don't have float around M_LOG2E the multiplication is done in fp64. - const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E)); + const float max_scaled = max(mi) == -std::numeric_limits::infinity() + ? 0.f + : max(mi) * (Scale_max ? scale : float(M_LOG2E)); #pragma unroll for (int ni = 0; ni < size<1>(tensor); ++ni) { // Instead of computing exp(x - max), we compute exp2(x * log_2(e) - @@ -99,7 +102,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor& ten max(mi) = Allreduce<4>::run(max(mi), max_op); // If max is -inf, then all elements must have been -inf (possibly due to masking). // We don't want (-inf - (-inf)) since that would give NaN. - const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale; + const float max_scaled = max(mi) == -std::numeric_limits::infinity() ? 0.f : max(mi) * scale; sum(mi) = 0; #pragma unroll for (int ni = 0; ni < size<1>(tensor); ++ni) { @@ -143,7 +146,7 @@ struct Softmax { for (int mi = 0; mi < size(row_max); ++mi) { float scores_max_cur = !Check_inf ? row_max(mi) - : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi)); + : (row_max(mi) == -std::numeric_limits::infinity() ? 0.0f : row_max(mi)); float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2); row_sum(mi) *= scores_scale; #pragma unroll @@ -169,7 +172,9 @@ struct Softmax { for (int mi = 0; mi < size<0>(acc_o_rowcol); ++mi) { float sum = smooth_softmax ? row_sum(mi) + expf(-row_max(mi) * softmax_scale) : row_sum(mi); float inv_sum = (sum == 0.f || sum != sum) ? 1.f : 1.f / sum; - lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum); + lse(mi) = (sum == 0.f || sum != sum) + ? (Split ? -std::numeric_limits::infinity() : std::numeric_limits::infinity()) + : row_max(mi) * softmax_scale + __logf(sum); float scale = inv_sum; #pragma unroll for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) { diff --git a/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h b/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h index 5be69ea0af55c..bd54b404420e5 100644 --- a/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h +++ b/onnxruntime/contrib_ops/cuda/bert/lean_attention/lean_fwd_kernel.h @@ -825,7 +825,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i const int row = l * kRowsPerLoadLSE + tidx / kBlockM; const int col = tidx % kBlockM; // We skip the first row = 0, as we already populated it in shared memory. - ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -INFINITY; + ElementAccum lse = (row > 0 && row < total_splits && col < params.b * params.h * (index_t)params.seqlen_q - row_offset_lseaccum) ? gLSEaccumRead(row, col) : -std::numeric_limits::infinity(); if (row > 0 && row < kMaxSplits) { sLSE(row, col) = lse; @@ -857,7 +857,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i for (int l = 0; l < kNLsePerThread; ++l) { const int row = l * kRowsPerLoadTranspose + tidx % kRowsPerLoadTranspose; const int col = tidx / kRowsPerLoadTranspose; - lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -INFINITY; + lse_accum(l) = (row < kMaxSplits && col < kBlockM) ? sLSE(row, col) : -std::numeric_limits::infinity(); #if defined(DEBUG_LEAN_ATTENTION) if (threadIdx.x == 0 && blockIdx.z == tracing_block) { @@ -874,7 +874,7 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i } MaxOp max_op; lse_max = Allreduce::run(lse_max, max_op); - lse_max = lse_max == -INFINITY ? 0.0f : lse_max; // In case all local LSEs are -inf + lse_max = lse_max == -std::numeric_limits::infinity() ? 0.0f : lse_max; // In case all local LSEs are -inf float lse_sum = expf(lse_accum(0) - lse_max); #pragma unroll for (int l = 1; l < kNLsePerThread; ++l) { @@ -884,7 +884,9 @@ inline __device__ void lean_compute_attn_impl_ver3(const Params& params, const i lse_sum = Allreduce::run(lse_sum, sum_op); // For the case where all local lse == -INFINITY, we want to set lse_logsum to INFINITY. Otherwise // lse_logsum is log(0.0) = -INFINITY and we get NaN when we do lse_accum(l) - lse_logsum. - ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) ? INFINITY : logf(lse_sum) + lse_max; + ElementAccum lse_logsum = (lse_sum == 0.f || lse_sum != lse_sum) + ? std::numeric_limits::infinity() + : logf(lse_sum) + lse_max; // if (tidx % kRowsPerLoadTranspose == 0 && tidx / kRowsPerLoadTranspose < kBlockM) { gLSE(tidx / kRowsPerLoadTranspose) = lse_logsum; } // Store the scales exp(lse - lse_logsum) in shared memory. #pragma unroll diff --git a/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h b/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h index d63c80b012de6..2d33418d69667 100644 --- a/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h +++ b/onnxruntime/contrib_ops/cuda/bert/lean_attention/mask.h @@ -3,7 +3,7 @@ ******************************************************************************/ #pragma once - +#include #include namespace onnxruntime { @@ -28,7 +28,7 @@ __forceinline__ __device__ void apply_mask(Tensor& tensor, const // Without the "make_coord" we get wrong results #pragma unroll for (int mi = 0; mi < size<0>(tensor); ++mi) { - tensor(mi, make_coord(j, nj)) = -INFINITY; + tensor(mi, make_coord(j, nj)) = -std::numeric_limits::infinity(); } } } @@ -59,7 +59,7 @@ __forceinline__ __device__ void apply_mask_local(Tensor& tensor, for (int j = 0; j < size<1, 0>(tensor); ++j) { const int col_idx = col_idx_base + j; if (col_idx >= col_idx_limit_right || (HasWSLeft && col_idx < col_idx_limit_left)) { - tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits::infinity(); } } } @@ -96,7 +96,7 @@ __forceinline__ __device__ void apply_mask_causal_w_idx( #pragma unroll for (int ni = 0; ni < size<1, 1>(tensor); ++ni) { if (col_idx_offset_ + get<1>(idx_rowcol(0, ni)) >= col_idx_limit) { - tensor(mi, ni) = -INFINITY; + tensor(mi, ni) = -std::numeric_limits::infinity(); } } // if (cute::thread0()) { @@ -152,7 +152,7 @@ struct Mask { } if constexpr (!Is_even_MN) { if (col_idx >= max_seqlen_k) { - tensor(mi, make_coord(j, nj)) = -INFINITY; + tensor(mi, make_coord(j, nj)) = -std::numeric_limits::infinity(); } } } @@ -182,18 +182,18 @@ struct Mask { } if constexpr (Causal_mask) { if (col_idx >= col_idx_limit_right) { - tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits::infinity(); } } if constexpr (Is_local) { if (col_idx >= col_idx_limit_right || col_idx < col_idx_limit_left) { - tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits::infinity(); } } if constexpr (!Causal_mask && !Is_local && !Is_even_MN) { // Causal and Local already handles MN masking if (col_idx >= max_seqlen_k) { - tensor(make_coord(i, mi), make_coord(j, nj)) = -INFINITY; + tensor(make_coord(i, mi), make_coord(j, nj)) = -std::numeric_limits::infinity(); } } } diff --git a/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h b/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h index ad66389848e6e..0b6ffb3f1985a 100644 --- a/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h +++ b/onnxruntime/contrib_ops/cuda/bert/lean_attention/softmax.h @@ -3,7 +3,7 @@ ******************************************************************************/ #pragma once - +#include #include #include @@ -72,7 +72,9 @@ __forceinline__ __device__ void scale_apply_exp2(Tensor& tenso // If max is -inf, then all elements must have been -inf (possibly due to masking). // We don't want (-inf - (-inf)) since that would give NaN. // If we don't have float around M_LOG2E the multiplication is done in fp64. - const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * (Scale_max ? scale : float(M_LOG2E)); + const float max_scaled = max(mi) == -std::numeric_limits::infinity() + ? 0.f + : max(mi) * (Scale_max ? scale : float(M_LOG2E)); #pragma unroll for (int ni = 0; ni < size<1>(tensor); ++ni) { // Instead of computing exp(x - max), we compute exp2(x * log_2(e) - @@ -107,7 +109,7 @@ __forceinline__ __device__ void max_scale_exp2_sum(Tensor& ten max(mi) = Allreduce<4>::run(max(mi), max_op); // If max is -inf, then all elements must have been -inf (possibly due to masking). // We don't want (-inf - (-inf)) since that would give NaN. - const float max_scaled = max(mi) == -INFINITY ? 0.f : max(mi) * scale; + const float max_scaled = max(mi) == -std::numeric_limits::infinity() ? 0.f : max(mi) * scale; sum(mi) = 0; #pragma unroll for (int ni = 0; ni < size<1>(tensor); ++ni) { @@ -151,7 +153,7 @@ struct Softmax { for (int mi = 0; mi < size(row_max); ++mi) { float scores_max_cur = !Check_inf ? row_max(mi) - : (row_max(mi) == -INFINITY ? 0.0f : row_max(mi)); + : (row_max(mi) == -std::numeric_limits::infinity() ? 0.0f : row_max(mi)); float scores_scale = exp2f((scores_max_prev(mi) - scores_max_cur) * softmax_scale_log2); row_sum(mi) *= scores_scale; #pragma unroll @@ -181,7 +183,9 @@ struct Softmax { // printf("sum: %f, inv_sum: %f\n", sum, inv_sum); // printf("mi %d row_max %f softmax_scale %f\n", mi, row_max(mi), softmax_scale); // } - lse(mi) = (sum == 0.f || sum != sum) ? (Split ? -INFINITY : INFINITY) : row_max(mi) * softmax_scale + __logf(sum); + lse(mi) = (sum == 0.f || sum != sum) + ? (Split ? -std::numeric_limits::infinity() : std::numeric_limits::infinity()) + : row_max(mi) * softmax_scale + __logf(sum); float scale = !Is_dropout ? inv_sum : inv_sum * rp_dropout; #pragma unroll for (int ni = 0; ni < size<1>(acc_o_rowcol); ++ni) { diff --git a/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu b/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu index 8a04ede231a27..ab809d12a89ad 100644 --- a/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu +++ b/onnxruntime/contrib_ops/cuda/bert/ngram_repeat_block_impl.cu @@ -6,7 +6,7 @@ Licensed under the MIT License. /* Kernel implementation for blocking repeated n-grams. */ - +#include #include "core/providers/cuda/cu_inc/common.cuh" #include "contrib_ops/cuda/bert/ngram_repeat_block_impl.h" @@ -48,7 +48,7 @@ __global__ void banRepeatedTokens(const int64_t* __restrict__ tokens, } if (is_banned == true) { auto token_to_be_banned = tokens_shm[col + no_repeat_ngram_size - 1]; - lprobs[lprob_start + token_to_be_banned] = -INFINITY; + lprobs[lprob_start + token_to_be_banned] = -std::numeric_limits::infinity(); } } diff --git a/onnxruntime/core/optimizer/attention_fusion_helper.h b/onnxruntime/core/optimizer/attention_fusion_helper.h index 267a82b72670c..935114c40d1a7 100644 --- a/onnxruntime/core/optimizer/attention_fusion_helper.h +++ b/onnxruntime/core/optimizer/attention_fusion_helper.h @@ -1,5 +1,6 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include #include "onnx/defs/shape_inference.h" #include "onnx/defs/tensor_proto_util.h" #include "core/framework/tensorprotoutils.h" @@ -767,7 +768,8 @@ bool MatchInputMaskSubgraph(const Graph& graph, const Node& layer_norm, const No } // check where has X=-Infinity - if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]), -INFINITY, true)) { + if (!optimizer_utils::IsInitializerWithExpectedValue(graph, *(where.InputDefs()[1]), + -std::numeric_limits::infinity(), true)) { DEBUG_LOG("where const not matched."); return false; } diff --git a/onnxruntime/core/providers/xnnpack/detail/utils.cc b/onnxruntime/core/providers/xnnpack/detail/utils.cc index 4eef14dddecd3..2adf8339b4b66 100644 --- a/onnxruntime/core/providers/xnnpack/detail/utils.cc +++ b/onnxruntime/core/providers/xnnpack/detail/utils.cc @@ -5,6 +5,7 @@ #include #include #include +#include #include "core/common/common.h" #include "core/common/safeint.h" @@ -239,8 +240,8 @@ std::unique_ptr FuseActivation(const NodeUnit& node_un def.attributes = node_unit.GetNode().GetAttributes(); // use infinity as the default as that's what xnnpack uses if min/max are not set - float min = -INFINITY; - float max = INFINITY; + float min = -std::numeric_limits::infinity(); + float max = std::numeric_limits::infinity(); const auto& activation_type = activation.OpType(); if (activation_type == "Clip") { diff --git a/onnxruntime/core/providers/xnnpack/math/gemm.cc b/onnxruntime/core/providers/xnnpack/math/gemm.cc index 35a06cb7eb89f..a3ff3b585ae45 100644 --- a/onnxruntime/core/providers/xnnpack/math/gemm.cc +++ b/onnxruntime/core/providers/xnnpack/math/gemm.cc @@ -2,6 +2,9 @@ // Licensed under the MIT License. #include "gemm.h" + +#include + #include "core/framework/transpose_helper.h" #include "core/providers/utils.h" #include "core/providers/xnnpack/xnnpack_init.h" @@ -140,8 +143,8 @@ Status Gemm::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr, auto weights_cache = GetWeightsCache(); xnn_status status = xnn_status::xnn_status_uninitialized; struct xnn_operator* p = nullptr; - float foutput_min = clip_min_max_ ? clip_min_max_->first : -INFINITY; - float foutput_max = clip_min_max_ ? clip_min_max_->second : INFINITY; + float foutput_min = clip_min_max_ ? clip_min_max_->first : -std::numeric_limits::infinity(); + float foutput_max = clip_min_max_ ? clip_min_max_->second : std::numeric_limits::infinity(); if (op_compute_type_ == OpComputeType::op_compute_type_fp32) { const float* bias_data = nullptr; if (C_matrix_exists_) { diff --git a/onnxruntime/core/providers/xnnpack/math/matmul.cc b/onnxruntime/core/providers/xnnpack/math/matmul.cc index 44a6fb4ee835a..f574238195ffd 100644 --- a/onnxruntime/core/providers/xnnpack/math/matmul.cc +++ b/onnxruntime/core/providers/xnnpack/math/matmul.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "matmul.h" +#include #include "core/providers/cpu/math/matmul_helper.h" #include "core/providers/xnnpack/xnnpack_init.h" @@ -109,8 +110,8 @@ Status MatMul::PrePack(const Tensor& tensor, int input_idx, AllocatorPtr alloc, xnn_weights_cache_t weight_cache = nullptr; #endif - float foutput_min = -INFINITY; - float foutput_max = INFINITY; + float foutput_min = -std::numeric_limits::infinity(); + float foutput_max = std::numeric_limits::infinity(); if (op_type_ == OpComputeType::op_compute_type_fp32) { status = xnn_create_fully_connected_nc_f32( shape_broadcast[0], // size_t input_channels, diff --git a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc index 1c8ed556e90d7..1fc941d9f52f6 100644 --- a/onnxruntime/core/providers/xnnpack/nn/average_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/average_pool.cc @@ -33,8 +33,8 @@ Status CreateXnnpackKernel(const PoolAttributes& pool_attrs, if (pool_attrs.auto_pad == AutoPadType::SAME_UPPER) { flags |= XNN_FLAG_TENSORFLOW_SAME_PADDING; } - float foutput_min = clip_min_max ? clip_min_max->first : -INFINITY; - float foutput_max = clip_min_max ? clip_min_max->second : INFINITY; + float foutput_min = clip_min_max ? clip_min_max->first : -std::numeric_limits::infinity(); + float foutput_max = clip_min_max ? clip_min_max->second : std::numeric_limits::infinity(); xnn_status status = xnn_status_unsupported_parameter; if (avgpool_type == OpComputeType::op_compute_type_fp32) { status = xnn_create_average_pooling2d_nhwc_f32(input_padding_top, input_padding_right, diff --git a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc index e0723c0e7690e..458e6000c8d70 100644 --- a/onnxruntime/core/providers/xnnpack/nn/conv_base.cc +++ b/onnxruntime/core/providers/xnnpack/nn/conv_base.cc @@ -54,8 +54,8 @@ Status CreateXnnpackKernel(const ConvAttributes& conv_attrs, xnn_status status = xnn_status::xnn_status_uninitialized; p = nullptr; - float foutput_min = clip_min_max ? clip_min_max->first : -INFINITY; - float foutput_max = clip_min_max ? clip_min_max->second : INFINITY; + float foutput_min = clip_min_max ? clip_min_max->first : -std::numeric_limits::infinity(); + float foutput_max = clip_min_max ? clip_min_max->second : std::numeric_limits::infinity(); // with the following IC and OC number, we can cover depthwise and regular conv at the same time // the equation 'IC (group_input_channels) == C ' set up when group_count==1 (regular convolution) // and OC (group_output_channels) follows the same rule. diff --git a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc index 6742e51e55082..c828ae9400174 100644 --- a/onnxruntime/core/providers/xnnpack/nn/max_pool.cc +++ b/onnxruntime/core/providers/xnnpack/nn/max_pool.cc @@ -3,6 +3,8 @@ #include "max_pool.h" +#include + #include "core/graph/graph.h" #include "core/providers/utils.h" #include "core/providers/xnnpack/xnnpack_init.h" @@ -168,8 +170,8 @@ MaxPool::MaxPool(const OpKernelInfo& info) auto input_dtype = X_arg.TypeAsProto()->tensor_type().elem_type(); xnn_status status = xnn_status_invalid_state; struct xnn_operator* p = nullptr; - float foutput_min = clip_min_max_ ? clip_min_max_->first : -INFINITY; - float foutput_max = clip_min_max_ ? clip_min_max_->second : INFINITY; + float foutput_min = clip_min_max_ ? clip_min_max_->first : -std::numeric_limits::infinity(); + float foutput_max = clip_min_max_ ? clip_min_max_->second : std::numeric_limits::infinity(); if (input_dtype == ONNX_NAMESPACE::TensorProto_DataType_FLOAT) { maxpool_type_ = OpComputeType::op_compute_type_fp32; status = xnn_create_max_pooling2d_nhwc_f32(input_padding_top, input_padding_right, From e597eaed4afe255b7eda15f57a63a7b399952158 Mon Sep 17 00:00:00 2001 From: Jiajia Qin Date: Tue, 19 Nov 2024 04:52:48 +0800 Subject: [PATCH 13/28] [js/webgpu] Optimize transpose as reshape when suitable (#22870) BUG #22031 --- js/web/lib/wasm/jsep/webgpu/ops/transpose.ts | 95 ++++++++++++++++---- js/web/test/data/ops/transpose.jsonc | 24 +++++ 2 files changed, 102 insertions(+), 17 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts index 1fd99d085e0ed..21225a77b189b 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/transpose.ts @@ -48,17 +48,61 @@ const squeezeShape = (shape: readonly number[], adjustedPerm: number[]): { newSh return { newShape, newPerm }; }; +const isTransposeReshape = (perm: number[], shape: readonly number[]) => { + // As long as the dims with values > 1 stay in the same order, it's a reshape. + // Example: Shape=(1,1,1024,4096) -> perm=(2,0,3,1). + let lastPermutedAxis = 0; + for (let i = 0; i < perm.length; ++i) { + if (shape[perm[i]] === 1) { + continue; + } + if (perm[i] < lastPermutedAxis) { + return false; + } + lastPermutedAxis = perm[i]; + } + return true; +}; + export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: number[]): ProgramInfo => { const inputDataType = inputTensor.dataType; const inputRank = inputTensor.dims.length; const perm = getAdjustedPerm(inputRank, permAttr); const outputShape = getOutputShape(inputTensor.dims, perm); + let newInputShape = inputTensor.dims; + let newOutputShape = outputShape; + const transposeAsReshape = isTransposeReshape(perm, inputTensor.dims); + let getShaderSource; + if (transposeAsReshape) { + getShaderSource = (shaderHelper: ShaderHelper) => { + const input = inputVariable('input', inputDataType, newInputShape, 4); + const output = outputVariable('output', inputDataType, newOutputShape, 4); + return ` + ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)} + ${shaderHelper.mainStart()} + ${shaderHelper.guardAgainstOutOfBoundsWorkgroupSizes('uniforms.output_size')} + output[global_idx] = input[global_idx]; + }`; + }; + + return { + name: 'TransposeCopy', + shaderCache: { inputDependencies: ['type'] }, + getRunData: () => { + const outputSize = ShapeUtil.size(outputShape); + return { + outputs: [{ dims: outputShape, dataType: inputTensor.dataType }], + dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */ / 4 /* components */) }, + programUniforms: [{ type: DataType.uint32, data: Math.ceil(outputSize / 4) }], + }; + }, + getShaderSource, + }; + } const { newShape, newPerm } = squeezeShape(inputTensor.dims, perm); const channelsLast = ShapeUtil.areEqual(newPerm, [2, 3, 1]); const channelsFirst = ShapeUtil.areEqual(newPerm, [3, 1, 2]); - const useShared = (newShape.length === 2 && newPerm[0] > newPerm[1]) || channelsLast || channelsFirst; - let newInputShape = useShared ? newShape : inputTensor.dims; - let newOutputShape = outputShape; + const useShared = newShape.length === 2 || channelsLast || channelsFirst; if (useShared) { newInputShape = channelsLast ? [newShape[0], newShape[1] * newShape[2]] @@ -66,13 +110,11 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu ? [newShape[0] * newShape[1], newShape[2]] : newShape; newOutputShape = [newInputShape[1], newInputShape[0]]; - } - const input = inputVariable('a', inputDataType, newInputShape.length); - const output = outputVariable('output', inputDataType, newOutputShape.length); - const tileSize = 16; - let getShaderSource; - if (useShared) { - getShaderSource = (shaderHelper: ShaderHelper) => ` + const tileSize = 16; + getShaderSource = (shaderHelper: ShaderHelper) => { + const input = inputVariable('a', inputDataType, newInputShape.length); + const output = outputVariable('output', inputDataType, newOutputShape.length); + return ` ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)} var tile : array, ${tileSize}>; ${shaderHelper.mainStart([tileSize, tileSize, 1])} @@ -92,8 +134,29 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu ${output.setByIndices(`${output.type.indices}(output_row, output_col)`, 'tile[local_id.x][local_id.y]')} } }`; - } else { - getShaderSource = (shaderHelper: ShaderHelper) => ` + }; + return { + name: 'TransposeShared', + shaderCache: { inputDependencies: ['type'] }, + getRunData: () => { + const outputSize = ShapeUtil.size(outputShape); + return { + outputs: [{ dims: outputShape, dataType: inputTensor.dataType }], + dispatchGroup: { x: Math.ceil(newOutputShape[1] / tileSize), y: Math.ceil(newOutputShape[0] / tileSize) }, + programUniforms: [ + { type: DataType.uint32, data: outputSize }, + ...createTensorShapeVariables(newInputShape, newOutputShape), + ], + }; + }, + getShaderSource, + }; + } + + getShaderSource = (shaderHelper: ShaderHelper) => { + const input = inputVariable('a', inputDataType, newInputShape.length); + const output = outputVariable('output', inputDataType, newOutputShape.length); + return ` ${shaderHelper.registerUniform('output_size', 'u32').declareVariables(input, output)} ${permFunctionBody(perm, inputRank, input, output)} @@ -106,17 +169,15 @@ export const createTransposeProgramInfo = (inputTensor: TensorView, permAttr: nu ${output.setByOffset('global_idx', input.getByIndices('aIndices'))} }`; - } + }; return { - name: useShared ? 'TransposeShared' : 'Transpose', + name: 'Transpose', shaderCache: { hint: `${permAttr}`, inputDependencies: ['rank'] }, getRunData: () => { const outputSize = ShapeUtil.size(outputShape); return { outputs: [{ dims: outputShape, dataType: inputTensor.dataType }], - dispatchGroup: useShared - ? { x: Math.ceil(newOutputShape[1] / tileSize), y: Math.ceil(newOutputShape[0] / tileSize) } - : { x: Math.ceil(outputSize / 64 /* workgroup size */) }, + dispatchGroup: { x: Math.ceil(outputSize / 64 /* workgroup size */) }, programUniforms: [ { type: DataType.uint32, data: outputSize }, ...createTensorShapeVariables(newInputShape, newOutputShape), diff --git a/js/web/test/data/ops/transpose.jsonc b/js/web/test/data/ops/transpose.jsonc index a7265d6444118..d431ceb1712a5 100644 --- a/js/web/test/data/ops/transpose.jsonc +++ b/js/web/test/data/ops/transpose.jsonc @@ -263,6 +263,30 @@ } ] }, + { + "name": "Transpose as reshape - perms:[1, 0, 2, 4, 3]", + "operator": "Transpose", + "attributes": [{ "name": "perm", "data": [1, 0, 2, 4, 3], "type": "ints" }], + "cases": [ + { + "name": "T[3, 1, 2, 1, 4]", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], + "dims": [3, 1, 2, 1, 4], + "type": "float32" + } + ], + "outputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24], + "dims": [1, 3, 2, 4, 1], + "type": "float32" + } + ] + } + ] + }, { "name": "Transpose - perms:[1, 0]", "operator": "Transpose", From 497b06f0a9a48c3f5e6de221254f00229984bfa3 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Mon, 18 Nov 2024 20:10:36 -0800 Subject: [PATCH 14/28] [QNN EP] QNN SDK 2.28.2 (#22844) ### Description - Updates pipelines to use QNN SDK 2.28.2.241116. - Re-enable LayerNormalization unit tests that failed with accuracy errors with the previous QNN SDK (2.28.0). - Update QNN EP to no longer provide a dummy bias for LayerNorm if the QNN SDK version is >= 2.28.0. ### Motivation and Context Use the latest QNN SDK. This version improves inference latency for certain customer models. --- .../opbuilder/layer_norm_op_builder.cc | 6 ++--- .../qnn/builder/qnn_backend_manager.cc | 10 ++++++- .../test/providers/qnn/gather_op_htp_test.cc | 1 + .../test/providers/qnn/layer_norm_test.cc | 27 +++++++------------ .../test/providers/qnn/matmul_test.cpp | 2 +- .../test/providers/qnn/simple_op_htp_test.cc | 1 + ...arm64-v8a-QNN-crosscompile-ci-pipeline.yml | 2 +- .../c-api-noopenmp-packaging-pipelines.yml | 2 +- .../azure-pipelines/linux-qnn-ci-pipeline.yml | 2 +- .../azure-pipelines/py-packaging-pipeline.yml | 2 +- .../qnn-ep-nuget-packaging-pipeline.yml | 2 +- .../stages/py-cpu-packaging-stage.yml | 2 +- .../templates/jobs/download_linux_qnn_sdk.yml | 2 +- .../templates/jobs/download_win_qnn_sdk.yml | 2 +- .../templates/py-linux-qnn.yml | 2 +- .../templates/py-win-arm64-qnn.yml | 2 +- .../templates/py-win-arm64ec-qnn.yml | 2 +- .../templates/py-win-x64-qnn.yml | 2 +- .../azure-pipelines/templates/qnn-ep-win.yml | 2 +- .../win-qnn-arm64-ci-pipeline.yml | 2 +- .../azure-pipelines/win-qnn-ci-pipeline.yml | 2 +- 21 files changed, 39 insertions(+), 38 deletions(-) diff --git a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc index d089235ceaa02..d1a0e88686f39 100644 --- a/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc +++ b/onnxruntime/core/providers/qnn/builder/opbuilder/layer_norm_op_builder.cc @@ -87,10 +87,10 @@ Status LayerNormOpBuilder::ProcessInputs(QnnModelWrapper& qnn_model_wrapper, ORT_RETURN_IF_ERROR(ProcessInput(qnn_model_wrapper, inputs[BIAS_IDX], logger, input_names)); } -#if QNN_API_VERSION_MAJOR == 2 && (QNN_API_VERSION_MINOR >= 17) +#if QNN_API_VERSION_MAJOR == 2 && QNN_API_VERSION_MINOR >= 17 && QNN_API_VERSION_MINOR <= 20 if (!has_bias_input && IsNpuBackend(qnn_model_wrapper.GetQnnBackendType())) { - // Bias is implicit. QNN SDK 2.24+ (QNN API version 2.17+) has a validation bug for implicit bias inputs, - // so provide an explicit bias of all 0 (quantized int32). + // Bias is implicit. QNN SDK 2.24 to 2.27 (QNN API version 2.17 to 2.20) has a validation bug for + // implicit bias inputs, so provide an explicit bias of all 0 (quantized int32). TensorInfo x_input_info = {}; ORT_RETURN_IF_ERROR(qnn_model_wrapper.GetTensorInfo(inputs[X_IDX], x_input_info)); diff --git a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc index bfc2102bdaac2..f37c91aa0413b 100644 --- a/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc +++ b/onnxruntime/core/providers/qnn/builder/qnn_backend_manager.cc @@ -14,6 +14,7 @@ #include "DSP/QnnDspCommon.h" #include "HTP/QnnHtpCommon.h" #include "HTP/QnnHtpContext.h" +#include "Saver/QnnSaver.h" #include #include "core/framework/endian_utils.h" #include "core/common/logging/capture.h" @@ -1040,7 +1041,14 @@ Status QnnBackendManager::ExtractBackendProfilingInfo() { const QnnProfile_EventId_t* profile_events{nullptr}; uint32_t num_events{0}; Qnn_ErrorHandle_t result = qnn_interface_.profileGetEvents(profile_backend_handle_, &profile_events, &num_events); - ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile events. Error: ", QnnErrorHandleToString(result)); + if (!qnn_saver_path_.empty()) { // Using QNN Saver backend + // QNN SDK 2.28.2 returns QNN_SAVER_ERROR_DUMMY_RETVALUE, but previous QNN versions return QNN_PROFILE_NO_ERROR. + // We accept both values. + ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result && QNN_SAVER_ERROR_DUMMY_RETVALUE != result, + "Failed to get profile events. Error: ", QnnErrorHandleToString(result)); + } else { + ORT_RETURN_IF(QNN_PROFILE_NO_ERROR != result, "Failed to get profile events. Error: ", QnnErrorHandleToString(result)); + } if (num_events > 0) { LOGS(*logger_, VERBOSE) << "profile_events: " << profile_events << " num_events: " << num_events; diff --git a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc index 019d619f9be49..55177cc7ed131 100644 --- a/onnxruntime/test/providers/qnn/gather_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/gather_op_htp_test.cc @@ -132,6 +132,7 @@ TEST_F(QnnHTPBackendTests, GatherOp_IndicesDynamicInt32_Axis0) { } // disabled for QNN 2.28.0.241029 failed for accuracy validation +// Also fails on QNN 2.28.2. // qdq@QNN_EP val: 3.6094117164611816 (err: 1.3094117641448975, err/output_range: 22.19342041015625%) // qdq@CPU_EP val: 2.2905881404876709 (err: 0.0094118118286132812, err/output_range: 0.15952222049236298%) // abs(qdq@QNN_EP - qdq@CPU_EP) / output_range = 22.033897399902344% diff --git a/onnxruntime/test/providers/qnn/layer_norm_test.cc b/onnxruntime/test/providers/qnn/layer_norm_test.cc index 2773568dde717..947ac19be40a8 100644 --- a/onnxruntime/test/providers/qnn/layer_norm_test.cc +++ b/onnxruntime/test/providers/qnn/layer_norm_test.cc @@ -188,15 +188,11 @@ TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_StaticBias_AU8_WU8_B ExpectedEPNodeAssignment::All); } -// QNN 2.27 accuracy issue -// Inaccuracy detected for output 'output_0', element 0 -// output_range=1.2245157957077026, tolerance=0.40000000596046448%. -// Expected val (f32@CPU_EP): -0 -// qdq@QNN_EP val: 0.19133351743221283 (err: 0.19133351743221283, err/output_range: 15.625238418579102%) -// qdq@CPU_EP val: 0 (err: 0, err/output_range: 0%) -TEST_F(QnnHTPBackendTests, DISABLED_LayerNorm1D_QNN2_24_ImplicitBias_ValidationBug) { - // QNN 2.24 LayerNorm fails validation (intermittent) if the bias input is not provided. QNN EP will provide an - // explicit bias of all zeros to get around this bug. +TEST_F(QnnHTPBackendTests, LayerNorm1D_QNN2_24_ImplicitBias_ValidationBug) { + // QNN 2.24 to 2.27: LayerNorm fails validation (intermittent) if the bias input is not provided. QNN EP will provide + // an explicit bias of all zeros to get around this bug. + // QNN 2.28.0: Validation bug is fixed, but get accuracy errors. + // QNN 2.28.2: All fixed. for (size_t i = 0; i < 15; i++) { // Run it multiple times since this is an intermittent bug. RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 1.0f, 6)), TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), @@ -207,14 +203,9 @@ TEST_F(QnnHTPBackendTests, DISABLED_LayerNorm1D_QNN2_24_ImplicitBias_ValidationB } } -// Test accuracy of 16-bit QDQ LayerNorm with a static scale input. -// QNN 2.27 accuracy issue -// Inaccuracy detected for output 'output_0', element 0 -// output_range=1.224743127822876, tolerance=0.40000000596046448%. -// Expected val (f32@CPU_EP): -0 -// qdq@QNN_EP val: 0.19136904180049896 (err: 0.19136904180049896, err/output_range: 15.625238418579102%) -// qdq@CPU_EP val: 0 (err: 0, err/output_range: 0%) -TEST_F(QnnHTPBackendTests, DISABLED_LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { +TEST_F(QnnHTPBackendTests, LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { + // QNN 2.28.0: Get accuracy errors. + // QNN 2.28.2: All fixed. RunLayerNormQDQTest(TestInputDef({1, 2, 3}, false, GetFloatDataInRange(0.0f, 10.0f, 6)), TestInputDef({3}, true, GetFloatDataInRange(0.0f, 1.0f, 3)), // Static TestInputDef(), @@ -225,7 +216,7 @@ TEST_F(QnnHTPBackendTests, DISABLED_LayerNorm1D_LastAxis_StaticScale_AU16_WU8) { // Test accuracy of 8-bit QDQ LayerNorm with a dynamic scale input. // -// TODO(adrianlizarraga): Fails to finalize with QNN SDK 2.22. +// TODO(adrianlizarraga): Fails to finalize with QNN SDK 2.22. Still fails on QNN SDK 2.28.2. // Verbose logs: // Starting stage: Graph Transformations and Optimizations // C:\...\QNN\HTP\HTP\src\hexagon\prepare\graph_prepare.cc:203:ERROR:could not create op: q::flat_to_vtcm diff --git a/onnxruntime/test/providers/qnn/matmul_test.cpp b/onnxruntime/test/providers/qnn/matmul_test.cpp index 800457d906940..5c6967761b1db 100644 --- a/onnxruntime/test/providers/qnn/matmul_test.cpp +++ b/onnxruntime/test/providers/qnn/matmul_test.cpp @@ -273,7 +273,7 @@ TEST_F(QnnHTPBackendTests, MatMulOp_PerChannel_A16_WeightUInt4) { } // Test QDQ per-channel MatMul with int8 act, int4 weights (static) -// QNN 2.27 regression +// QNN 2.27 regression. Also fails on QNN 2.28.2. // Failed to finalize QNN graph. Error code: 1002 TEST_F(QnnHTPBackendTests, DISABLED_MatMulOp_PerChannel_AS8_WeightInt4) { std::vector input0_data = GetFloatDataInRange(-5.0f, 5.0f, 6); diff --git a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc index 05731976c453f..7541d94bac0c6 100644 --- a/onnxruntime/test/providers/qnn/simple_op_htp_test.cc +++ b/onnxruntime/test/providers/qnn/simple_op_htp_test.cc @@ -230,6 +230,7 @@ TEST_F(QnnHTPBackendTests, UnaryOp_Tanh) { } // disabled for QNN 2.28.0.241029 backendValidateOpConfig failed +// still fails on QNN 2.28.2. // QnnDsp [4294967295] has incorrect Value -32768, expected equal to 0. // QnnDsp validateNativeOps node_token_6:qti.aisw:Tanh htp op validator failed 3110 // QnnDsp registered validator failed => 3110 diff --git a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml index 20252220da8f9..c3dbee336b69d 100644 --- a/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/android-arm64-v8a-QNN-crosscompile-ci-pipeline.yml @@ -32,7 +32,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 43cd21dcfccd0..069b803f436f3 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -62,7 +62,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 resources: repositories: diff --git a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml index 03859b1548fd2..d3826d90f9073 100644 --- a/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-qnn-ci-pipeline.yml @@ -33,7 +33,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 jobs: - job: Build_QNN_EP diff --git a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml index c329e0926b046..bd33282fd494e 100644 --- a/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-packaging-pipeline.yml @@ -59,7 +59,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' - default: 2.28.0.241029 + default: 2.28.2.241116 trigger: none diff --git a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml index f2c0561368a9e..d54b8018c232a 100644 --- a/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/qnn-ep-nuget-packaging-pipeline.yml @@ -2,7 +2,7 @@ parameters: - name: QnnSdk displayName: QNN SDK Version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 - name: build_config displayName: Build Configuration diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml index c4d57a66da519..72df94c9ea672 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cpu-packaging-stage.yml @@ -59,7 +59,7 @@ parameters: - name: qnn_sdk_version type: string displayName: 'QNN SDK version. Only for QNN packages.' - default: 2.28.0.241029 + default: 2.28.2.241116 stages: - ${{ if eq(parameters.enable_windows_cpu, true) }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml index 6fdc9cb366f29..179a846509cc1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_linux_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.28.0.241029' + default: '2.28.2.241116' steps: - script: | diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml index 6b318664d1b12..9df8b249f681e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/download_win_qnn_sdk.yml @@ -1,7 +1,7 @@ parameters: - name: QnnSDKVersion type: string - default: '2.28.0.241029' + default: '2.28.2.241116' steps: - powershell: | diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml index d2ce7c84aa40d..b1cec2284df65 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-qnn.yml @@ -26,7 +26,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 jobs: - job: Linux_py_qnn_Wheels_x64 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml index 48d3849d38321..e07f0afa6109c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml index 0cf0f076c1d7a..8cc647c2464f3 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-arm64ec-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml index 4f7c886635aca..466fee92d0d5e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-x64-qnn.yml @@ -7,7 +7,7 @@ parameters: - name: QNN_SDK displayName: QNN SDK Version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 - name: ENV_SETUP_SCRIPT type: string diff --git a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml index 764599145a7e8..aa0b6bf6d391e 100644 --- a/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml +++ b/tools/ci_build/github/azure-pipelines/templates/qnn-ep-win.yml @@ -1,5 +1,5 @@ parameters: - QnnSdk: '2.28.0.241029' + QnnSdk: '2.28.2.241116' build_config: 'RelWithDebInfo' IsReleaseBuild: false DoEsrp: false diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml index 59a8dac9b1988..5c013fae6be0b 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-arm64-ci-pipeline.yml @@ -33,7 +33,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 jobs: - job: 'build' diff --git a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml index cd173347f8167..53700c58c7e7d 100644 --- a/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/win-qnn-ci-pipeline.yml @@ -33,7 +33,7 @@ parameters: - name: QnnSdk displayName: QNN SDK version type: string - default: 2.28.0.241029 + default: 2.28.2.241116 jobs: - job: 'build' From a0d36a508c323f536c267c0cff00886201fc7df9 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Mon, 18 Nov 2024 23:56:59 -0800 Subject: [PATCH 15/28] Move C# doc Github Action to Windows (#22880) ### Description Move C# doc Github Action to Windows machines, to avoid having dependency on Mono which I think is getting deprecated. ### Motivation and Context --- .github/workflows/publish-csharp-apidocs.yml | 17 +++++++++-------- csharp/ApiDocs/docfx.json | 2 +- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/.github/workflows/publish-csharp-apidocs.yml b/.github/workflows/publish-csharp-apidocs.yml index c704adb263db4..7cca0969a168b 100644 --- a/.github/workflows/publish-csharp-apidocs.yml +++ b/.github/workflows/publish-csharp-apidocs.yml @@ -20,18 +20,17 @@ permissions: jobs: build: - runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-Ubuntu2204-AMD-CPU"] + runs-on: ["self-hosted", "1ES.Pool=onnxruntime-github-vs2022-mms"] env: DOCFXVERSION: 2.62.2 steps: - uses: actions/checkout@v4 - - name: Setup .NET - uses: actions/setup-dotnet@v4 - with: - dotnet-version: 8.0.x - name: Install DocFX run: | dotnet tool update -g docfx + - name: Update PATH + run: | + Add-Content -Value "$env:USERPROFILE\.dotnet\tools" -Encoding utf8 -Path $env:GITHUB_PATH # NOTE: We need to restore Microsoft.ML.OnnxRuntime.csproj manually to set IncludeMobileTargets=false # docfx doesn't seem to be able to do that properly resulting in build errors - name: Restore dependencies @@ -50,10 +49,12 @@ jobs: - name: Log source commit run: git rev-parse --short HEAD > csharp/ApiDocs/csharp/source-version.txt - name: Move C# docs into site + shell: pwsh run: | - mkdir -p _site/docs/api - rm -rf _site/docs/api/csharp - mv csharp/ApiDocs/csharp _site/docs/api/csharp + New-Item -Path _site/docs/api -Force -ItemType "Directory" | Out-Null + $OutputDirectory="_site/docs/api/csharp" + if (Test-Path $OutputDirectory) { Remove-Item -Recurse -Force $OutputDirectory } + Move-Item -Path csharp\ApiDocs\csharp -Destination $OutputDirectory - name: Upload docs artifact uses: actions/upload-artifact@v4 with: diff --git a/csharp/ApiDocs/docfx.json b/csharp/ApiDocs/docfx.json index 0671d4aeb7d95..88a3283ad76e8 100644 --- a/csharp/ApiDocs/docfx.json +++ b/csharp/ApiDocs/docfx.json @@ -14,7 +14,7 @@ "disableDefaultFilter": false, "noRestore": true, "properties": { - "AllowUnsafeBlocks": true, + "AllowUnsafeBlocks": "true", "TargetFramework": "net8.0", "Nullable": "enable", "LangVersion": "8.0", From 56e4fda8a84a7a128a052e7ec6d2a6f09cb2a3e6 Mon Sep 17 00:00:00 2001 From: Chi Lo <54722500+chilo-ms@users.noreply.github.com> Date: Tue, 19 Nov 2024 09:08:54 -0800 Subject: [PATCH 16/28] [TensorRT EP] Revert "Add new provider option to exclude nodes from running on TRT" (#22878) - Revert https://github.com/microsoft/onnxruntime/pull/22681 - But still implicitly exclude DDS ops for TRT 10. Will later provide better PR to add trt_op_types_to_exclude provider option. --- .../tensorrt/tensorrt_provider_options.h | 36 ++++++----- .../tensorrt/tensorrt_execution_provider.cc | 43 ++++--------- .../tensorrt/tensorrt_execution_provider.h | 1 - .../tensorrt_execution_provider_info.cc | 6 -- .../tensorrt_execution_provider_info.h | 3 - .../tensorrt/tensorrt_provider_factory.cc | 1 - .../core/session/provider_bridge_ort.cc | 8 +-- .../python/onnxruntime_pybind_state.cc | 5 +- .../providers/tensorrt/tensorrt_basic_test.cc | 60 ------------------- 9 files changed, 33 insertions(+), 130 deletions(-) diff --git a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h index 5e5319a34ee9f..ec9be80a63574 100644 --- a/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h +++ b/include/onnxruntime/core/providers/tensorrt/tensorrt_provider_options.h @@ -71,23 +71,21 @@ struct OrtTensorRTProviderOptionsV2 { * directory by means of the "trt_onnx_model_folder_path" option. * */ - int trt_dump_ep_context_model{0}; // Dump EP context node model - const char* trt_ep_context_file_path{nullptr}; // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path. - int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data - int trt_weight_stripped_engine_enable{0}; // Enable weight-stripped engine build. Default 0 = false, - // nonzero = true - const char* trt_onnx_model_folder_path{nullptr}; // Folder path relative to the current working directory for - // the ONNX model containing the weights (applicable only when - // the "trt_weight_stripped_engine_enable" option is enabled) - const void* trt_onnx_bytestream{nullptr}; // The byte stream of th original ONNX model containing the weights - // (applicable only when the "trt_weight_stripped_engine_enable" - // option is enabled) - // can be updated using: UpdateTensorRTProviderOptionsWithValue - size_t trt_onnx_bytestream_size{0}; // size of the byte stream provided as "trt_onnx_bytestream" - // can be updated using: UpdateTensorRTProviderOptionsWithValue - const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix - int trt_engine_hw_compatible{0}; // Enable hardware compatibility. Default 0 = false, nonzero = true - const char* trt_op_types_to_exclude{"NonMaxSuppression,NonZero,RoiAlign"}; // Exclude specific ops from running on TRT. - // There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) from TRT versions 10.0 to 10.7. - // TRT EP excludes DDS ops from running on TRT by default, user can override default value with empty string to include all ops. + int trt_dump_ep_context_model{0}; // Dump EP context node model + const char* trt_ep_context_file_path{nullptr}; // Specify file name to dump EP context node model. Can be a path or a file name or a file name with path. + int trt_ep_context_embed_mode{0}; // Specify EP context embed mode. Default 0 = context is engine cache path, 1 = context is engine binary data + int trt_weight_stripped_engine_enable{0}; // Enable weight-stripped engine build. Default 0 = false, + // nonzero = true + const char* trt_onnx_model_folder_path{nullptr}; // Folder path relative to the current working directory for + // the ONNX model containing the weights (applicable only when + // the "trt_weight_stripped_engine_enable" option is enabled) + const void* trt_onnx_bytestream{nullptr}; // The byte stream of th original ONNX model containing the weights + // (applicable only when the "trt_weight_stripped_engine_enable" + // option is enabled) + // can be updated using: UpdateTensorRTProviderOptionsWithValue + size_t trt_onnx_bytestream_size{0}; // size of the byte stream provided as "trt_onnx_bytestream" + // can be updated using: UpdateTensorRTProviderOptionsWithValue + + const char* trt_engine_cache_prefix{nullptr}; // specify engine cache prefix + int trt_engine_hw_compatible{0}; // Enable hardware compatibility. Default 0 = false, nonzero = true }; diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc index 1a5cf6ababdfc..a7330c9bcf13d 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.cc @@ -1379,8 +1379,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv profile_opt_shapes = info.profile_opt_shapes; cuda_graph_enable_ = info.cuda_graph_enable; engine_hw_compatible_ = info.engine_hw_compatible; - op_types_to_exclude_ = info.op_types_to_exclude; - } else { try { const std::string max_partition_iterations_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kMaxPartitionIterations); @@ -1567,11 +1565,6 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv cuda_graph_enable_ = (std::stoi(cuda_graph_enable_env) == 0 ? false : true); } - const std::string op_types_to_exclude_env = onnxruntime::GetEnvironmentVar(tensorrt_env_vars::kOpTypesToExclude); - if (!op_types_to_exclude_env.empty()) { - op_types_to_exclude_ = op_types_to_exclude_env; - } - } catch (const std::invalid_argument& ex) { LOGS_DEFAULT(WARNING) << "[TensorRT EP] Invalid Argument (from environment variables): " << ex.what(); } catch (const std::out_of_range& ex) { @@ -1773,8 +1766,7 @@ TensorrtExecutionProvider::TensorrtExecutionProvider(const TensorrtExecutionProv << ", trt_ep_context_embed_mode: " << ep_context_embed_mode_ << ", trt_cache_prefix: " << cache_prefix_ << ", trt_engine_hw_compatible: " << engine_hw_compatible_ - << ", trt_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_ - << ", trt_op_types_to_exclude: " << op_types_to_exclude_; + << ", trt_onnx_model_bytestream_size_: " << onnx_model_bytestream_size_; } TensorrtExecutionProvider::~TensorrtExecutionProvider() { @@ -2442,18 +2434,6 @@ bool TensorrtExecutionProvider::DetectTensorRTGraphCycles(SubGraphCollection_t& return cycle_detected; } -std::set GetExcludedNodeSet(std::string node_list_to_exclude) { - std::set set; - if (!node_list_to_exclude.empty()) { - std::stringstream node_list(node_list_to_exclude); - std::string node; - while (std::getline(node_list, node, ',')) { - set.insert(node); - } - } - return set; -} - std::vector> TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, const IKernelLookup& /*kernel_lookup*/) const { @@ -2486,14 +2466,17 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, std::vector nodes_vector(number_of_ort_nodes); std::iota(std::begin(nodes_vector), std::end(nodes_vector), 0); - std::set exclude_set = GetExcludedNodeSet(op_types_to_exclude_); + std::set exclude_ops_set; - // Print excluded nodes, if any. - std::set::iterator it; - for (it = exclude_set.begin(); it != exclude_set.end(); ++it) { - std::string op = *it; - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Exclude \"" << op << "\" from running on TRT, if any."; - LOGS_DEFAULT(VERBOSE) << "[TensorRT EP] Remove \"" << op << "\" from trt_op_types_to_exclude or specify trt_op_types_to_exclude with empty string to include the op in the input to TRT parser. However, it still depends on TRT parser to determine the eligibility of this op for TRT."; + /* + * There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10. + * TRT EP automatically excludes DDS ops from running on TRT. + */ + if (trt_version_ >= 100000 && trt_version_ < 110000) { + exclude_ops_set.insert("NonMaxSuppression"); + exclude_ops_set.insert("NonZero"); + exclude_ops_set.insert("RoiAlign"); + LOGS_DEFAULT(VERBOSE) << "There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) in TRT 10. TRT EP automatically excludes DDS ops from running on TRT, if applicable"; } SubGraphCollection_t parser_nodes_vector, supported_nodes_vector; @@ -2502,7 +2485,7 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, /* Iterate all the nodes and exclude the node if: * 1. It's a control flow op and its subgraph(s) is not fully TRT eligible. - * 2. It's in the exlucded set which specified by trt_op_types_to_exclude. + * 2. It's a DDS op. */ for (const auto& index : nodes_vector) { const auto& node = graph.GetNode(node_index[index]); @@ -2538,7 +2521,7 @@ TensorrtExecutionProvider::GetCapability(const GraphViewer& graph, } // Exclude any ops, if applicable - if (exclude_set.find(node->OpType()) != exclude_set.end()) { + if (exclude_ops_set.find(node->OpType()) != exclude_ops_set.end()) { supported_node = false; } diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h index 9d8af02ba10e6..9e3a03417d917 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider.h @@ -57,7 +57,6 @@ static const std::string kDumpEpContextModel = "ORT_DUMP_EP_CONTEXT_MODEL"; static const std::string kEpContextEmbedMode = "ORT_EP_CONTEXT_EMBED_MODE"; static const std::string kEpContextComputeCapabilityEnable = "ORT_EP_CONTEXT_COMPUTE_CAPABILITY_ENABLE"; static const std::string kEngineCachePrefix = "ORT_TENSORRT_CACHE_PREFIX"; -static const std::string kOpTypesToExclude = "ORT_TENSORRT_OP_TYPES_TO_EXCLUDE"; // Old env variable for backward compatibility static const std::string kEngineCachePath = "ORT_TENSORRT_ENGINE_CACHE_PATH"; } // namespace tensorrt_env_vars diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc index bc0d00ec6791f..63b6d35072290 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.cc @@ -56,7 +56,6 @@ constexpr const char* kDumpEpContextModel = "trt_dump_ep_context_model"; constexpr const char* kEngineHwCompatible = "trt_engine_hw_compatible"; constexpr const char* kONNXBytestream = "trt_onnx_bytestream"; constexpr const char* kONNXBytestreamSize = "trt_onnx_bytestream_size"; -constexpr const char* kOpTypesToExclude = "trt_op_types_to_exclude"; } // namespace provider_option_names } // namespace tensorrt @@ -135,7 +134,6 @@ TensorrtExecutionProviderInfo TensorrtExecutionProviderInfo::FromProviderOptions return Status::OK(); }) .AddAssignmentToReference(tensorrt::provider_option_names::kONNXBytestreamSize, info.onnx_bytestream_size) - .AddAssignmentToReference(tensorrt::provider_option_names::kOpTypesToExclude, info.op_types_to_exclude) .Parse(options)); // add new provider option here. info.user_compute_stream = user_compute_stream; @@ -190,7 +188,6 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const TensorrtE {tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.engine_hw_compatible)}, {tensorrt::provider_option_names::kONNXBytestream, MakeStringWithClassicLocale(info.onnx_bytestream)}, {tensorrt::provider_option_names::kONNXBytestreamSize, MakeStringWithClassicLocale(info.onnx_bytestream_size)}, - {tensorrt::provider_option_names::kOpTypesToExclude, MakeStringWithClassicLocale(info.op_types_to_exclude)}, }; return options; } @@ -209,7 +206,6 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor const std::string kProfilesOptShapes_ = empty_if_null(info.trt_profile_opt_shapes); const std::string kEpContextFilePath_ = empty_if_null(info.trt_ep_context_file_path); const std::string kOnnxModelFolderPath_ = empty_if_null(info.trt_onnx_model_folder_path); - const std::string kOpTypesToExclude_ = empty_if_null(info.trt_op_types_to_exclude); const ProviderOptions options{ {tensorrt::provider_option_names::kDeviceId, MakeStringWithClassicLocale(info.device_id)}, @@ -255,7 +251,6 @@ ProviderOptions TensorrtExecutionProviderInfo::ToProviderOptions(const OrtTensor {tensorrt::provider_option_names::kEngineHwCompatible, MakeStringWithClassicLocale(info.trt_engine_hw_compatible)}, {tensorrt::provider_option_names::kONNXBytestream, MakeStringWithClassicLocale(reinterpret_cast(info.trt_onnx_bytestream))}, {tensorrt::provider_option_names::kONNXBytestreamSize, MakeStringWithClassicLocale(info.trt_onnx_bytestream_size)}, - {tensorrt::provider_option_names::kOpTypesToExclude, kOpTypesToExclude_}, }; return options; } @@ -360,6 +355,5 @@ void TensorrtExecutionProviderInfo::UpdateProviderOptions(void* provider_options trt_provider_options_v2.trt_engine_hw_compatible = internal_options.engine_hw_compatible; trt_provider_options_v2.trt_onnx_bytestream = internal_options.onnx_bytestream; trt_provider_options_v2.trt_onnx_bytestream_size = internal_options.onnx_bytestream_size; - trt_provider_options_v2.trt_op_types_to_exclude = copy_string_if_needed(internal_options.op_types_to_exclude); } } // namespace onnxruntime diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h index 767f320d760a8..fa1bbd6d3d7e6 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h +++ b/onnxruntime/core/providers/tensorrt/tensorrt_execution_provider_info.h @@ -60,9 +60,6 @@ struct TensorrtExecutionProviderInfo { int ep_context_embed_mode{0}; std::string engine_cache_prefix{""}; bool engine_hw_compatible{false}; - // There is a known performance issue with the DDS ops (NonMaxSuppression, NonZero and RoiAlign) from TRT versions 10.0 to 10.7. - // TRT EP excludes DDS ops from running on TRT by default, user can override default value of trt_op_types_to_exclude with empty string to include all ops. - std::string op_types_to_exclude{"NonMaxSuppression,NonZero,RoiAlign"}; static TensorrtExecutionProviderInfo FromProviderOptions(const ProviderOptions& options); static ProviderOptions ToProviderOptions(const TensorrtExecutionProviderInfo& info); diff --git a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc index e4521ddd18ade..e242788ff389a 100644 --- a/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc +++ b/onnxruntime/core/providers/tensorrt/tensorrt_provider_factory.cc @@ -118,7 +118,6 @@ struct Tensorrt_Provider : Provider { info.engine_hw_compatible = options.trt_engine_hw_compatible != 0; info.onnx_bytestream = options.trt_onnx_bytestream; info.onnx_bytestream_size = options.trt_onnx_bytestream_size; - info.op_types_to_exclude = options.trt_op_types_to_exclude == nullptr ? "" : options.trt_op_types_to_exclude; return std::make_shared(info); } diff --git a/onnxruntime/core/session/provider_bridge_ort.cc b/onnxruntime/core/session/provider_bridge_ort.cc index 0aa93bce354e8..ecbdd31160c7a 100644 --- a/onnxruntime/core/session/provider_bridge_ort.cc +++ b/onnxruntime/core/session/provider_bridge_ort.cc @@ -2294,11 +2294,8 @@ ORT_API_STATUS_IMPL(OrtApis::UpdateTensorRTProviderOptions, #ifdef USE_TENSORRT onnxruntime::ProviderOptions provider_options_map; for (size_t i = 0; i != num_keys; ++i) { - // Don't allow key and value to be empty except the value of trt_op_types_to_exclude - if (provider_options_keys[i] == nullptr || - provider_options_keys[i][0] == '\0' || - (provider_options_values[i] == nullptr && strcmp("trt_op_types_to_exclude", provider_options_keys[i])) || - (provider_options_values[i][0] == '\0' && strcmp("trt_op_types_to_exclude", provider_options_keys[i]))) { + if (provider_options_keys[i] == nullptr || provider_options_keys[i][0] == '\0' || + provider_options_values[i] == nullptr || provider_options_values[i][0] == '\0') { return OrtApis::CreateStatus(ORT_INVALID_ARGUMENT, "key/value cannot be empty"); } @@ -2413,7 +2410,6 @@ ORT_API(void, OrtApis::ReleaseTensorRTProviderOptions, _Frees_ptr_opt_ OrtTensor delete[] ptr->trt_profile_opt_shapes; delete[] ptr->trt_ep_context_file_path; delete[] ptr->trt_onnx_model_folder_path; - if (!ptr->trt_op_types_to_exclude) delete[] ptr->trt_op_types_to_exclude; } std::unique_ptr p(ptr); diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 05ca3c6c15793..4d9583be0ef0f 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -526,7 +526,7 @@ std::unique_ptr CreateExecutionProviderInstance( // and TRT EP instance, so it won't be released.) std::string calibration_table, cache_path, cache_prefix, timing_cache_path, lib_path, trt_tactic_sources, trt_extra_plugin_lib_paths, min_profile, max_profile, opt_profile, ep_context_file_path, - onnx_model_folder_path, trt_op_types_to_exclude{"NonMaxSuppression,NonZero,RoiAlign"}; + onnx_model_folder_path; auto it = provider_options_map.find(type); if (it != provider_options_map.end()) { OrtTensorRTProviderOptionsV2 params; @@ -824,9 +824,6 @@ std::unique_ptr CreateExecutionProviderInstance( } else { ORT_THROW("[ERROR] [TensorRT] The value for the key 'trt_engine_hw_compatible' should be 'True' or 'False'. Default value is 'False'.\n"); } - } else if (option.first == "trt_op_types_to_exclude") { - trt_op_types_to_exclude = option.second; - params.trt_op_types_to_exclude = trt_op_types_to_exclude.c_str(); } else { ORT_THROW("Invalid TensorRT EP option: ", option.first); } diff --git a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc index b4199548ae515..63327a028c6f4 100644 --- a/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc +++ b/onnxruntime/test/providers/tensorrt/tensorrt_basic_test.cc @@ -612,66 +612,6 @@ TEST(TensorrtExecutionProviderTest, EPContextNode) { RunSession(session_object9, run_options, feeds, output_names, expected_dims_mul_m, expected_values_mul_m); } -TEST(TensorrtExecutionProviderTest, ExcludeOpsTest) { - /* The mnist.onnx looks like this: - * Conv - * | - * Add - * . - * . - * | - * MaxPool - * | - * . - * . - * MaxPool - * | - * Reshape - * | - * MatMul - * . - * . - * - */ - PathString model_name = ORT_TSTR("testdata/mnist.onnx"); - SessionOptions so; - so.session_logid = "TensorrtExecutionProviderExcludeOpsTest"; - RunOptions run_options; - run_options.run_tag = so.session_logid; - InferenceSession session_object{so, GetEnvironment()}; - auto cuda_provider = DefaultCudaExecutionProvider(); - auto cpu_allocator = cuda_provider->CreatePreferredAllocators()[1]; - std::vector dims_op_x = {1, 1, 28, 28}; - std::vector values_op_x(784, 1.0f); // 784=1*1*28*28 - OrtValue ml_value_x; - CreateMLValue(cpu_allocator, dims_op_x, values_op_x, &ml_value_x); - NameMLValMap feeds; - feeds.insert(std::make_pair("Input3", ml_value_x)); - - // prepare outputs - std::vector output_names; - output_names.push_back("Plus214_Output_0"); - std::vector fetches; - - RemoveCachesByType("./", ".engine"); - OrtTensorRTProviderOptionsV2 params; - params.trt_engine_cache_enable = 1; - params.trt_op_types_to_exclude = "MaxPool"; - std::unique_ptr execution_provider = TensorrtExecutionProviderWithOptions(¶ms); - EXPECT_TRUE(session_object.RegisterExecutionProvider(std::move(execution_provider)).IsOK()); - auto status = session_object.Load(model_name); - ASSERT_TRUE(status.IsOK()); - status = session_object.Initialize(); - ASSERT_TRUE(status.IsOK()); - status = session_object.Run(run_options, feeds, output_names, &fetches); - ASSERT_TRUE(status.IsOK()); - - std::vector engine_files; - engine_files = GetCachesByType("./", ".engine"); - // The whole graph should be partitioned into 3 TRT subgraphs and 2 cpu nodes - ASSERT_EQ(engine_files.size(), 3); -} - TEST(TensorrtExecutionProviderTest, TRTPluginsCustomOpTest) { PathString model_name = ORT_TSTR("testdata/trt_plugin_custom_op_test.onnx"); SessionOptions so; From 0d00fc31303c30e5261c7230e227ecfe119e9245 Mon Sep 17 00:00:00 2001 From: Caroline Zhu Date: Tue, 19 Nov 2024 09:27:51 -0800 Subject: [PATCH 17/28] [mobile] Fix for mac-ios-packaging pipeline (#22879) ### Description Appends variant name to the Browserstack artifacts that are published so that we don't run into the error: "##[error]Artifact browserstack_test_artifacts already exists for build 609095." [Working pipeline run](https://aiinfra.visualstudio.com/Lotus/_build/results?buildId=609503&view=results) ### Motivation and Context - onnxruntime-ios-packaging-pipeline has been failing --- .../templates/stages/mac-ios-packaging-build-stage.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index b6a214154e680..5d7ea5e7b2727 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -161,7 +161,7 @@ stages: # Publish the BrowserStack artifacts first so that if the next step fails, the artifacts will still be published # so that users can attempt to locally debug - publish: "$(Build.ArtifactStagingDirectory)" - artifact: "browserstack_test_artifacts" + artifact: "browserstack_test_artifacts_${{ lower(parameters.packageVariant) }}" displayName: "Publish BrowserStack test artifacts" - script: | From 8a06f13301d193cb1363f01be98e800079c541f4 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Wed, 20 Nov 2024 02:22:02 +0800 Subject: [PATCH 18/28] [WebNN] Remove wasm.currentContext check (#22886) If a WebNN session is threw early, this check for `wasm.currentContext` will break all the following WebNN sessions, this often happens in npm tests. --- js/web/lib/wasm/wasm-core-impl.ts | 3 --- 1 file changed, 3 deletions(-) diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index f3794a72efbe8..81d1b73efc9d4 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -291,9 +291,6 @@ export const createSession = async ( const providerName = typeof provider === 'string' ? provider : provider.name; if (providerName === 'webnn') { wasm.shouldTransferToMLTensor = false; - if (wasm.currentContext) { - throw new Error('WebNN execution provider is already set.'); - } if (typeof provider !== 'string') { const webnnOptions = provider as InferenceSession.WebNNExecutionProviderOption; const context = (webnnOptions as InferenceSession.WebNNOptionsWithMLContext)?.context; From 5b787121e892ac2b74c6df2927d334296e32d097 Mon Sep 17 00:00:00 2001 From: Wanming Lin Date: Wed, 20 Nov 2024 04:44:23 +0800 Subject: [PATCH 19/28] [WebNN] Check split's output name (#22884) Chromium will rename split's output name from "output" to "outputs" in `OpSupportLimits` to align with spec, the EP should check which name is available to make it compatible. --- .../webnn/builders/impl/split_op_builder.cc | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc index c4ccc80d44c0f..db10720f72762 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/split_op_builder.cc @@ -28,6 +28,8 @@ class SplitOpBuilder : public BaseOpBuilder { private: bool IsOpSupportedImpl(const InitializedTensorSet& initializers, const Node& node, const WebnnDeviceType /* device_type */, const logging::Logger& logger) const override; + bool HasSupportedOutputsImpl(const Node& node, const emscripten::val& wnn_limits, + const logging::Logger& logger) const override; }; // Add operator related. @@ -163,6 +165,23 @@ bool SplitOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, return true; } +bool SplitOpBuilder::HasSupportedOutputsImpl(const Node& node, + const emscripten::val& wnn_limits, + const logging::Logger& logger) const { + const auto& output_defs = node.OutputDefs(); + const auto& op_type = node.OpType(); + int32_t output_type = 0; + + if (GetType(*output_defs[0], output_type, logger)) { + // Chromium has changed the output name of split from 'output' to 'outputs', + // to avoid breaking the existing API, we need to check both names. + std::string wnn_output_name = wnn_limits["split"]["output"].isUndefined() ? "outputs" : "output"; + return IsDataTypeSupportedByOp(op_type, output_type, wnn_limits, wnn_output_name, "outputs", logger); + } + + return false; +} + void CreateSplitOpBuilder(const std::string& op_type, OpBuilderRegistrations& op_registrations) { op_registrations.builders.push_back(std::make_unique()); op_registrations.op_builder_map.emplace(op_type, op_registrations.builders.back().get()); From 13346fdf1824b3c26da85880739dde7240b71cbd Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Tue, 19 Nov 2024 14:13:33 -0800 Subject: [PATCH 20/28] Cleanup code (#22827) ### Description 1. Delete TVM EP because it is out of maintain 2. Delete ortmodule related docker files and scripts. --- ThirdPartyNotices.txt | 255 ---- cgmanifests/cgmanifest.json | 1082 ++++++++--------- cmake/CMakeLists.txt | 54 - cmake/external/tvm.cmake | 24 - cmake/onnxruntime.cmake | 2 - cmake/onnxruntime_codegen_tvm.cmake | 25 - cmake/onnxruntime_csharp.cmake | 4 - cmake/onnxruntime_providers.cmake | 7 - cmake/onnxruntime_providers_cuda.cmake | 3 +- cmake/onnxruntime_providers_tvm.cmake | 64 - cmake/onnxruntime_python.cmake | 33 - cmake/onnxruntime_unittests.cmake | 35 - .../NativeMethods.shared.cs | 3 - .../SessionOptions.shared.cs | 35 - .../InferenceTest.cs | 4 - docs/TVM_EP.md | 319 ----- .../notebooks/onnxruntime-tvm-tutorial.ipynb | 657 ---------- onnxruntime/core/codegen/common/common.cc | 284 ----- onnxruntime/core/codegen/common/common.h | 153 --- onnxruntime/core/codegen/common/creator.h | 76 -- onnxruntime/core/codegen/common/dispatcher.h | 76 -- onnxruntime/core/codegen/common/dump_array.h | 62 - onnxruntime/core/codegen/common/handle.h | 22 - onnxruntime/core/codegen/common/op_macro.h | 101 -- onnxruntime/core/codegen/common/profile.h | 37 - onnxruntime/core/codegen/common/registry.h | 72 -- onnxruntime/core/codegen/common/settings.cc | 78 -- onnxruntime/core/codegen/common/settings.h | 40 - onnxruntime/core/codegen/common/target_info.h | 33 - onnxruntime/core/codegen/common/utils.cc | 99 -- onnxruntime/core/codegen/common/utils.h | 45 - onnxruntime/core/codegen/mti/common.h | 16 - .../core/codegen/mti/debug/tvm_print.cc | 83 -- .../core/codegen/mti/debug/tvm_print.h | 19 - .../core/codegen/mti/math/binary_ops.cc | 70 -- .../core/codegen/mti/math/binary_ops.h | 42 - onnxruntime/core/codegen/mti/math/gemm.cc | 30 - onnxruntime/core/codegen/mti/math/gemm.h | 16 - .../core/codegen/mti/math/logsoftmax.cc | 18 - .../core/codegen/mti/math/logsoftmax.h | 11 - .../core/codegen/mti/math/matmul_ops.cc | 161 --- .../core/codegen/mti/math/matmul_ops.h | 23 - .../core/codegen/mti/math/reduce_ops.cc | 90 -- .../core/codegen/mti/math/reduce_ops.h | 72 -- onnxruntime/core/codegen/mti/math/softmax.cc | 18 - onnxruntime/core/codegen/mti/math/softmax.h | 11 - .../core/codegen/mti/math/unary_ops.cc | 155 --- onnxruntime/core/codegen/mti/math/unary_ops.h | 36 - onnxruntime/core/codegen/mti/mti_tvm_utils.cc | 203 ---- onnxruntime/core/codegen/mti/mti_tvm_utils.h | 71 -- onnxruntime/core/codegen/mti/nn/conv_ops.cc | 193 --- onnxruntime/core/codegen/mti/nn/conv_ops.h | 39 - onnxruntime/core/codegen/mti/nn/lstm.cc | 140 --- onnxruntime/core/codegen/mti/nn/lstm.h | 35 - onnxruntime/core/codegen/mti/nn/pool_ops.cc | 63 - onnxruntime/core/codegen/mti/nn/pool_ops.h | 36 - .../core/codegen/mti/tensor/cast_ops.cc | 37 - .../core/codegen/mti/tensor/cast_ops.h | 15 - .../core/codegen/mti/tensor/concat_ops.cc | 83 -- .../core/codegen/mti/tensor/concat_ops.h | 15 - onnxruntime/core/codegen/mti/tensor/crop.cc | 58 - onnxruntime/core/codegen/mti/tensor/crop.h | 17 - onnxruntime/core/codegen/mti/tensor/expand.cc | 30 - onnxruntime/core/codegen/mti/tensor/expand.h | 14 - onnxruntime/core/codegen/mti/tensor/gather.cc | 55 - onnxruntime/core/codegen/mti/tensor/gather.h | 17 - .../codegen/mti/tensor/gather_elements.cc | 45 - .../core/codegen/mti/tensor/gather_elements.h | 17 - .../core/codegen/mti/tensor/pad_ops.cc | 121 -- onnxruntime/core/codegen/mti/tensor/pad_ops.h | 34 - .../core/codegen/mti/tensor/reshape_ops.cc | 48 - .../core/codegen/mti/tensor/reshape_ops.h | 16 - .../core/codegen/mti/tensor/shape_op.cc | 25 - .../core/codegen/mti/tensor/shape_op.h | 14 - onnxruntime/core/codegen/mti/tensor/slice.cc | 91 -- onnxruntime/core/codegen/mti/tensor/slice.h | 19 - onnxruntime/core/codegen/mti/tensor/split.cc | 72 -- onnxruntime/core/codegen/mti/tensor/split.h | 25 - onnxruntime/core/codegen/mti/tensor/tile.cc | 40 - onnxruntime/core/codegen/mti/tensor/tile.h | 16 - .../core/codegen/mti/tensor/transpose.cc | 16 - .../core/codegen/mti/tensor/transpose.h | 16 - onnxruntime/core/codegen/mti/tensor/where.cc | 36 - onnxruntime/core/codegen/mti/tensor/where.h | 17 - .../codegen/passes/op_ir_creator/all_ops.h | 47 - .../passes/op_ir_creator/math/binary_ops.cc | 46 - .../codegen/passes/op_ir_creator/math/clip.cc | 48 - .../codegen/passes/op_ir_creator/math/gemm.cc | 39 - .../passes/op_ir_creator/math/logsoftmax.cc | 32 - .../passes/op_ir_creator/math/matmul.cc | 23 - .../math/quantize/matmul_integer.cc | 37 - .../passes/op_ir_creator/math/reduce_ops.cc | 111 -- .../passes/op_ir_creator/math/softmax.cc | 32 - .../passes/op_ir_creator/math/unary_funcs.h | 51 - .../passes/op_ir_creator/math/unary_ops.cc | 93 -- .../passes/op_ir_creator/math/variadic_ops.cc | 36 - .../codegen/passes/op_ir_creator/nn/conv.cc | 131 -- .../codegen/passes/op_ir_creator/nn/lstm.cc | 64 - .../passes/op_ir_creator/nn/pool_ops.cc | 51 - .../passes/op_ir_creator/tensor/cast.cc | 40 - .../passes/op_ir_creator/tensor/concat.cc | 30 - .../passes/op_ir_creator/tensor/crop.cc | 46 - .../passes/op_ir_creator/tensor/expand.cc | 26 - .../passes/op_ir_creator/tensor/gather.cc | 30 - .../op_ir_creator/tensor/gather_elements.cc | 32 - .../passes/op_ir_creator/tensor/pad.cc | 49 - .../op_ir_creator/tensor/reshape_ops.cc | 99 -- .../passes/op_ir_creator/tensor/shape_op.cc | 26 - .../passes/op_ir_creator/tensor/slice.cc | 71 -- .../passes/op_ir_creator/tensor/split.cc | 66 - .../passes/op_ir_creator/tensor/transpose.cc | 48 - .../passes/op_ir_creator/tensor/where.cc | 28 - .../passes/op_ir_creator/tvm_ir_builder.cc | 125 -- .../passes/op_ir_creator/tvm_ir_builder.h | 64 - .../passes/op_ir_creator/tvm_op_creator.cc | 37 - .../passes/op_ir_creator/tvm_op_creator.h | 84 -- .../codegen/passes/scheduler/all_schedules.h | 20 - .../passes/scheduler/ort_type_schedule.cc | 22 - .../passes/scheduler/schedule_utils.cc | 178 --- .../codegen/passes/scheduler/schedule_utils.h | 65 - .../passes/scheduler/tvm_rule_schedule.cc | 41 - .../passes/scheduler/tvm_schedule_builder.cc | 104 -- .../passes/scheduler/tvm_schedule_builder.h | 46 - .../codegen/passes/scheduler/tvm_scheduler.cc | 79 -- .../codegen/passes/scheduler/tvm_scheduler.h | 128 -- .../codegen/passes/utils/codegen_context.cc | 27 - .../codegen/passes/utils/codegen_context.h | 44 - .../codegen/passes/utils/ort_tvm_utils.cc | 194 --- .../core/codegen/passes/utils/ort_tvm_utils.h | 31 - .../codegen/passes/weight_layout/tiling_2d.cc | 105 -- .../codegen/passes/weight_layout/tiling_2d.h | 43 - .../passes/weight_layout/transpose_2d.cc | 64 - .../passes/weight_layout/transpose_2d.h | 33 - .../weight_layout/vertical_stripes_2d.cc | 77 -- .../weight_layout/vertical_stripes_2d.h | 40 - .../passes/weight_layout/weight_layout.cc | 91 -- .../passes/weight_layout/weight_layout.h | 68 -- onnxruntime/core/framework/utils.cc | 1 - .../core/platform/windows/stacktrace.cc | 1 - .../core/providers/get_execution_providers.cc | 8 - .../providers/provider_factory_creators.h | 4 - .../core/providers/tvm/custom_logging.cc | 52 - .../core/providers/tvm/hash_alg/hasher.cc | 30 - .../core/providers/tvm/hash_alg/hasher.h | 32 - .../providers/tvm/hash_alg/hasher_impl.cc | 39 - .../core/providers/tvm/hash_alg/hasher_impl.h | 42 - onnxruntime/core/providers/tvm/symbols.txt | 1 - .../core/providers/tvm/tvm_allocator.cc | 29 - .../core/providers/tvm/tvm_allocator.h | 45 - onnxruntime/core/providers/tvm/tvm_api.cc | 303 ----- onnxruntime/core/providers/tvm/tvm_api.h | 38 - onnxruntime/core/providers/tvm/tvm_common.h | 22 - .../core/providers/tvm/tvm_compiler.cc | 48 - onnxruntime/core/providers/tvm/tvm_compiler.h | 63 - onnxruntime/core/providers/tvm/tvm_defaults.h | 35 - .../core/providers/tvm/tvm_ep_options.cc | 273 ----- .../core/providers/tvm/tvm_ep_options.h | 76 -- .../providers/tvm/tvm_execution_provider.cc | 304 ----- .../providers/tvm/tvm_execution_provider.h | 71 -- .../providers/tvm/tvm_provider_factory.cc | 53 - .../tvm/tvm_provider_factory_creator.h | 19 - onnxruntime/core/providers/tvm/tvm_runner.cc | 26 - onnxruntime/core/providers/tvm/tvm_runner.h | 34 - .../core/providers/tvm/tvm_runner_impl.cc | 175 --- .../core/providers/tvm/tvm_runner_impl.h | 126 -- .../tvm/tvm_so_execution_provider.cc | 284 ----- .../providers/tvm/tvm_so_execution_provider.h | 72 -- onnxruntime/core/providers/tvm/tvm_utils.cc | 31 - onnxruntime/core/providers/tvm/tvm_utils.h | 70 -- .../core/providers/tvm/xpu_data_transfer.cc | 84 -- .../core/providers/tvm/xpu_data_transfer.h | 38 - .../core/session/provider_registration.cc | 9 - .../python/onnxruntime_pybind_state.cc | 10 - .../python/onnxruntime_pybind_state_common.h | 15 +- onnxruntime/python/providers/tvm/__init__.py | 10 - .../providers/tvm/extend_python_file.py | 54 - onnxruntime/python/providers/tvm/ort.py | 140 --- onnxruntime/test/framework/function_test.cc | 6 - .../test/platform/windows/stacktrace_test.cc | 1 - .../python/onnxruntime_test_python_tvm.py | 242 ---- onnxruntime/test/util/default_providers.cc | 8 - .../test/util/include/default_providers.h | 1 - onnxruntime/test/util/include/providers.h | 3 - tools/ci_build/build.py | 62 +- ...orttraining-linux-gpu-test-ci-pipeline.yml | 41 - ...py-packaging-training-cuda-stage-steps.yml | 209 ---- .../linux/docker/scripts/install_os_deps.sh | 1 - .../docker/scripts/install_python_deps.sh | 16 - .../stage1/requirements_rocm/requirements.txt | 2 - .../requirements.txt | 8 - .../requirements.txt | 7 - .../requirements_torch_cpu/requirements.txt | 3 - .../requirements.txt | 5 - .../stage1/torch_eager_cpu/requirements.txt | 11 - .../ortmodule/stage2/requirements.txt | 15 - .../ci_build/github/linux/run_dockerbuild.sh | 26 - .../pai/pai_huggingface_bert_large_test.sh | 43 - tools/ci_build/set-trigger-rules.py | 5 - .../nuget/generate_nuspec_for_native_nuget.py | 51 +- tools/scripts/python_test.sh | 8 +- 200 files changed, 512 insertions(+), 12715 deletions(-) delete mode 100644 cmake/external/tvm.cmake delete mode 100644 cmake/onnxruntime_codegen_tvm.cmake delete mode 100644 cmake/onnxruntime_providers_tvm.cmake delete mode 100644 docs/TVM_EP.md delete mode 100644 docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb delete mode 100644 onnxruntime/core/codegen/common/common.cc delete mode 100644 onnxruntime/core/codegen/common/common.h delete mode 100644 onnxruntime/core/codegen/common/creator.h delete mode 100644 onnxruntime/core/codegen/common/dispatcher.h delete mode 100644 onnxruntime/core/codegen/common/dump_array.h delete mode 100644 onnxruntime/core/codegen/common/handle.h delete mode 100644 onnxruntime/core/codegen/common/op_macro.h delete mode 100644 onnxruntime/core/codegen/common/profile.h delete mode 100644 onnxruntime/core/codegen/common/registry.h delete mode 100644 onnxruntime/core/codegen/common/settings.cc delete mode 100644 onnxruntime/core/codegen/common/settings.h delete mode 100644 onnxruntime/core/codegen/common/target_info.h delete mode 100644 onnxruntime/core/codegen/common/utils.cc delete mode 100644 onnxruntime/core/codegen/common/utils.h delete mode 100644 onnxruntime/core/codegen/mti/common.h delete mode 100644 onnxruntime/core/codegen/mti/debug/tvm_print.cc delete mode 100644 onnxruntime/core/codegen/mti/debug/tvm_print.h delete mode 100644 onnxruntime/core/codegen/mti/math/binary_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/math/binary_ops.h delete mode 100644 onnxruntime/core/codegen/mti/math/gemm.cc delete mode 100644 onnxruntime/core/codegen/mti/math/gemm.h delete mode 100644 onnxruntime/core/codegen/mti/math/logsoftmax.cc delete mode 100644 onnxruntime/core/codegen/mti/math/logsoftmax.h delete mode 100644 onnxruntime/core/codegen/mti/math/matmul_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/math/matmul_ops.h delete mode 100644 onnxruntime/core/codegen/mti/math/reduce_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/math/reduce_ops.h delete mode 100644 onnxruntime/core/codegen/mti/math/softmax.cc delete mode 100644 onnxruntime/core/codegen/mti/math/softmax.h delete mode 100644 onnxruntime/core/codegen/mti/math/unary_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/math/unary_ops.h delete mode 100644 onnxruntime/core/codegen/mti/mti_tvm_utils.cc delete mode 100644 onnxruntime/core/codegen/mti/mti_tvm_utils.h delete mode 100644 onnxruntime/core/codegen/mti/nn/conv_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/nn/conv_ops.h delete mode 100644 onnxruntime/core/codegen/mti/nn/lstm.cc delete mode 100644 onnxruntime/core/codegen/mti/nn/lstm.h delete mode 100644 onnxruntime/core/codegen/mti/nn/pool_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/nn/pool_ops.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/cast_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/cast_ops.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/concat_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/concat_ops.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/crop.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/crop.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/expand.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/expand.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/gather.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/gather.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/gather_elements.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/gather_elements.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/pad_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/pad_ops.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/reshape_ops.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/reshape_ops.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/shape_op.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/shape_op.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/slice.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/slice.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/split.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/split.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/tile.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/tile.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/transpose.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/transpose.h delete mode 100644 onnxruntime/core/codegen/mti/tensor/where.cc delete mode 100644 onnxruntime/core/codegen/mti/tensor/where.h delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/all_ops.h delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/binary_ops.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/clip.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/gemm.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/logsoftmax.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/matmul.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/quantize/matmul_integer.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/reduce_ops.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/softmax.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/math/variadic_ops.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/nn/conv.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/cast.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/concat.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/crop.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/expand.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather_elements.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/pad.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/reshape_ops.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/shape_op.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/transpose.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tensor/where.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.h delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.cc delete mode 100644 onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.h delete mode 100644 onnxruntime/core/codegen/passes/scheduler/all_schedules.h delete mode 100644 onnxruntime/core/codegen/passes/scheduler/ort_type_schedule.cc delete mode 100644 onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc delete mode 100644 onnxruntime/core/codegen/passes/scheduler/schedule_utils.h delete mode 100644 onnxruntime/core/codegen/passes/scheduler/tvm_rule_schedule.cc delete mode 100644 onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc delete mode 100644 onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.h delete mode 100644 onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.cc delete mode 100644 onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.h delete mode 100644 onnxruntime/core/codegen/passes/utils/codegen_context.cc delete mode 100644 onnxruntime/core/codegen/passes/utils/codegen_context.h delete mode 100644 onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc delete mode 100644 onnxruntime/core/codegen/passes/utils/ort_tvm_utils.h delete mode 100644 onnxruntime/core/codegen/passes/weight_layout/tiling_2d.cc delete mode 100644 onnxruntime/core/codegen/passes/weight_layout/tiling_2d.h delete mode 100644 onnxruntime/core/codegen/passes/weight_layout/transpose_2d.cc delete mode 100644 onnxruntime/core/codegen/passes/weight_layout/transpose_2d.h delete mode 100644 onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.cc delete mode 100644 onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.h delete mode 100644 onnxruntime/core/codegen/passes/weight_layout/weight_layout.cc delete mode 100644 onnxruntime/core/codegen/passes/weight_layout/weight_layout.h delete mode 100644 onnxruntime/core/providers/tvm/custom_logging.cc delete mode 100644 onnxruntime/core/providers/tvm/hash_alg/hasher.cc delete mode 100644 onnxruntime/core/providers/tvm/hash_alg/hasher.h delete mode 100644 onnxruntime/core/providers/tvm/hash_alg/hasher_impl.cc delete mode 100644 onnxruntime/core/providers/tvm/hash_alg/hasher_impl.h delete mode 100644 onnxruntime/core/providers/tvm/symbols.txt delete mode 100644 onnxruntime/core/providers/tvm/tvm_allocator.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_allocator.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_api.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_api.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_common.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_compiler.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_compiler.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_defaults.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_ep_options.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_ep_options.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_execution_provider.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_execution_provider.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_provider_factory.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_provider_factory_creator.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_runner.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_runner.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_runner_impl.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_runner_impl.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_so_execution_provider.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_so_execution_provider.h delete mode 100644 onnxruntime/core/providers/tvm/tvm_utils.cc delete mode 100644 onnxruntime/core/providers/tvm/tvm_utils.h delete mode 100644 onnxruntime/core/providers/tvm/xpu_data_transfer.cc delete mode 100644 onnxruntime/core/providers/tvm/xpu_data_transfer.h delete mode 100644 onnxruntime/python/providers/tvm/__init__.py delete mode 100644 onnxruntime/python/providers/tvm/extend_python_file.py delete mode 100644 onnxruntime/python/providers/tvm/ort.py delete mode 100644 onnxruntime/test/python/onnxruntime_test_python_tvm.py delete mode 100644 tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml delete mode 100644 tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml delete mode 100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt delete mode 100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.0.0_cu11.8/requirements.txt delete mode 100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt delete mode 100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt delete mode 100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt delete mode 100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/torch_eager_cpu/requirements.txt delete mode 100644 tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt delete mode 100755 tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh diff --git a/ThirdPartyNotices.txt b/ThirdPartyNotices.txt index 20142e734dfac..26084ab42ec1c 100644 --- a/ThirdPartyNotices.txt +++ b/ThirdPartyNotices.txt @@ -2108,261 +2108,6 @@ SOFTWARE. _____ -TVM Open Deep Learning Compiler Stack - - Apache License - Version 2.0, January 2004 - http://www.apache.org/licenses/ - - TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION - - 1. Definitions. - - "License" shall mean the terms and conditions for use, reproduction, - and distribution as defined by Sections 1 through 9 of this document. - - "Licensor" shall mean the copyright owner or entity authorized by - the copyright owner that is granting the License. - - "Legal Entity" shall mean the union of the acting entity and all - other entities that control, are controlled by, or are under common - control with that entity. For the purposes of this definition, - "control" means (i) the power, direct or indirect, to cause the - direction or management of such entity, whether by contract or - otherwise, or (ii) ownership of fifty percent (50%) or more of the - outstanding shares, or (iii) beneficial ownership of such entity. - - "You" (or "Your") shall mean an individual or Legal Entity - exercising permissions granted by this License. - - "Source" form shall mean the preferred form for making modifications, - including but not limited to software source code, documentation - source, and configuration files. - - "Object" form shall mean any form resulting from mechanical - transformation or translation of a Source form, including but - not limited to compiled object code, generated documentation, - and conversions to other media types. - - "Work" shall mean the work of authorship, whether in Source or - Object form, made available under the License, as indicated by a - copyright notice that is included in or attached to the work - (an example is provided in the Appendix below). - - "Derivative Works" shall mean any work, whether in Source or Object - form, that is based on (or derived from) the Work and for which the - editorial revisions, annotations, elaborations, or other modifications - represent, as a whole, an original work of authorship. For the purposes - of this License, Derivative Works shall not include works that remain - separable from, or merely link (or bind by name) to the interfaces of, - the Work and Derivative Works thereof. - - "Contribution" shall mean any work of authorship, including - the original version of the Work and any modifications or additions - to that Work or Derivative Works thereof, that is intentionally - submitted to Licensor for inclusion in the Work by the copyright owner - or by an individual or Legal Entity authorized to submit on behalf of - the copyright owner. For the purposes of this definition, "submitted" - means any form of electronic, verbal, or written communication sent - to the Licensor or its representatives, including but not limited to - communication on electronic mailing lists, source code control systems, - and issue tracking systems that are managed by, or on behalf of, the - Licensor for the purpose of discussing and improving the Work, but - excluding communication that is conspicuously marked or otherwise - designated in writing by the copyright owner as "Not a Contribution." - - "Contributor" shall mean Licensor and any individual or Legal Entity - on behalf of whom a Contribution has been received by Licensor and - subsequently incorporated within the Work. - - 2. Grant of Copyright License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - copyright license to reproduce, prepare Derivative Works of, - publicly display, publicly perform, sublicense, and distribute the - Work and such Derivative Works in Source or Object form. - - 3. Grant of Patent License. Subject to the terms and conditions of - this License, each Contributor hereby grants to You a perpetual, - worldwide, non-exclusive, no-charge, royalty-free, irrevocable - (except as stated in this section) patent license to make, have made, - use, offer to sell, sell, import, and otherwise transfer the Work, - where such license applies only to those patent claims licensable - by such Contributor that are necessarily infringed by their - Contribution(s) alone or by combination of their Contribution(s) - with the Work to which such Contribution(s) was submitted. If You - institute patent litigation against any entity (including a - cross-claim or counterclaim in a lawsuit) alleging that the Work - or a Contribution incorporated within the Work constitutes direct - or contributory patent infringement, then any patent licenses - granted to You under this License for that Work shall terminate - as of the date such litigation is filed. - - 4. Redistribution. You may reproduce and distribute copies of the - Work or Derivative Works thereof in any medium, with or without - modifications, and in Source or Object form, provided that You - meet the following conditions: - - (a) You must give any other recipients of the Work or - Derivative Works a copy of this License; and - - (b) You must cause any modified files to carry prominent notices - stating that You changed the files; and - - (c) You must retain, in the Source form of any Derivative Works - that You distribute, all copyright, patent, trademark, and - attribution notices from the Source form of the Work, - excluding those notices that do not pertain to any part of - the Derivative Works; and - - (d) If the Work includes a "NOTICE" text file as part of its - distribution, then any Derivative Works that You distribute must - include a readable copy of the attribution notices contained - within such NOTICE file, excluding those notices that do not - pertain to any part of the Derivative Works, in at least one - of the following places: within a NOTICE text file distributed - as part of the Derivative Works; within the Source form or - documentation, if provided along with the Derivative Works; or, - within a display generated by the Derivative Works, if and - wherever such third-party notices normally appear. The contents - of the NOTICE file are for informational purposes only and - do not modify the License. You may add Your own attribution - notices within Derivative Works that You distribute, alongside - or as an addendum to the NOTICE text from the Work, provided - that such additional attribution notices cannot be construed - as modifying the License. - - You may add Your own copyright statement to Your modifications and - may provide additional or different license terms and conditions - for use, reproduction, or distribution of Your modifications, or - for any such Derivative Works as a whole, provided Your use, - reproduction, and distribution of the Work otherwise complies with - the conditions stated in this License. - - 5. Submission of Contributions. Unless You explicitly state otherwise, - any Contribution intentionally submitted for inclusion in the Work - by You to the Licensor shall be under the terms and conditions of - this License, without any additional terms or conditions. - Notwithstanding the above, nothing herein shall supersede or modify - the terms of any separate license agreement you may have executed - with Licensor regarding such Contributions. - - 6. Trademarks. This License does not grant permission to use the trade - names, trademarks, service marks, or product names of the Licensor, - except as required for reasonable and customary use in describing the - origin of the Work and reproducing the content of the NOTICE file. - - 7. Disclaimer of Warranty. Unless required by applicable law or - agreed to in writing, Licensor provides the Work (and each - Contributor provides its Contributions) on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or - implied, including, without limitation, any warranties or conditions - of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A - PARTICULAR PURPOSE. You are solely responsible for determining the - appropriateness of using or redistributing the Work and assume any - risks associated with Your exercise of permissions under this License. - - 8. Limitation of Liability. In no event and under no legal theory, - whether in tort (including negligence), contract, or otherwise, - unless required by applicable law (such as deliberate and grossly - negligent acts) or agreed to in writing, shall any Contributor be - liable to You for damages, including any direct, indirect, special, - incidental, or consequential damages of any character arising as a - result of this License or out of the use or inability to use the - Work (including but not limited to damages for loss of goodwill, - work stoppage, computer failure or malfunction, or any and all - other commercial damages or losses), even if such Contributor - has been advised of the possibility of such damages. - - 9. Accepting Warranty or Additional Liability. While redistributing - the Work or Derivative Works thereof, You may choose to offer, - and charge a fee for, acceptance of support, warranty, indemnity, - or other liability obligations and/or rights consistent with this - License. However, in accepting such obligations, You may act only - on Your own behalf and on Your sole responsibility, not on behalf - of any other Contributor, and only if You agree to indemnify, - defend, and hold each Contributor harmless for any liability - incurred by, or claims asserted against, such Contributor by reason - of your accepting any such warranty or additional liability. - - END OF TERMS AND CONDITIONS - - APPENDIX: How to apply the Apache License to your work. - - To apply the Apache License to your work, attach the following - boilerplate notice, with the fields enclosed by brackets "{}" - replaced with your own identifying information. (Don't include - the brackets!) The text should be enclosed in the appropriate - comment syntax for the file format. We also recommend that a - file or class name and description of purpose be included on the - same "printed page" as the copyright notice for easier - identification within third-party archives. - - Copyright {yyyy} {name of copyright owner} - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. - -CONTRIBUTORS - -TVM Contributors -================ -TVM adopts the Apache style model and governs by merit. We believe that it is important to create an inclusive community where everyone can use, -contribute to, and influence the direction of the project. We actively invite contributors who have earned the merit to be part of the development community. - -See the [community structure document](http://docs.tvm.ai/contribute/community.html) for the explanation of community structure and contribution guidelines. - -## Committers -- [Tianqi Chen](https://github.com/tqchen) (PMC) -- [Thierry Moreau](http://homes.cs.washington.edu/~moreau/) -- [Ziheng Jiang](https://github.com/ZihengJiang) -- [Haichen Shen](http://homes.cs.washington.edu/~haichen/) -- [Yizhi Liu](https://github.com/yzhliu) - -## Code Owners -- [Aditya Atluri](https://github.com/adityaatluri) ROCM -- [Leyuan Wang](https://github.com/Laurawly) TOPI -- [Yuwei Hu](https://github.com/Huyuwei) TOPI -- [Zhixun Tan](https://github.com/phisiart) OpenGL/WebGL backend -- [Nick Hynes](https://github.com/nhynes) SGX and secured computing -- [Lianmin Zheng](https://github.com/merrymercy) AutoTVM - -## Reviewers -- [Zhi Chen](https://github.com/zhiics) -- [Xiaoqiang Dan](https://github.com/xqdan) -- [Liangfu Chen](https://github.com/liangfu) -- [Masahiro Masuda](https://github.com/masahi) -- [Kazutaka Morita](https://github.com/kazum) -- [Tatsuya Nishiyama](https://github.com/nishi-t) -- [Pariksheet Pinjari](https://github.com/PariksheetPinjari909) -- [Jared Roesch](https://github.com/jroesch) -- [Siva](https://github.com/srkreddy1238) -- [Siju Samuel](https://github.com/siju-samuel) -- [Alex Weaver](https://github.com/alex-weaver) -- [Yao Wang](https://github.com/kevinthesun) -- [Jian Weng](https://github.com/were) -- [Eddie Yan](https://github.com/eqy) -- [Joshua Z. Zhang](https://github.com/zhreshold) - -## List of Contributors -- [Full List of Contributors](https://github.com/dmlc/tvm/graphs/contributors) - - To contributors: please add your name to the list. -- [Qiao Zhang](https://github.com/zhangqiaorjc) -- [Haolong Zhang](https://github.com/haolongzhangm) -- [Cody Hao Yu](https://github.com/comaniac) -- [Chris Nuernberger](https://github.com/cnuernber) - -_____ - FreeBSD: getopt.c file Copyright (c) 1987, 1993, 1994 diff --git a/cgmanifests/cgmanifest.json b/cgmanifests/cgmanifest.json index 1432193ac9080..46349f43923e2 100644 --- a/cgmanifests/cgmanifest.json +++ b/cgmanifests/cgmanifest.json @@ -1,578 +1,508 @@ { - "$schema": "https://json.schemastore.org/component-detection-manifest.json", - "Registrations": [ - { - "component": { - "type": "git", - "git": { - "commitHash": "215105818dfde3174fe799600bb0f3cae233d0bf", - "repositoryUrl": "https://github.com/abseil/abseil-cpp.git" - } - } - }, - { - "component": { - "Type": "maven", - "maven": { - "GroupId": "org.junit.platform", - "ArtifactId": "junit-platform-console-standalone", - "Version": "1.6.2" - }, - "DevelopmentDependency": true - } - }, - { - "component": { - "Type": "maven", - "maven": { - "GroupId": "com.google.protobuf", - "ArtifactId": "protobuf-java", - "Version": "3.21.7" - }, - "DevelopmentDependency": true - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "2379917985919ed3918dc12cad47f469f245be7a", - "repositoryUrl": "https://github.com/apache/tvm.git" - }, - "comments": "needed for TVM EP" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "cabe04d6d6b05356fa8f9741704924788f0dd762", - "repositoryUrl": "https://github.com/agauniyal/rang.git" - }, - "comments": "dependency from tvm" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "a3bcc6981d5dad3afb212689e2c7853d1b1ee45d", - "repositoryUrl": "https://github.com/NVIDIA/cutlass.git" - }, - "comments": "dependency from tvm" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "08f7c7e69f8ea61a0c4151359bc8023be8e9217b", - "repositoryUrl": "https://github.com/tlc-pack/libbacktrace.git" - }, - "comments": "dependency from tvm" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "36a91576edf633479c78649e050f18dd2ddc8103", - "repositoryUrl": "https://github.com/apache/incubator-tvm-vta.git" - }, - "comments": "dependency from tvm" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "111c9be5188f7350c2eac9ddaedd8cca3d7bf394", - "repositoryUrl": "https://github.com/kazuho/picojson.git" - }, - "comments": "dependency from tvm" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "b5e4186d7ab63458e79084842dced166be2ca5b5", - "repositoryUrl": "https://github.com/lammertb/libcrc.git" - }, - "comments": "dependency from tvm" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "e4a4c02764d37c9c3db0d64c4996651a3ef9513c", - "repositoryUrl": "https://github.com/dmlc/HalideIR.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3", - "repositoryUrl": "https://github.com/dmlc/dlpack.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8", - "repositoryUrl": "https://github.com/dmlc/dmlc-core.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "7de7e5d02bf687f971e7668963649728356e0c20", - "repositoryUrl": "https://github.com/intel/mkl-dnn.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "d860915b0198ddb96f93e9e97a789af156544dc6", - "repositoryUrl": "https://github.com/tensorflow/tensorflow.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "eddf9023206dc40974c26f589ee2ad63a4227a1e", - "repositoryUrl": "https://github.com/glennrp/libpng.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "217f52fb121ef92491e5d5f71394b07ce4ead1d0", - "repositoryUrl": "https://github.com/KjellKod/g3log.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "50893291621658f355bc5b4d450a8d06a563053d", - "repositoryUrl": "https://github.com/madler/zlib.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "d264a2603493fecda607c1d1cda87fedba77d36b", - "repositoryUrl": "https://github.com/Microsoft/CNTK.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "971e2e89d08deeae0139d3011d15646fdac13c92", - "repositoryUrl": "https://github.com/numpy/numpy.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "90537289a04ef5d572496240e2ac3a881be518d2", - "repositoryUrl": "https://github.com/pytorch/pytorch.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "b31f58de6fa8bbda5353b3c77d9be4914399724d", - "repositoryUrl": "https://github.com/pytorch/pytorch.git" - }, - "comments": "pytorch 1.6 used by onnxruntime training image" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "7389dbac82d362f296dc2746f10e43ffa1615660", - "repositoryUrl": "https://github.com/scikit-learn/scikit-learn.git" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "eeebdab16155d34ff8f5f42137da7df4d1c7eab0", - "repositoryUrl": "https://github.com/BVLC/caffe.git" - } - } - }, - { - "component": { - "Type": "other", - "Other": { - "Name": "LLVM", - "Version": "9.0.0", - "DownloadUrl": "https://releases.llvm.org/9.0.0/llvm-9.0.0.src.tar.xz" - } - } - }, - { - "component": { - "Type": "other", - "Other": { - "Name": "FreeBSD GetOpt", - "Version": "12.0.0", - "DownloadUrl": "https://svnweb.freebsd.org/base/release/12.0.0/lib/libc/stdlib/getopt.c?revision=341707&view=co" - } - } - }, - { - "component": { - "Type": "other", - "Other": { - "Name": "Boost", - "Version": "1.69.0", - "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2" - } - } - }, - { - "component": { - "git": { - "commitHash": "02a2a458ac15912d7d87cc1171e811b0c5219ece", - "repositoryUrl": "https://github.com/grpc/grpc" - }, - "type": "git" - } - }, - { - "component": { - "git": { - "commitHash": "b29b21a81b32ec273f118f589f46d56ad3332420", - "repositoryUrl": "https://github.com/google/boringssl.git" - }, - "type": "git" - } - }, - { - "component": { - "git": { - "commitHash": "3be1924221e1326df520f8498d704a5c4c8d0cce", - "repositoryUrl": "https://github.com/c-ares/c-ares.git" - }, - "type": "git" - } - }, - { - "component": { - "git": { - "commitHash": "6599cac0965be8e5a835ab7a5684bbef033d5ad0", - "repositoryUrl": "https://github.com/llvm-mirror/libcxx.git" - }, - "type": "git" - } - }, - { - "component": { - "git": { - "commitHash": "9245d481eb3e890f708ff2d7dadf2a10c04748ba", - "repositoryUrl": "https://github.com/llvm-mirror/libcxxabi.git" - }, - "type": "git" - } - }, - { - "component": { - "git": { - "commitHash": "9ce4a77f61c134bbed28bfd5be5cd7dc0e80f5e3", - "repositoryUrl": "https://github.com/google/upb.git" - }, - "type": "git" - } - }, - { - "component": { - "type": "other", - "Other": { - "Name": "Go", - "Version": "1.12.6", - "DownloadUrl": "https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz" - } - } - }, - { - "component": { - "Type": "other", - "Other": { - "Name": "OpenMPI", - "Version": "4.0.0", - "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz" - } - } - }, - { - "component": { - "Type": "other", - "Other": { - "Name": "OpenMPI", - "Version": "4.0.4", - "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz" - }, - "comments": "openmpi 4.0.4 used by onnxruntime training image" - } - }, - { - "component": { - "Type": "git", - "git": { - "commitHash": "7db3f9c741d3dfd8dda14ffb537ed251280d2025", - "repositoryUrl": "https://github.com/mpi4py/mpi4py" - }, - "comments": "mpi4py 3.0.3 used by onnxruntime training image" - } - }, - { - "component": { - "Type": "other", - "Other": { - "Name": "NCCL", - "Version": "2.4.8", - "DownloadUrl": "https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "67afac65ce64fd4dce1494f43e565e8fe34bdffb", - "repositoryUrl": "https://android.googlesource.com/platform/frameworks/ml" - }, - "comments": "used by onnxruntime" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "c30b7da2301202da5f9f0529966944f110e5d6e7", - "repositoryUrl": "https://github.com/openucx/ucx" - }, - "comments": "middleware between IB verbs and OpenMPI used by onnxruntime training image" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "63d1e08e64e7e09408eb63cd8dd7c65ad766f277", - "repositoryUrl": "https://github.com/nodejs/node" - }, - "comments": "For Nodejs binding" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "aead4d751c2101e23336aa73f2380df83e7a13f3", - "repositoryUrl": "https://github.com/pypa/manylinux" - }, - "comments": "For building our CI build docker image" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "c974557598645360fbabac71352b083117e3cc17", - "repositoryUrl": "https://gitlab.kitware.com/cmake/cmake" - }, - "comments": "CMake 3.24.3. For building our CI build docker image" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "1e5d33e9b9b8631b36f061103a30208b206fd03a", - "repositoryUrl": "https://github.com/python/cpython" - }, - "comments": "Python 3.9.1" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "6503f05dd59e26a9986bdea097b3da9b3546f45b", - "repositoryUrl": "https://github.com/python/cpython" - }, - "comments": "Python 3.8.7" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "13c94747c74437e594b7fc242ff7da668e81887c", - "repositoryUrl": "https://github.com/python/cpython" - }, - "comments": "Python 3.7.9" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "c0a9afe2ac1820409e6173bd1893ebee2cf50270", - "repositoryUrl": "https://github.com/python/cpython" - }, - "comments": "Python 3.6.12" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "426b022776672fdf3d71ddd98d89af341c88080f", - "repositoryUrl": "https://github.com/python/cpython" - }, - "comments": "Python 3.5.10" - } - }, - { - "component": { - "type": "pip", - "pip": { - "Name": "transformers", - "Version": "4.38.0" - }, - "comments": "Installed in the training docker image" - } - }, - { - "component": { - "type": "pip", - "pip": { - "Name": "msgpack", - "Version": "1.0.0" - }, - "comments": "Installed in the training docker image" - } - }, - { - "component": { - "type": "pip", - "pip": { - "Name": "tensorboardX", - "Version": "1.8" - }, - "comments": "Installed in the training docker image" - } - }, - { - "component": { - "type": "pip", - "pip": { - "Name": "tensorboard", - "Version": "2.3.0" - }, - "comments": "Installed in the training docker image" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "92cf3702fcfaadc84eb7bef59825a23e0cd84f56", - "repositoryUrl": "https://github.com/aappleby/smhasher" - }, - "comments": "MurmurHash3" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "b89da3c5a0aa18fb2c6163ad9984f81ab65b22e3", - "repositoryUrl": "https://github.com/mestevens/gtest-ios-framework" - }, - "comments": "gtest-ios-framework" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "277508879878e0a5b5b43599b1bea11f66eb3c6c", - "repositoryUrl": "https://github.com/dmlc/dlpack.git" - }, - "comments": "dlpack" - } - }, - { - "component": { - "Type": "other", - "Other": { - "Name": "SQLite3", - "Version": "3.22.0", - "DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb" - } - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "9d0ef119d9fcb9139f831adc224857b791c81140", - "repositoryUrl": "https://github.com/dlfcn-win32/dlfcn-win32.git" - }, - "comments": "dlfcn-win32" - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "6812205f18ca4ef54372e87e1a13ce4a859434df", - "repositoryUrl": "https://github.com/python-pillow/Pillow.git" - }, - "comments": "python-pillow. Implementation logic for anti-aliasing copied by Resize CPU kernel." - } - }, - { - "component": { - "type": "git", - "git": { - "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac", - "repositoryUrl": "https://gitlab.com/libeigen/eigen.git" - } - } - } - ], - "Version": 1 + "$schema": "https://json.schemastore.org/component-detection-manifest.json", + "Registrations": [ + { + "component": { + "type": "git", + "git": { + "commitHash": "215105818dfde3174fe799600bb0f3cae233d0bf", + "repositoryUrl": "https://github.com/abseil/abseil-cpp.git" + } + } + }, + { + "component": { + "Type": "maven", + "maven": { + "GroupId": "org.junit.platform", + "ArtifactId": "junit-platform-console-standalone", + "Version": "1.6.2" + }, + "DevelopmentDependency": true + } + }, + { + "component": { + "Type": "maven", + "maven": { + "GroupId": "com.google.protobuf", + "ArtifactId": "protobuf-java", + "Version": "3.21.7" + }, + "DevelopmentDependency": true + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "e4a4c02764d37c9c3db0d64c4996651a3ef9513c", + "repositoryUrl": "https://github.com/dmlc/HalideIR.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "bee4d1dd8dc1ee4a1fd8fa6a96476c2f8b7492a3", + "repositoryUrl": "https://github.com/dmlc/dlpack.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "4d49691f1a9d944c3b0aa5e63f1db3cad1f941f8", + "repositoryUrl": "https://github.com/dmlc/dmlc-core.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "7de7e5d02bf687f971e7668963649728356e0c20", + "repositoryUrl": "https://github.com/intel/mkl-dnn.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "d860915b0198ddb96f93e9e97a789af156544dc6", + "repositoryUrl": "https://github.com/tensorflow/tensorflow.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "eddf9023206dc40974c26f589ee2ad63a4227a1e", + "repositoryUrl": "https://github.com/glennrp/libpng.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "217f52fb121ef92491e5d5f71394b07ce4ead1d0", + "repositoryUrl": "https://github.com/KjellKod/g3log.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "50893291621658f355bc5b4d450a8d06a563053d", + "repositoryUrl": "https://github.com/madler/zlib.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "d264a2603493fecda607c1d1cda87fedba77d36b", + "repositoryUrl": "https://github.com/Microsoft/CNTK.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "971e2e89d08deeae0139d3011d15646fdac13c92", + "repositoryUrl": "https://github.com/numpy/numpy.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "90537289a04ef5d572496240e2ac3a881be518d2", + "repositoryUrl": "https://github.com/pytorch/pytorch.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "b31f58de6fa8bbda5353b3c77d9be4914399724d", + "repositoryUrl": "https://github.com/pytorch/pytorch.git" + }, + "comments": "pytorch 1.6 used by onnxruntime training image" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "7389dbac82d362f296dc2746f10e43ffa1615660", + "repositoryUrl": "https://github.com/scikit-learn/scikit-learn.git" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "eeebdab16155d34ff8f5f42137da7df4d1c7eab0", + "repositoryUrl": "https://github.com/BVLC/caffe.git" + } + } + }, + { + "component": { + "Type": "other", + "Other": { + "Name": "LLVM", + "Version": "9.0.0", + "DownloadUrl": "https://releases.llvm.org/9.0.0/llvm-9.0.0.src.tar.xz" + } + } + }, + { + "component": { + "Type": "other", + "Other": { + "Name": "FreeBSD GetOpt", + "Version": "12.0.0", + "DownloadUrl": "https://svnweb.freebsd.org/base/release/12.0.0/lib/libc/stdlib/getopt.c?revision=341707&view=co" + } + } + }, + { + "component": { + "Type": "other", + "Other": { + "Name": "Boost", + "Version": "1.69.0", + "DownloadUrl": "https://boostorg.jfrog.io/artifactory/main/release/1.69.0/source/boost_1_69_0.tar.bz2" + } + } + }, + { + "component": { + "git": { + "commitHash": "02a2a458ac15912d7d87cc1171e811b0c5219ece", + "repositoryUrl": "https://github.com/grpc/grpc" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "b29b21a81b32ec273f118f589f46d56ad3332420", + "repositoryUrl": "https://github.com/google/boringssl.git" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "3be1924221e1326df520f8498d704a5c4c8d0cce", + "repositoryUrl": "https://github.com/c-ares/c-ares.git" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "6599cac0965be8e5a835ab7a5684bbef033d5ad0", + "repositoryUrl": "https://github.com/llvm-mirror/libcxx.git" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "9245d481eb3e890f708ff2d7dadf2a10c04748ba", + "repositoryUrl": "https://github.com/llvm-mirror/libcxxabi.git" + }, + "type": "git" + } + }, + { + "component": { + "git": { + "commitHash": "9ce4a77f61c134bbed28bfd5be5cd7dc0e80f5e3", + "repositoryUrl": "https://github.com/google/upb.git" + }, + "type": "git" + } + }, + { + "component": { + "type": "other", + "Other": { + "Name": "Go", + "Version": "1.12.6", + "DownloadUrl": "https://dl.google.com/go/go1.12.6.linux-amd64.tar.gz" + } + } + }, + { + "component": { + "Type": "other", + "Other": { + "Name": "OpenMPI", + "Version": "4.0.0", + "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.0.tar.gz" + } + } + }, + { + "component": { + "Type": "other", + "Other": { + "Name": "OpenMPI", + "Version": "4.0.4", + "DownloadUrl": "https://download.open-mpi.org/release/open-mpi/v4.0/openmpi-4.0.4.tar.gz" + }, + "comments": "openmpi 4.0.4 used by onnxruntime training image" + } + }, + { + "component": { + "Type": "git", + "git": { + "commitHash": "7db3f9c741d3dfd8dda14ffb537ed251280d2025", + "repositoryUrl": "https://github.com/mpi4py/mpi4py" + }, + "comments": "mpi4py 3.0.3 used by onnxruntime training image" + } + }, + { + "component": { + "Type": "other", + "Other": { + "Name": "NCCL", + "Version": "2.4.8", + "DownloadUrl": "https://docs.nvidia.com/deeplearning/sdk/nccl-install-guide/index.html" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "67afac65ce64fd4dce1494f43e565e8fe34bdffb", + "repositoryUrl": "https://android.googlesource.com/platform/frameworks/ml" + }, + "comments": "used by onnxruntime" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "c30b7da2301202da5f9f0529966944f110e5d6e7", + "repositoryUrl": "https://github.com/openucx/ucx" + }, + "comments": "middleware between IB verbs and OpenMPI used by onnxruntime training image" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "63d1e08e64e7e09408eb63cd8dd7c65ad766f277", + "repositoryUrl": "https://github.com/nodejs/node" + }, + "comments": "For Nodejs binding" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "aead4d751c2101e23336aa73f2380df83e7a13f3", + "repositoryUrl": "https://github.com/pypa/manylinux" + }, + "comments": "For building our CI build docker image" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "c974557598645360fbabac71352b083117e3cc17", + "repositoryUrl": "https://gitlab.kitware.com/cmake/cmake" + }, + "comments": "CMake 3.24.3. For building our CI build docker image" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "1e5d33e9b9b8631b36f061103a30208b206fd03a", + "repositoryUrl": "https://github.com/python/cpython" + }, + "comments": "Python 3.9.1" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "6503f05dd59e26a9986bdea097b3da9b3546f45b", + "repositoryUrl": "https://github.com/python/cpython" + }, + "comments": "Python 3.8.7" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "13c94747c74437e594b7fc242ff7da668e81887c", + "repositoryUrl": "https://github.com/python/cpython" + }, + "comments": "Python 3.7.9" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "c0a9afe2ac1820409e6173bd1893ebee2cf50270", + "repositoryUrl": "https://github.com/python/cpython" + }, + "comments": "Python 3.6.12" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "426b022776672fdf3d71ddd98d89af341c88080f", + "repositoryUrl": "https://github.com/python/cpython" + }, + "comments": "Python 3.5.10" + } + }, + { + "component": { + "type": "pip", + "pip": { + "Name": "transformers", + "Version": "4.38.0" + }, + "comments": "Installed in the training docker image" + } + }, + { + "component": { + "type": "pip", + "pip": { + "Name": "msgpack", + "Version": "1.0.0" + }, + "comments": "Installed in the training docker image" + } + }, + { + "component": { + "type": "pip", + "pip": { + "Name": "tensorboardX", + "Version": "1.8" + }, + "comments": "Installed in the training docker image" + } + }, + { + "component": { + "type": "pip", + "pip": { + "Name": "tensorboard", + "Version": "2.3.0" + }, + "comments": "Installed in the training docker image" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "92cf3702fcfaadc84eb7bef59825a23e0cd84f56", + "repositoryUrl": "https://github.com/aappleby/smhasher" + }, + "comments": "MurmurHash3" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "b89da3c5a0aa18fb2c6163ad9984f81ab65b22e3", + "repositoryUrl": "https://github.com/mestevens/gtest-ios-framework" + }, + "comments": "gtest-ios-framework" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "277508879878e0a5b5b43599b1bea11f66eb3c6c", + "repositoryUrl": "https://github.com/dmlc/dlpack.git" + }, + "comments": "dlpack" + } + }, + { + "component": { + "Type": "other", + "Other": { + "Name": "SQLite3", + "Version": "3.22.0", + "DownloadUrl": "http://security.ubuntu.com/ubuntu/pool/main/s/sqlite3/libsqlite3-dev_3.22.0-1ubuntu0.4_amd64.deb" + } + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "9d0ef119d9fcb9139f831adc224857b791c81140", + "repositoryUrl": "https://github.com/dlfcn-win32/dlfcn-win32.git" + }, + "comments": "dlfcn-win32" + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "6812205f18ca4ef54372e87e1a13ce4a859434df", + "repositoryUrl": "https://github.com/python-pillow/Pillow.git" + }, + "comments": "python-pillow. Implementation logic for anti-aliasing copied by Resize CPU kernel." + } + }, + { + "component": { + "type": "git", + "git": { + "commitHash": "e7248b26a1ed53fa030c5c459f7ea095dfd276ac", + "repositoryUrl": "https://gitlab.com/libeigen/eigen.git" + } + } + } + ], + "Version": 1 } diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 31ebf58b03152..af341aaead2d5 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -102,7 +102,6 @@ option(onnxruntime_BUILD_CSHARP "Build C# library" OFF) option(onnxruntime_BUILD_OBJC "Build Objective-C library" OFF) option(onnxruntime_USE_PREINSTALLED_EIGEN "Use pre-installed EIGEN. Need to provide eigen_SOURCE_PATH if turn this on." OFF) option(onnxruntime_BUILD_BENCHMARKS "Build ONNXRuntime micro-benchmarks" OFF) -option(onnxruntime_USE_LLVM "Build TVM with LLVM" OFF) option(onnxruntime_USE_VSINPU "Build with VSINPU support" OFF) cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF) @@ -145,10 +144,6 @@ option(onnxruntime_USE_TELEMETRY "Build with Telemetry" OFF) cmake_dependent_option(onnxruntime_USE_MIMALLOC "Override new/delete and arena allocator with mimalloc" OFF "WIN32;NOT onnxruntime_USE_CUDA;NOT onnxruntime_USE_OPENVINO" OFF) option(onnxruntime_USE_CANN "Build with CANN support" OFF) option(onnxruntime_USE_ROCM "Build with AMD GPU support" OFF) -option(onnxruntime_USE_TVM "Build with TVM support" OFF) -option(onnxruntime_TVM_CUDA_RUNTIME "Build TVM with CUDA support" OFF) -option(onnxruntime_TVM_USE_LLVM "Build TVM with LLVM. Set customized path to llvm-config.exe here if need" OFF) -option(onnxruntime_TVM_USE_HASH "Build ipp-crypto library for support hash algorithm. It is defined for TVM only") option(onnxruntime_USE_XNNPACK "Build with XNNPACK support. Provides an alternative math library on ARM, WebAssembly and x86." OFF) option(onnxruntime_USE_WEBNN "Build with WebNN support. Enable hardware acceleration in web browsers." OFF) option(onnxruntime_USE_WEBGPU "Build with WebGPU support. Enable WebGPU via C/C++ interface." OFF) @@ -906,11 +901,6 @@ if (onnxruntime_USE_SNPE) list(APPEND ONNXRUNTIME_PROVIDER_NAMES snpe) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_SNPE=1) endif() -if (onnxruntime_USE_TVM) - list(APPEND ORT_PROVIDER_FLAGS -DUSE_TVM=1) - list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_TVM=1) - list(APPEND ONNXRUNTIME_PROVIDER_NAMES tvm) -endif() if (onnxruntime_USE_WINML) list(APPEND ORT_PROVIDER_FLAGS -DUSE_WINML=1) list(APPEND ORT_PROVIDER_CMAKE_FLAGS -Donnxruntime_USE_WINML=1) @@ -1313,50 +1303,6 @@ if (onnxruntime_USE_DNNL) add_compile_definitions(DNNL_OPENMP) endif() -# TVM EP -if (onnxruntime_USE_TVM) - if (NOT TARGET tvm) - message(STATUS "Include TVM(*).") - include(tvm) - endif() - - # ipp-crypto - if (onnxruntime_TVM_USE_HASH) - message(STATUS "Include ipp-crypto(*).") - include(ipp-crypto) - endif() - - # TVM - if (onnxruntime_TVM_USE_LLVM) - set(USE_LLVM "${onnxruntime_TVM_USE_LLVM}" CACHE STRING "Path to LLVM for correct TVM build") - elseif(onnxruntime_USE_LLVM) - set(USE_LLVM ON CACHE BOOL "Only defined for TVM") - endif() - - if (onnxruntime_TVM_CUDA_RUNTIME) - set(USE_CUDA ON CACHE BOOL "Only defined for TVM" FORCE) - endif() - - # TODO(vvchernov): customized tvm logger is hidden due to the issue on TVM side (https://github.com/apache/tvm/issues/10139) - # add_compile_definitions(TVM_LOG_CUSTOMIZE=1) - # add_library(tvm_custom_logger STATIC ${ONNXRUNTIME_ROOT}/core/providers/tvm/custom_logging.cc) - - set(USE_OPENMP gnu CACHE STRING "Only defined for TVM") - add_subdirectory(${tvm_SOURCE_DIR} ${tvm_BINARY_DIR} EXCLUDE_FROM_ALL) - - set_target_properties(tvm PROPERTIES FOLDER ${tvm_SOURCE_DIR}) - # target_link_libraries(tvm PUBLIC tvm_custom_logger) - - set(TVM_INCLUDES ${tvm_SOURCE_DIR}/include - ${tvm_SOURCE_DIR}/3rdparty/dmlc-core/include - ${tvm_SOURCE_DIR}/3rdparty/dlpack/include - $) - - set(onnxruntime_tvm_libs onnxruntime_providers_tvm) - list(APPEND onnxruntime_EXTERNAL_LIBRARIES tvm) - list(APPEND onnxruntime_EXTERNAL_DEPENDENCIES tvm) -endif() - # onnxruntime-extensions if (onnxruntime_USE_EXTENSIONS) include(extensions) diff --git a/cmake/external/tvm.cmake b/cmake/external/tvm.cmake deleted file mode 100644 index 93049c8b85853..0000000000000 --- a/cmake/external/tvm.cmake +++ /dev/null @@ -1,24 +0,0 @@ -if (onnxruntime_USE_TVM) - message(STATUS "onnxruntime_USE_TVM: Fetch tvm for TVM EP") - - FetchContent_Declare( - tvm - GIT_REPOSITORY https://github.com/apache/tvm.git - GIT_TAG 2379917985919ed3918dc12cad47f469f245be7a - ) - - FetchContent_GetProperties(tvm) - if(NOT tvm_POPULATED) - FetchContent_Populate(tvm) - if (WIN32) - execute_process( - COMMAND ${CMAKE_COMMAND} -E create_symlink ${tvm_BINARY_DIR}/${CMAKE_BUILD_TYPE} ${tvm_SOURCE_DIR}/build - ) - else() - file(CREATE_LINK ${tvm_BINARY_DIR} ${tvm_SOURCE_DIR}/build SYMBOLIC) - endif() - endif() - - set(tvm_INCLUDE_DIRS ${tvm_SOURCE_DIR}/include) - -endif() diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 003f7ad18286b..732c0511d400f 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -210,7 +210,6 @@ set(onnxruntime_INTERNAL_LIBRARIES ${PROVIDERS_NNAPI} ${PROVIDERS_QNN} ${PROVIDERS_SNPE} - ${PROVIDERS_TVM} ${PROVIDERS_RKNPU} ${PROVIDERS_VSINPU} ${PROVIDERS_XNNPACK} @@ -221,7 +220,6 @@ set(onnxruntime_INTERNAL_LIBRARIES ${onnxruntime_winml} onnxruntime_optimizer onnxruntime_providers - ${onnxruntime_tvm_libs} onnxruntime_lora onnxruntime_framework onnxruntime_graph diff --git a/cmake/onnxruntime_codegen_tvm.cmake b/cmake/onnxruntime_codegen_tvm.cmake deleted file mode 100644 index 7b50d8f8603ae..0000000000000 --- a/cmake/onnxruntime_codegen_tvm.cmake +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - -file(GLOB_RECURSE onnxruntime_codegen_common_srcs - "${ONNXRUNTIME_ROOT}/core/codegen/common/*.h" - "${ONNXRUNTIME_ROOT}/core/codegen/common/*.cc" -) - -file(GLOB_RECURSE onnxruntime_codegen_tvm_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.h" - "${ONNXRUNTIME_ROOT}/core/codegen/mti/*.cc" - "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.h" - "${ONNXRUNTIME_ROOT}/core/codegen/passes/*.cc" -) - -source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs}) - -#onnxruntime_codegen_tvm depends on onnxruntime framework -onnxruntime_add_static_library(onnxruntime_codegen_tvm ${onnxruntime_codegen_common_srcs} ${onnxruntime_codegen_tvm_srcs}) -set_target_properties(onnxruntime_codegen_tvm PROPERTIES FOLDER "ONNXRuntime") -target_include_directories(onnxruntime_codegen_tvm PRIVATE ${ONNXRUNTIME_ROOT} ${TVM_INCLUDES} ${MKLML_INCLUDE_DIR} ${eigen_INCLUDE_DIRS}) -onnxruntime_add_include_to_target(onnxruntime_codegen_tvm onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers safeint_interface Boost::mp11) -target_compile_options(onnxruntime_codegen_tvm PRIVATE ${DISABLED_WARNINGS_FOR_TVM}) -# need onnx to build to create headers that this project includes -add_dependencies(onnxruntime_codegen_tvm ${onnxruntime_EXTERNAL_DEPENDENCIES}) diff --git a/cmake/onnxruntime_csharp.cmake b/cmake/onnxruntime_csharp.cmake index 22c993d07f7f9..39533429e181c 100644 --- a/cmake/onnxruntime_csharp.cmake +++ b/cmake/onnxruntime_csharp.cmake @@ -30,10 +30,6 @@ if (onnxruntime_USE_NNAPI_BUILTIN) STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_NNAPI;") endif() -if (onnxruntime_USE_TVM) - STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_TVM,") -endif() - if (onnxruntime_USE_OPENVINO) STRING(APPEND CSHARP_PREPROCESSOR_DEFINES "USE_OPENVINO;") endif() diff --git a/cmake/onnxruntime_providers.cmake b/cmake/onnxruntime_providers.cmake index 9666877cdc206..582491de9503d 100644 --- a/cmake/onnxruntime_providers.cmake +++ b/cmake/onnxruntime_providers.cmake @@ -101,9 +101,6 @@ endif() if(onnxruntime_USE_ROCM) set(PROVIDERS_ROCM onnxruntime_providers_rocm) endif() -if (onnxruntime_USE_TVM) - set(PROVIDERS_TVM onnxruntime_providers_tvm) -endif() if (onnxruntime_USE_XNNPACK) set(PROVIDERS_XNNPACK onnxruntime_providers_xnnpack) endif() @@ -194,10 +191,6 @@ if (onnxruntime_USE_ROCM) include(onnxruntime_providers_rocm.cmake) endif() -if (onnxruntime_USE_TVM) - include(onnxruntime_providers_tvm.cmake) -endif() - if (onnxruntime_USE_VSINPU) include(onnxruntime_providers_vsinpu.cmake) endif() diff --git a/cmake/onnxruntime_providers_cuda.cmake b/cmake/onnxruntime_providers_cuda.cmake index 39ad530146b33..4f86717026118 100644 --- a/cmake/onnxruntime_providers_cuda.cmake +++ b/cmake/onnxruntime_providers_cuda.cmake @@ -224,8 +224,7 @@ include(cutlass) target_include_directories(${target} PRIVATE ${cutlass_SOURCE_DIR}/include ${cutlass_SOURCE_DIR}/examples ${cutlass_SOURCE_DIR}/tools/util/include) - target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} ${TVM_INCLUDES} - PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) + target_include_directories(${target} PRIVATE ${ONNXRUNTIME_ROOT} ${CMAKE_CURRENT_BINARY_DIR} ${eigen_INCLUDE_DIRS} PUBLIC ${CUDAToolkit_INCLUDE_DIRS}) # ${CMAKE_CURRENT_BINARY_DIR} is so that #include "onnxruntime_config.h" inside tensor_shape.h is found set_target_properties(${target} PROPERTIES LINKER_LANGUAGE CUDA) set_target_properties(${target} PROPERTIES FOLDER "ONNXRuntime") diff --git a/cmake/onnxruntime_providers_tvm.cmake b/cmake/onnxruntime_providers_tvm.cmake deleted file mode 100644 index 8fd50c70dd5d7..0000000000000 --- a/cmake/onnxruntime_providers_tvm.cmake +++ /dev/null @@ -1,64 +0,0 @@ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. - - add_definitions(-DUSE_TVM=1) - if (onnxruntime_TVM_USE_HASH) - add_definitions(-DUSE_TVM_HASH=1) - endif() - - if (onnxruntime_TVM_USE_HASH) - file (GLOB_RECURSE onnxruntime_providers_tvm_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.cc" - ) - else() - file (GLOB onnxruntime_providers_tvm_cc_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.h" - "${ONNXRUNTIME_ROOT}/core/providers/tvm/*.cc" - ) - endif() - - source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_tvm_cc_srcs}) - onnxruntime_add_static_library(onnxruntime_providers_tvm ${onnxruntime_providers_tvm_cc_srcs}) - - if ( CMAKE_COMPILER_IS_GNUCC ) - target_compile_options(onnxruntime_providers_tvm PRIVATE -Wno-unused-parameter -Wno-missing-field-initializers) - endif() - - target_include_directories(onnxruntime_providers_tvm PRIVATE - ${TVM_INCLUDES} - ${PYTHON_INCLUDE_DIRS}) - onnxruntime_add_include_to_target(onnxruntime_providers_tvm onnxruntime_common onnxruntime_framework onnx onnx_proto ${PROTOBUF_LIB} flatbuffers::flatbuffers Boost::mp11 safeint_interface) - - add_dependencies(onnxruntime_providers_tvm ${onnxruntime_EXTERNAL_DEPENDENCIES}) - - if (onnxruntime_TVM_USE_HASH) - add_dependencies(onnxruntime_providers_tvm ippcp_s) - target_include_directories(onnxruntime_providers_tvm PRIVATE ${IPP_CRYPTO_INCLUDE_DIR}) - target_link_libraries(onnxruntime_providers_tvm PRIVATE ippcp_s) - endif() - - set_target_properties(onnxruntime_providers_tvm PROPERTIES FOLDER "ONNXRuntime") - set_target_properties(onnxruntime_providers_tvm PROPERTIES LINKER_LANGUAGE CXX) - - if (WIN32 AND MSVC) - # wd4100: identifier' : unreferenced formal parameter - # wd4127: conditional expression is constant - # wd4244: conversion from 'int' to 'char', possible loss of data - # TODO: 4244 should not be disabled - target_compile_options(onnxruntime_providers_tvm PRIVATE "/wd4100" "/wd4127" "/wd4244") - else() - target_compile_options(onnxruntime_providers_tvm PRIVATE "-Wno-error=type-limits") - endif() - target_compile_definitions(onnxruntime_providers_tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=) - - install(FILES ${PROJECT_SOURCE_DIR}/../include/onnxruntime/core/providers/tvm/tvm_provider_factory.h - DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}/onnxruntime/) - - if (NOT onnxruntime_BUILD_SHARED_LIB) - install(TARGETS onnxruntime_providers_tvm - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} - FRAMEWORK DESTINATION ${CMAKE_INSTALL_BINDIR}) - endif() \ No newline at end of file diff --git a/cmake/onnxruntime_python.cmake b/cmake/onnxruntime_python.cmake index 7239b245a7245..d2c022e4e0269 100644 --- a/cmake/onnxruntime_python.cmake +++ b/cmake/onnxruntime_python.cmake @@ -169,7 +169,6 @@ endif() target_link_libraries(onnxruntime_pybind11_state PRIVATE onnxruntime_session ${onnxruntime_libs} - ${PROVIDERS_TVM} ${PROVIDERS_NNAPI} ${PROVIDERS_XNNPACK} ${PROVIDERS_COREML} @@ -184,7 +183,6 @@ target_link_libraries(onnxruntime_pybind11_state PRIVATE onnxruntime_optimizer onnxruntime_providers onnxruntime_util - ${onnxruntime_tvm_libs} onnxruntime_lora onnxruntime_framework onnxruntime_util @@ -965,37 +963,6 @@ if (onnxruntime_USE_ROCM) ) endif() -if (onnxruntime_USE_TVM) - file(GLOB onnxruntime_python_providers_tvm_srcs CONFIGURE_DEPENDS - "${ONNXRUNTIME_ROOT}/python/providers/tvm/*.py" - ) - add_custom_command( - TARGET onnxruntime_pybind11_state POST_BUILD - COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/providers - COMMAND ${CMAKE_COMMAND} -E make_directory $/onnxruntime/providers/tvm - COMMAND ${CMAKE_COMMAND} -E copy - ${onnxruntime_python_providers_tvm_srcs} - $/onnxruntime/providers/tvm - COMMAND ${CMAKE_COMMAND} -E copy - $ - $/onnxruntime/capi/ - ) - - add_custom_command( - TARGET onnxruntime_pybind11_state POST_BUILD - WORKING_DIRECTORY ${tvm_SOURCE_DIR}/python - COMMAND ${Python_EXECUTABLE} setup.py bdist_wheel - ) - - add_custom_command( - TARGET onnxruntime_pybind11_state POST_BUILD - COMMAND ${Python_EXECUTABLE} - $/onnxruntime/providers/tvm/extend_python_file.py - --target_file $/onnxruntime/capi/_ld_preload.py - ) - -endif() - if (onnxruntime_USE_DML) if (NOT onnxruntime_USE_CUSTOM_DIRECTML) set(dml_shared_lib_path ${DML_PACKAGE_DIR}/bin/${onnxruntime_target_platform}-win/${DML_SHARED_LIB}) diff --git a/cmake/onnxruntime_unittests.cmake b/cmake/onnxruntime_unittests.cmake index 3b73933f3ff1d..e822f0a3655fc 100644 --- a/cmake/onnxruntime_unittests.cmake +++ b/cmake/onnxruntime_unittests.cmake @@ -9,9 +9,6 @@ set(TEST_INC_DIR ${ONNXRUNTIME_ROOT}) if (onnxruntime_ENABLE_TRAINING) list(APPEND TEST_INC_DIR ${ORTTRAINING_ROOT}) endif() -if (onnxruntime_USE_TVM) - list(APPEND TEST_INC_DIR ${TVM_INCLUDES}) -endif() set(disabled_warnings) function(AddTest) @@ -114,7 +111,6 @@ function(AddTest) endif() target_compile_options(${_UT_TARGET} PRIVATE ${disabled_warnings}) else() - target_compile_options(${_UT_TARGET} PRIVATE ${DISABLED_WARNINGS_FOR_TVM}) target_compile_options(${_UT_TARGET} PRIVATE "$<$:SHELL:--compiler-options -Wno-error=sign-compare>" "$<$>:-Wno-error=sign-compare>") if (${HAS_NOERROR}) @@ -644,13 +640,11 @@ set(ONNXRUNTIME_TEST_LIBS ${PROVIDERS_ACL} ${PROVIDERS_ARMNN} ${PROVIDERS_COREML} - # ${PROVIDERS_TVM} ${PROVIDERS_XNNPACK} ${PROVIDERS_AZURE} onnxruntime_optimizer onnxruntime_providers onnxruntime_util - ${onnxruntime_tvm_libs} onnxruntime_lora onnxruntime_framework onnxruntime_util @@ -752,12 +746,6 @@ if(onnxruntime_USE_AZURE) list(APPEND onnxruntime_test_providers_libs onnxruntime_providers_azure) endif() -if(WIN32) - if (onnxruntime_USE_TVM) - list(APPEND disabled_warnings ${DISABLED_WARNINGS_FOR_TVM}) - endif() -endif() - file(GLOB onnxruntime_test_framework_src CONFIGURE_DEPENDS ${onnxruntime_test_framework_src_patterns} ) @@ -858,9 +846,6 @@ if (onnxruntime_ENABLE_TRAINING_APIS) list(APPEND all_tests ${onnxruntime_test_training_api_src}) endif() -if (onnxruntime_USE_TVM) - list(APPEND all_tests ${onnxruntime_test_tvm_src}) -endif() if (onnxruntime_USE_OPENVINO) list(APPEND all_tests ${onnxruntime_test_openvino_src}) @@ -1092,15 +1077,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) COMMAND ${CMAKE_COMMAND} -E copy ${DNNL_DLL_PATH} $ ) endif() - if(WIN32) - if (onnxruntime_USE_TVM) - add_custom_command( - TARGET ${test_data_target} POST_BUILD - COMMAND ${CMAKE_COMMAND} -E copy $ $ - ) - endif() - endif() - if(WIN32) set(wide_get_opt_src_dir ${TEST_SRC_DIR}/win_getopt/wide) onnxruntime_add_static_library(win_getopt_wide ${wide_get_opt_src_dir}/getopt.cc ${wide_get_opt_src_dir}/include/getopt.h) @@ -1142,12 +1118,6 @@ if (NOT IOS) endif() set_target_properties(onnx_test_runner PROPERTIES FOLDER "ONNXRuntimeTest") - if (onnxruntime_USE_TVM) - if (WIN32) - target_link_options(onnx_test_runner PRIVATE "/STACK:4000000") - endif() - endif() - install(TARGETS onnx_test_runner ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} @@ -1301,11 +1271,6 @@ if (NOT onnxruntime_ENABLE_TRAINING_TORCH_INTEROP) endif() set_target_properties(onnxruntime_perf_test PROPERTIES FOLDER "ONNXRuntimeTest") - if (onnxruntime_USE_TVM) - if (WIN32) - target_link_options(onnxruntime_perf_test PRIVATE "/STACK:4000000") - endif() - endif() endif() diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs index be157a0419fc0..d6c46833f1514 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/NativeMethods.shared.cs @@ -1142,9 +1142,6 @@ IntPtr[] outputValues /* An array of output value pointers. Array must be alloca [DllImport(NativeLib.DllName, CharSet = CharSet.Ansi)] public static extern IntPtr /*(OrtStatus*)*/ OrtSessionOptionsAppendExecutionProvider_MIGraphX(IntPtr /*(OrtSessionOptions*)*/ options, int device_id); - - [DllImport(NativeLib.DllName, CharSet = CharSet.Ansi)] - public static extern IntPtr /*(OrtStatus*)*/ OrtSessionOptionsAppendExecutionProvider_Tvm(IntPtr /*(OrtSessionOptions*) */ options, byte[] /*(char char*)*/ settings); #endif /// /// Append a TensorRT EP instance (configured based on given provider options) to the native OrtSessionOptions instance diff --git a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs index 3acd84b3016de..9841d972fa620 100644 --- a/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs +++ b/csharp/src/Microsoft.ML.OnnxRuntime/SessionOptions.shared.cs @@ -146,27 +146,6 @@ public static SessionOptions MakeSessionOptionWithTensorrtProvider(OrtTensorRTPr } } - /// - /// A helper method to construct a SessionOptions object for TVM execution. - /// Use only if you have the onnxruntime package specific to this Execution Provider. - /// - /// settings string, comprises of comma separated key:value pairs. default is empty - /// A SessionsOptions() object configured for execution with TVM - public static SessionOptions MakeSessionOptionWithTvmProvider(String settings = "") - { - SessionOptions options = new SessionOptions(); - try - { - options.AppendExecutionProvider_Tvm(settings); - return options; - } - catch (Exception) - { - options.Dispose(); - throw; - } - } - /// /// A helper method to construct a SessionOptions object for ROCM execution. /// Use only if ROCM is installed and you have the onnxruntime package specific to this Execution Provider. @@ -397,20 +376,6 @@ public void AppendExecutionProvider_CoreML(CoreMLFlags coremlFlags = CoreMLFlags #endif } - /// - /// Use only if you have the onnxruntime package specific to this Execution Provider. - /// - /// string with TVM specific settings - public void AppendExecutionProvider_Tvm(string settings = "") - { -#if __MOBILE__ - throw new NotSupportedException("The TVM Execution Provider is not supported in this build"); -#else - var utf8 = NativeOnnxValueHelper.StringToZeroTerminatedUtf8(settings); - NativeApiStatus.VerifySuccess(NativeMethods.OrtSessionOptionsAppendExecutionProvider_Tvm(handle, utf8)); -#endif - } - private class ExecutionProviderAppender { private byte[] _utf8ProviderName; diff --git a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs index aa0e6ee62248a..b2a863a48e68a 100644 --- a/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs +++ b/csharp/test/Microsoft.ML.OnnxRuntime.Tests.Common/InferenceTest.cs @@ -146,10 +146,6 @@ public void TestSessionOptions() opt.AppendExecutionProvider_Nnapi(0); #endif -#if USE_TVM - opt.AppendExecutionProvider_Tvm("Vulkan -device=amd_apu"); -#endif - #if USE_OPENVINO opt.AppendExecutionProvider_OpenVINO(); #endif diff --git a/docs/TVM_EP.md b/docs/TVM_EP.md deleted file mode 100644 index df59d5c05855c..0000000000000 --- a/docs/TVM_EP.md +++ /dev/null @@ -1,319 +0,0 @@ -# TVM Execution Provider - -## Contents - -- [Introduction](#introduction) -- [Build](#build-onnx-runtime-with-the-tvm-execution-provider) - - [Linux](#linux) - - [Windows](#windows) -- [Configuration options](#configuration-options) -- [Performance Tuning](#performance-tuning) - - [Using precompiled model](#using-precompiled-model) -- [Samples](#samples) -- [Known issues](#known-issues) - - -## Introduction - -TVM is an execution provider for ONNX Runtime that is built on top of Apache TVM. It enables ONNX Runtime users to leverage Apache TVM model optimizations. -TVM EP is currently in "Preview". It's been tested to work on a handful of models on Linux or Windows, but not on MacOS. - -## Build ONNX Runtime with the TVM Execution Provider - -### **Linux** -Install the minimal pre-requisites on Ubuntu/Debian like linux operating systems: -```bash -apt-get install -y python3 python3-dev python3-pip python3-setuptools gcc libtinfo-dev zlib1g-dev build-essential cmake libedit-dev libxml2-dev llvm-12 -pip3 install numpy decorator attrs nasm -``` -Note: since ONNX Runtime with TVM EP is built with Intel ipp-crypto library there are new requirements. Compiler gcc (and g++) version should be equal to or higher than 8.2. nasm version should be 2.14.02 or higher. Problem with small nasm version can be seen [here](https://github.com/intel/ipp-crypto/issues/9) or [here](https://bugzilla.nasm.us/show_bug.cgi?id=3392205). For ubuntu LTS 18 `apt-get install nasm` is not enough due to it has version 2.13.02, see how to install from sources instruction [here](https://stackoverflow.com/questions/36144930/steps-to-install-nasm-offline-on-ubuntu). - -Also, the current implementation has `NVidia GPU` support for TVM EP. For now, you can use only `NVidia GPU` with CUDA Toolkit support. -To do this, make sure you have installed the NVidia driver and CUDA Toolkit. -More detailed instructions can be found on the [official page](https://developer.nvidia.com/cuda-toolkit). - -Clone this repo. -In order to build ONNXRT you will need to have CMake 3.18 or higher. In Ubuntu 20.04 you can use the following commands to install the latest version of CMake: - -```bash -sudo apt-get update -sudo apt-get install gpg wget - -wget -O - https://apt.kitware.com/keys/kitware-archive-latest.asc 2>/dev/null | gpg --dearmor - | sudo tee /usr/share/keyrings/kitware-archive-keyring.gpg >/dev/null - -echo 'deb [signed-by=/usr/share/keyrings/kitware-archive-keyring.gpg] https://apt.kitware.com/ubuntu/ focal main' | sudo tee /etc/apt/sources.list.d/kitware.list >/dev/null -sudo apt-get update - -sudo rm /usr/share/keyrings/kitware-archive-keyring.gpg -sudo apt-get install kitware-archive-keyring - -sudo apt-get install cmake -``` - -Build ONNX Runtime (TVM x86): -```bash -./build.sh --config Release --enable_pybind --build_wheel --parallel --skip_tests --skip_onnx_tests --use_tvm -``` - -Build ONNX Runtime (TVM with CUDA support): -```bash -./build.sh --config Release --enable_pybind --build_wheel --parallel --skip_tests --skip_onnx_tests --use_tvm --tvm_cuda_runtime -``` - -This command builds both `TVM` and `onnxruntime-tvm`. It creates two wheel, one for each project. -Build the python API for ONNX Runtime instead of using the standard package. Instructions for this are given below. - -Package for TVM: -```bash -cd -python3 -m pip uninstall tvm -y -whl_path=$(find ./build//Release/_deps/tvm-src/python/dist -name "*.whl") -python3 -m pip install $whl_path -``` - -Package for TVM EP: -```bash -cd -python3 -m pip uninstall onnxruntime onnxruntime-tvm -y -whl_path=$(find ./build//Release/dist -name "*.whl") -python3 -m pip install $whl_path -``` - -Alternatively, you can set `PYTHONPATH` to tell python where to find the ONNXRT library and the TVM library. -```bash -export PYTHONPATH=/build//Release:${PYTHONPATH} -export PYTHONPATH=/build//Release/_deps/tvm-src/python:${PYTHONPATH} -``` - -### **Windows** -Install the minimal prerequisites on Windows: Git, CMake, Visual Studio, Python, LLVM -- Git: Download Git for Windows from [here](https://git-scm.com/download/win) and install it. Please make sure that the git.exe path is included in the environment variable. By default, it should be added. To check git after the installation use `git --version` in command line (cmd). -- CMake: use [the link](https://cmake.org/download/) to download and install CMake. msi-file is recommended for it. To verify CMake installation use `cmake --version` in cmd. -- Visual Studio: Download from [here](https://visualstudio.microsoft.com/ru/downloads/) and install Visual Studio 20** Community & Visual Studio Build Tools respectively. It is recommended not to change the default installation path. Chose "Desktop development with C++" workload and make sure that both options of “MSVC [contemporary version] C++ build tools” and “Windows 10 SDK” are selected. -- Python: Download Python 3.* from [here](https://www.python.org/downloads/windows/) and install it. Please have a check on the option of “Add Python to PATH”, so the installer will include the Python directory into the environment variable directly. To check python after the installation use `python` from cmd. The expected output is similar to the following: -```cmd -Python 3.10.5 (tags/v3.10.5:f377153, Jun 6 2022, 16:14:13) [MSC v.1929 64 bit (AMD64)] on win32 -Type "help", "copyright", "credits" or "license" for more information. ->>> -``` -Use `quit()` to exit from python interface. -- LLVM: the compiler is not necessary for pure ONNX Runtime installation but it is needed for TVM EP by default. -```cmd -git clone --depth 1 --branch release/11.x https://github.com/llvm/llvm-project.git -cmake -S llvm -B build -DLLVM_ENABLE_PROJECTS="clang;libcxx;libcxxabi" -DLLVM_TARGETS_TO_BUILD=X86 -Thost=x64 -DCMAKE_BUILD_TYPE=Release -G "Visual Studio 17 2022" -cmake --build ./build --config Release -``` -- Dependencies of ipp-crypto:
-1. install asm compiler (nasm) on windows by line: -```cmd -winget install nasm -i -``` -          -Add it to PATH (instruction for Windows GUI can be seen [here](https://www.computerhope.com/issues/ch000549.htm#dospath)) or by cmd: -```cmd -set PATH="%PATH%;C:\Program Files\NASM" -``` -          -or -```cmd -setx PATH "%PATH%;C:\Program Files\NASM" -``` -          -Check by `nasm --version` in prompt command line.
-       -2. install openssl on windows by msi-file from [here](https://slproweb.com/products/Win32OpenSSL.html) -Add path to directory (e.g. "C:\Program Files\OpenSSL-Win64\bin") with executable file to PATH (see instructions above).
-          -Check by `openssl version` in prompt command line.
-       -3. Correct build of ipp-crytpo requires specific environment variables for supported MSVC compiler. Long way to adjust the environment is to follow to instructions [here](https://docs.microsoft.com/en-us/cpp/build/building-on-the-command-line?view=msvc-170&viewFallbackFrom=vs-2017). Quick way is to use VS Developer command prompt where the environment have been already adjusted or add some paths to standard Windows command prompt: -```cmd -set INCLUDE=C:\Program Files\Microsoft Visual Studio\2022\Community\VC\Tools\MSVC\14.32.31326\include;C:\Program Files (x86)\Windows Kits\10\include\10.0.22621.0\ucrt -``` -          -Take into account that MSVC and Kit versions are specific for Visual Studio built on the machine, specified values here are used as example. -
-
- -For using NVIDIA GPU (optional) CUDA and cuDNN should be installed. -- CUDA: Install CUDA by the [link](https://developer.nvidia.com/cuda-11.0-download-archive). -- cuDNN: download cuDNN installer from [here](https://developer.nvidia.com/rdp/cudnn-archive). Choose v8.* for corresponding CUDA v11.*, unzip it, and move cuDNN files as following: -1. [unzipped dir]\bin\ → C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\bin -2. [unzipped dir]\include\ → C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\include -3. [unzipped dir]\lib\ → C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.0\lib - -To verify the CUDA installation use `nvcc --version` in cmd. -
-
- -#### **Build ONNX Runtime with TVM Execution Provider from source (Python):** -- Use command line and clone sources from github: -```cmd -git clone --recursive https://github.com/Microsoft/onnxruntime -cd onnxruntime -``` -- CPU build: -``` -build.bat --config Release --enable_pybind --build_wheel --skip_tests --parallel --use_tvm --skip_onnx_tests --cmake_generator "Visual Studio 17 2022" --llvm_config /build/Release/bin/llvm-config.exe -``` -- GPU build: -``` -build.bat --config Release --enable_pybind --build_wheel --skip_tests --parallel --use_tvm --skip_onnx_tests --cmake_generator "Visual Studio 17 2022" --llvm_config /build/Release/bin/llvm-config.exe --use_cuda --cudnn_home “C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.*” --cuda_home “C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.*” -``` -In both cases (CPU, GPU) there are the following options for cmake generator: "Visual Studio 17 2022" and "Ninja". Also handshake mechanism can be switched on by `--use_tvm_hash` flag. At the latter case ipp-crypto library is built with dependencies, see details above. -- Install python wheel package for ONNX Runtime:
-Default path to the package is `/build/Windows/Release/Release/dist`. Note that it is different in comparison with path to the package on Linux. Before installation check names of wheel packages and use corresponding one. It can be looked like the following: -```cmd -python -m pip install .\onnxruntime\build\Windows\Release\Release\dist\onnxruntime_tvm-1.6.0-cp38-cp38-win_amd64.whl -``` -- Install python wheel package for TVM due to its python API is used inside TVM EP:
-It can be looked like the following: -```cmd -python -m pip install .\onnxruntime\build\Windows\Release\_deps\tvm-src\python\dist\tvm-0.9.dev1728+g3425ed846-cp39-cp39-win_amd64.whl -``` -- Verify result by python script. Note: python should not be launched from directory containing 'onnxruntime' directory for correct result: -```python -import onnxruntime -print(onnxruntime.__version__) -print(onnxruntime.get_device()) -print(onnxruntime.get_available_providers()) -``` -- Uninstall procedure: -```cmd -pip uninstall onnxruntime-tvm -``` - -#### **Build ONNX Runtime with TVM Execution Provider from source (C#):** -- Use command line and clone sources from github: -```cmd -git clone --recursive https://github.com/Microsoft/onnxruntime -cd onnxruntime -``` -- CPU build: - -Make sure you download [nuget.exe](https://docs.microsoft.com/en-us/nuget/install-nuget-client-tools#nugetexe-cli) and add path to it into `PATH` env. -``` -build.bat --config Release --build_nuget --skip_tests --parallel --use_tvm --skip_onnx_tests --cmake_generator "Visual Studio 17 2022" --llvm_config llvm-config.exe -``` -- Install C# nuget package for TVM EP. Default path to the package is `\build\Windows\Release\Release`. - - -## Configuration options -TVM Executor Provider can be configured with the following provider options: -1. Python -```python -po = [dict(executor=tvm_executor_type, - so_folder=folder_with_pretuned_files, - check_hash=check_hash, - hash_file_path=hash_file_path, - target=client_target, - target_host=client_target_host, - opt_level=client_opt_level, - freeze_weights=freeze, - to_nhwc=layout_transform, - tuning_type=tvm_optimizer_type, - tuning_file_path=client_tuning_logfile, - input_names = input_names_str, - input_shapes = input_shapes_str)] -tvm_session = onnxruntime.InferenceSession(model_path, providers=["TvmExecutionProvider"], provider_options=po) -``` - -2. C# - -Currently, only precompiled models are supported in C# (see the related section below). - -```CSharp -SessionOptions session_options = new SessionOptions{}; -string tvm_ep_options = - $"executor: {tvm_executor_type}, " + - $"so_folder: {folder_with_pretuned_files}, " + - $"check_hash: {check_hash}, " + - $"hash_file_path: {hash_file_path}, " + - $"target: {client_target}, " + - $"target_host: {client_target_host}, " + - $"opt_level: {client_opt_level}, " + - $"freeze_weights: {freeze}, " + - $"to_nhwc: {layout_transform}, " + - $"tuning_type: {tvm_optimizer_type}, " + - $"tuning_file_path: {client_tuning_logfile}, " + - $"input_names: {input_names_str}, " + - $"input_shapes: {input_shapes_str}"; - -session_options.AppendExecutionProvider_Tvm(tvm_ep_options); -using var tvm_session = new InferenceSession(modelFilePath, session_options); -``` -
- -- `executor` is executor type used by TVM. There is choice between two types: GraphExecutor and VirtualMachine which are corresponded to "graph" and "vm" tags. VirtualMachine is used by default. -- `so_folder` is path to folder with set of files (.ro-, .so/.dll-files and weights) obtained after model tuning. It uses these files for executor compilation instead of onnx-model. But the latter is still needed for ONNX Runtime. -- `check_hash` means that it is necessary to perform a HASH check for the model obtained in the `so_folder` parameter. It is `False` by default. -- `hash_file_path` is path to file that contains the pre-computed HASH for the ONNX model which result of tuning locates in the path passed by `so_folder` parameter. - If an empty string was passed as this value, then the file will be searched in the folder that was passed in the `so_folder` parameter. -- `target` and `target_host` are strings like in TVM (e.g. "llvm --mcpu=avx2"). When using accelerators, target may be something like `cuda` while target_host may be `llvm -mtriple=x86_64-linux-gnu` -- `opt_level` is TVM optimization level. It is 3 by default -- `freeze_weights` means that all model weights are kept on compilation stage otherwise they are downloaded each inference. True is recommended value for the best performance. It is true by default. -- `to_nhwc` switches on special model transformations, particularly data layout, which Octomizer is used. It allows to work correctly with tuning logs obtained from Octomizer. It is false by default. -- `tuning_type` defines the type of TVM tuning logs being used, and can be set to either `AutoTVM` (1st gen auto tuning logs) or `Ansor` (2nd gen auto tuning logs). By default this option is set to `AutoTVM`. -- `tuning_file_path` is path to AutoTVM or Ansor tuning file which gives specifications for given model and target for the best performance. (See below for more details). - -TVM supports models with fixed graph only. If your model has unknown dimensions in input shapes (excluding batch size) you must provide the shape using the `input_names` and `input_shapes` provider options. Below is an example of what must be passed to `provider_options`: -```python -input_names = "input_1 input_2" -input_shapes = "[1 3 224 224] [1 2]" -``` - -## Performance Tuning -TVM optimizes machine learning models through an automated tuning process that produces model variants specific to targeted hardware architectures. This process also generates 'tuning logs' that the TVM EP relies on to maximize model performance. These logs can be acquired for your model by either using TVM as described here: - -AutoTVM: -https://tvm.apache.org/docs/how_to/tune_with_autotvm/index.html - -Ansor (Autoscheduling): -https://tvm.apache.org/docs/how_to/tune_with_autoscheduler/index.html - -or by using logs generated through the OctoML platform (https://onnx.octoml.ai) using instructions [here](https://help.octoml.ai/en/articles/5814452-using-octoml-platform-logs-with-onnx-rt-tvm-ep) - -Using the TVM EP with TVM tuning logs also requires users to turn off ONNX Runtime preprocessing. To do this, the following `SessionOptions()` can be used: -``` -so = onnxruntime.SessionOptions() -so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL - -tvm_session = onnxruntime.InferenceSession(model_path, sess_options=so, providers=["TvmExecutionProvider"], provider_options=po) -``` - -### **Using precompiled model** -It is also possible to use a precompiled model. - -The compiled model can be obtained using the [OctoML platform](https://onnx.octoml.ai) -or compiled directly (see **Support precompiled model** section in -[Sample notebook for ResNet50 inference with TVM EP](https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb) -for more information on model compilation). - -In order to use the precompiled model, only need to pass two options: -* **executor** - `vm` (`VirtualMachine`) must be used as a value -(this functionality is not supported for `GraphExecutor`); -* **so_folder** - as a value, you must pass the path to the directory where -the files of the precompiled model are located. -* **check_hash** - (optional) if you want to check hash, you must pass `True` as the value. -* **hash_file_path** - (optional) by default, the file containing the hash for the tuned model will be searched in the directory that is passed in the `so_folder` parameter. - If you want to specify different location, then you must pass the path to the file that contains the desired hash as a value. - -You can read more about these options in section [Configuration options](#configuration-options) above. - - -## Samples -- [Sample notebook for ResNet50 inference with TVM EP](https://github.com/microsoft/onnxruntime/blob/main/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb) - -## Known issues -- At this moment, the TVM EP has only been verified on UNIX/Linux and Windows systems. -- Some compatibility issues have been found between ONNX and Google protobuf. `AttributeError: module 'google.protobuf.internal.containers' has no attribute 'MutableMapping'`. This usually occurss during `import onnx` in any python scripts for protobuf version >= 3.19.0 and ONNX version <= 1.8.1. To resolve the issue Google protobuf and ONNX can be reinstalled separately or together using: -``` -pip3 uninstall onnx -y -pip3 install onnx==1.10.1 -pip3 uninstall protobuf -y -pip3 install protobuf==3.19.1 -``` - -The following pair of ONNX and protobuf versions have been found to be compatible: -- 3.17.3 and 1.8.0 -- 3.19.1 and 1.10.1 diff --git a/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb b/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb deleted file mode 100644 index 830495bdfb98d..0000000000000 --- a/docs/python/notebooks/onnxruntime-tvm-tutorial.ipynb +++ /dev/null @@ -1,657 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "72476497", - "metadata": {}, - "source": [ - "# ONNX Runtime: Tutorial for TVM execution provider\n", - "\n", - "This notebook shows a simple example for model inference with TVM EP.\n", - "\n", - "\n", - "#### Tutorial Roadmap:\n", - "1. Prerequistes\n", - "2. Accuracy check for TVM EP\n", - "3. Configuration options\n", - "4. Support precompiled model" - ] - }, - { - "cell_type": "markdown", - "id": "9345cbab", - "metadata": {}, - "source": [ - "## 1. Prerequistes\n", - "\n", - "Make sure that you have installed all the necessary dependencies described in the corresponding paragraph of the documentation.\n", - "\n", - "Also, make sure you have the `tvm` and `onnxruntime-tvm` packages in your pip environment. \n", - "\n", - "If you are using `PYTHONPATH` variable expansion, make sure it contains the following paths: `/onnxruntime/cmake/external/tvm_update/python` and `/onnxruntime/build/Linux/Release`." - ] - }, - { - "cell_type": "markdown", - "id": "da4ca21f", - "metadata": {}, - "source": [ - "### Common import\n", - "\n", - "These packages can be delivered from standard `pip`." - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "id": "0f072875", - "metadata": {}, - "outputs": [], - "source": [ - "import os\n", - "import onnx\n", - "import tempfile\n", - "import numpy as np\n", - "from typing import List, AnyStr\n", - "from onnx import ModelProto, helper, checker, mapping" - ] - }, - { - "cell_type": "markdown", - "id": "118670aa", - "metadata": {}, - "source": [ - "### Specialized import\n", - "\n", - "It is better to collect these packages from source code in order to clearly understand what is available to you right now." - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "id": "a5502966", - "metadata": {}, - "outputs": [], - "source": [ - "import onnxruntime\n", - "\n", - "import tvm\n", - "import tvm.relay\n", - "import tvm.testing\n", - "import tvm.runtime\n", - "import tvm.runtime.vm\n", - "import tvm.relay.backend.vm\n", - "import tvm.contrib.download" - ] - }, - { - "cell_type": "markdown", - "id": "b7313183", - "metadata": {}, - "source": [ - "### Helper functions for working with ONNX ModelProto\n", - "\n", - "This set of helper functions allows you to recognize the meta information of the models. This information is needed for more versatile processing of ONNX models." - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "7d0a36e8", - "metadata": {}, - "outputs": [], - "source": [ - "def get_onnx_input_names(model: ModelProto) -> List[AnyStr]:\n", - " inputs = [node.name for node in model.graph.input]\n", - " initializer = [node.name for node in model.graph.initializer]\n", - " inputs = list(set(inputs) - set(initializer))\n", - " return sorted(inputs)\n", - "\n", - "\n", - "def get_onnx_output_names(model: ModelProto) -> List[AnyStr]:\n", - " return [node.name for node in model.graph.output]\n", - "\n", - "\n", - "def get_onnx_input_types(model: ModelProto) -> List[np.dtype]:\n", - " input_names = get_onnx_input_names(model)\n", - " return [\n", - " mapping.TENSOR_TYPE_TO_NP_TYPE[node.type.tensor_type.elem_type]\n", - " for node in sorted(model.graph.input, key=lambda node: node.name) if node.name in input_names\n", - " ]\n", - "\n", - "\n", - "def get_onnx_input_shapes(model: ModelProto) -> List[List[int]]:\n", - " input_names = get_onnx_input_names(model)\n", - " return [\n", - " [dv.dim_value for dv in node.type.tensor_type.shape.dim]\n", - " for node in sorted(model.graph.input, key=lambda node: node.name) if node.name in input_names\n", - " ]\n", - "\n", - "\n", - "def get_random_model_inputs(model: ModelProto) -> List[np.ndarray]:\n", - " input_shapes = get_onnx_input_shapes(model)\n", - " input_types = get_onnx_input_types(model)\n", - " assert len(input_types) == len(input_shapes)\n", - " inputs = [np.random.uniform(size=shape).astype(dtype) for shape, dtype in zip(input_shapes, input_types)]\n", - " return inputs" - ] - }, - { - "cell_type": "markdown", - "id": "f0de1682", - "metadata": {}, - "source": [ - "### Wrapper helper functions for Inference\n", - "\n", - "Wrapper helper functions for running model inference using ONNX Runtime EP." - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "258ce9e9", - "metadata": {}, - "outputs": [], - "source": [ - "def get_onnxruntime_output(model: ModelProto, inputs: List, provider_name: AnyStr) -> np.ndarray:\n", - " output_names = get_onnx_output_names(model)\n", - " input_names = get_onnx_input_names(model)\n", - " assert len(input_names) == len(inputs)\n", - " input_dict = {input_name: input_value for input_name, input_value in zip(input_names, inputs)}\n", - "\n", - " inference_session = onnxruntime.InferenceSession(model.SerializeToString(), providers=[provider_name])\n", - " output = inference_session.run(output_names, input_dict)\n", - "\n", - " # Unpack output if there's only a single value.\n", - " if len(output) == 1:\n", - " output = output[0]\n", - " return output\n", - "\n", - "\n", - "def get_cpu_onnxruntime_output(model: ModelProto, inputs: List) -> np.ndarray:\n", - " return get_onnxruntime_output(model, inputs, \"CPUExecutionProvider\")\n", - "\n", - "\n", - "def get_tvm_onnxruntime_output(model: ModelProto, inputs: List) -> np.ndarray:\n", - " return get_onnxruntime_output(model, inputs, \"TvmExecutionProvider\")" - ] - }, - { - "cell_type": "markdown", - "id": "cc17d3b2", - "metadata": {}, - "source": [ - "### Helper function for checking accuracy\n", - "\n", - "This function uses the TVM API to compare two output tensors. The tensor obtained using the `CPUExecutionProvider` is used as a reference.\n", - "\n", - "If a mismatch is found between tensors, an appropriate exception will be thrown." - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "id": "4e598907", - "metadata": {}, - "outputs": [], - "source": [ - "def verify_outputs(\n", - " lhs: List[np.ndarray],\n", - " rhs: List[np.ndarray],\n", - " rtol: float = 5e-5,\n", - " atol: float = 5e-5\n", - ") -> None:\n", - " for lhs_tensor, rhs_tensor in zip(lhs, rhs):\n", - " tvm.testing.assert_allclose(lhs_tensor, rhs_tensor, rtol=rtol, atol=atol)\n", - " assert lhs_tensor.dtype == rhs_tensor.dtype\n", - " print(\"Same output, congratulations!\")" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "id": "f33a372b", - "metadata": {}, - "outputs": [], - "source": [ - "def verify_with_ort_with_inputs(\n", - " model,\n", - " inputs,\n", - " out_shape=None,\n", - " opset=None,\n", - " freeze_params=False,\n", - " dtype=\"float32\",\n", - " rtol=1e-5,\n", - " atol=1e-5,\n", - " opt_level=1,\n", - "):\n", - " if opset is not None:\n", - " model.opset_import[0].version = opset\n", - "\n", - " ort_out = get_cpu_onnxruntime_output(model, inputs)\n", - " tvm_out = get_tvm_onnxruntime_output(model, inputs)\n", - " verify_outputs(ort_out, tvm_out, rtol, atol)" - ] - }, - { - "cell_type": "markdown", - "id": "8c62b01a", - "metadata": {}, - "source": [ - "### Helper functions for download models\n", - "\n", - "These functions use the TVM API to download models from the ONNX Model Zoo." - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "324c00e7", - "metadata": {}, - "outputs": [], - "source": [ - "BASE_MODEL_URL = \"https://github.com/onnx/models/raw/master/\"\n", - "MODEL_URL_COLLECTION = {\n", - " \"ResNet50-v1\": \"vision/classification/resnet/model/resnet50-v1-7.onnx\",\n", - " \"ResNet50-v2\": \"vision/classification/resnet/model/resnet50-v2-7.onnx\",\n", - " \"SqueezeNet-v1.1\": \"vision/classification/squeezenet/model/squeezenet1.1-7.onnx\",\n", - " \"SqueezeNet-v1.0\": \"vision/classification/squeezenet/model/squeezenet1.0-7.onnx\",\n", - " \"Inception-v1\": \"vision/classification/inception_and_googlenet/inception_v1/model/inception-v1-7.onnx\",\n", - " \"Inception-v2\": \"vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx\",\n", - "}\n", - "\n", - "\n", - "def get_model_url(model_name):\n", - " return BASE_MODEL_URL + MODEL_URL_COLLECTION[model_name]\n", - "\n", - "\n", - "def get_name_from_url(url):\n", - " return url[url.rfind(\"/\") + 1 :].strip()\n", - "\n", - "\n", - "def find_of_download(model_name):\n", - " model_url = get_model_url(model_name)\n", - " model_file_name = get_name_from_url(model_url)\n", - " return tvm.contrib.download.download_testdata(model_url, model_file_name, module=\"models\")" - ] - }, - { - "cell_type": "markdown", - "id": "90fb7c5c", - "metadata": {}, - "source": [ - "## 2. Accuracy check for TVM EP \n", - "\n", - "This section will check the accuracy. The check will be to compare the output tensors for `CPUExecutionProvider` and `TvmExecutionProvider`. See the description of `verify_with_ort_with_inputs` function used above.\n", - "\n", - "\n", - "### Check for simple architectures" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "c739ed5c", - "metadata": {}, - "outputs": [], - "source": [ - "def get_two_input_model(op_name: AnyStr) -> ModelProto:\n", - " dtype = \"float32\"\n", - " in_shape = [1, 2, 3, 3]\n", - " in_type = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)]\n", - " out_shape = in_shape\n", - " out_type = in_type\n", - "\n", - " layer = helper.make_node(op_name, [\"in1\", \"in2\"], [\"out\"])\n", - " graph = helper.make_graph(\n", - " [layer],\n", - " \"two_input_test\",\n", - " inputs=[\n", - " helper.make_tensor_value_info(\"in1\", in_type, in_shape),\n", - " helper.make_tensor_value_info(\"in2\", in_type, in_shape),\n", - " ],\n", - " outputs=[\n", - " helper.make_tensor_value_info(\n", - " \"out\", out_type, out_shape\n", - " )\n", - " ],\n", - " )\n", - " model = helper.make_model(graph, producer_name=\"two_input_test\")\n", - " checker.check_model(model, full_check=True)\n", - " return model" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "7048ee6d", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Same output, congratulations!\n", - "****************** Success! ******************\n" - ] - } - ], - "source": [ - "onnx_model = get_two_input_model(\"Add\")\n", - "inputs = get_random_model_inputs(onnx_model)\n", - "verify_with_ort_with_inputs(onnx_model, inputs)\n", - "print(\"****************** Success! ******************\")" - ] - }, - { - "cell_type": "markdown", - "id": "52c880f4", - "metadata": {}, - "source": [ - "### Check for DNN architectures " - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f5d465dc", - "metadata": {}, - "outputs": [], - "source": [ - "def get_onnx_model(model_name):\n", - " model_path = find_of_download(model_name)\n", - " onnx_model = onnx.load(model_path)\n", - " return onnx_model" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "id": "68daac7e", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "One or more operators have not been tuned. Please tune your model for better performance. Use DEBUG logging level to see more details.\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Same output, congratulations!\n", - "****************** Success! ******************\n" - ] - } - ], - "source": [ - "model_name = \"ResNet50-v1\"\n", - "\n", - "onnx_model = get_onnx_model(model_name)\n", - "inputs = get_random_model_inputs(onnx_model)\n", - "verify_with_ort_with_inputs(onnx_model, inputs)\n", - "print(\"****************** Success! ******************\")" - ] - }, - { - "cell_type": "markdown", - "id": "e27f64a2", - "metadata": {}, - "source": [ - "## 3. Configuration options\n", - "\n", - "This section shows how you can configure TVM EP using custom options. For more details on the options used, see the corresponding section of the documentation." - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "id": "a053f59f", - "metadata": {}, - "outputs": [], - "source": [ - "provider_name = \"TvmExecutionProvider\"\n", - "provider_options = dict(\n", - " target=\"llvm -mtriple=x86_64-linux-gnu\",\n", - " target_host=\"llvm -mtriple=x86_64-linux-gnu\",\n", - " opt_level=3,\n", - " freeze_weights=True,\n", - " tuning_file_path=\"\",\n", - " tuning_type=\"Ansor\",\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "3f6e6f01", - "metadata": {}, - "outputs": [], - "source": [ - "model_name = \"ResNet50-v1\"\n", - "onnx_model = get_onnx_model(model_name)\n", - "input_dict = {\n", - " input_name: input_value for input_name, input_value in zip(\n", - " get_onnx_input_names(onnx_model),\n", - " get_random_model_inputs(onnx_model),\n", - " )\n", - "}\n", - "output_names = get_onnx_output_names(onnx_model)" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "id": "85ab83f2", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "****************** Output shape: (1, 1000) ******************\n" - ] - } - ], - "source": [ - "tvm_session = onnxruntime.InferenceSession(\n", - " onnx_model.SerializeToString(),\n", - " providers=[provider_name],\n", - " provider_options=[provider_options],\n", - ")\n", - "output = tvm_session.run(output_names, input_dict)[0]\n", - "print(f\"****************** Output shape: {output.shape} ******************\")" - ] - }, - { - "cell_type": "markdown", - "id": "b704374b", - "metadata": {}, - "source": [ - "## 4. Support precompiled model\n", - "\n", - "Wrapper functions that allow you to compile the model and save it in the desired format." - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "id": "8150942b", - "metadata": {}, - "outputs": [], - "source": [ - "def compile_virtual_machine(model: onnx.ModelProto, target_str: AnyStr) -> tvm.runtime.vm.Executable:\n", - " ir_mod, params = tvm.relay.frontend.from_onnx(\n", - " model,\n", - " opset=model.opset_import[0].version,\n", - " freeze_params=True,\n", - " )\n", - " target = tvm.target.Target(target=target_str, host=target_str)\n", - " return tvm.relay.backend.vm.compile(ir_mod, target)\n", - "\n", - "\n", - "def serialize_virtual_machine(vm_exec: tvm.runtime.vm.Executable) -> AnyStr:\n", - " temp_directory = tempfile.mkdtemp()\n", - " path_consts = os.path.join(temp_directory, \"consts\")\n", - " vm_exec.move_late_bound_consts(path_consts, byte_limit=256)\n", - " lib_path = os.path.join(temp_directory, f\"model.so\")\n", - " code_path = os.path.join(temp_directory, f\"model.ro\")\n", - " code, lib = vm_exec.save()\n", - " lib.export_library(lib_path)\n", - " with open(code_path, \"wb\") as fo:\n", - " fo.write(code)\n", - " return temp_directory" - ] - }, - { - "cell_type": "markdown", - "id": "9cbb987e", - "metadata": {}, - "source": [ - "Preparation of the ONNX model." - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "id": "febb9d72", - "metadata": {}, - "outputs": [], - "source": [ - "model_name = \"ResNet50-v1\"\n", - "onnx_model = get_onnx_model(model_name)\n", - "input_dict = {\n", - " input_name: input_value for input_name, input_value in zip(\n", - " get_onnx_input_names(onnx_model),\n", - " get_random_model_inputs(onnx_model),\n", - " )\n", - "}\n", - "output_names = get_onnx_output_names(onnx_model)" - ] - }, - { - "cell_type": "markdown", - "id": "b05b251a", - "metadata": {}, - "source": [ - "Compiling the ONNX model using `VirtualMachine` (TVM)." - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "id": "b4b999ee", - "metadata": {}, - "outputs": [], - "source": [ - "compiled_vm_exec = compile_virtual_machine(onnx_model, target_str=\"llvm\")" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - "id": "e3408c15", - "metadata": {}, - "outputs": [], - "source": [ - "so_folder = serialize_virtual_machine(compiled_vm_exec)" - ] - }, - { - "cell_type": "markdown", - "id": "311405e8", - "metadata": {}, - "source": [ - "Preparing `ProviderOptions` and launching `TVM EP` inference.\n", - "\n", - "In order to use the precompiled model, you only need to pass two options:\n", - "* **executor** - `vm` (`VirtualMachine`) must be used as a value (this functionality is not supported for `GraphExecutor`);\n", - "* **so_folder** - as a value, you must pass the path to the directory where the files of the precompiled model are located." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "id": "8927293c", - "metadata": {}, - "outputs": [], - "source": [ - "provider_name = \"TvmExecutionProvider\"\n", - "provider_options = dict(\n", - " executor=\"vm\",\n", - " so_folder=so_folder,\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "id": "d7532863", - "metadata": {}, - "outputs": [], - "source": [ - "tvm_session = onnxruntime.InferenceSession(\n", - " onnx_model.SerializeToString(),\n", - " providers=[provider_name],\n", - " provider_options=[provider_options],\n", - ")\n", - "tvm_output = tvm_session.run(output_names, input_dict)" - ] - }, - { - "cell_type": "markdown", - "id": "1c0b983e", - "metadata": {}, - "source": [ - "Let's make sure that the output values match those that can be obtained through `CPUExecutionProvider`:" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "id": "c3de2299", - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Same output, congratulations!\n" - ] - } - ], - "source": [ - "verify_outputs(\n", - " tvm_output[0],\n", - " get_cpu_onnxruntime_output(\n", - " onnx_model,\n", - " input_dict.values()\n", - " ),\n", - ")" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -} diff --git a/onnxruntime/core/codegen/common/common.cc b/onnxruntime/core/codegen/common/common.cc deleted file mode 100644 index 818b919e99ef2..0000000000000 --- a/onnxruntime/core/codegen/common/common.cc +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/common/common.h" - -#include "core/framework/tensorprotoutils.h" -#include "core/common/inlined_containers.h" -#include "core/graph/graph.h" -#include "core/graph/schema_registry.h" -#include -#include - -namespace onnxruntime { - -NodeKey GetKey(const onnxruntime::Node* node) { - ORT_ENFORCE(nullptr != node); - ORT_ENFORCE(node->OutputDefs().size() > 0); - return node->OutputDefs()[0]->Name(); -} - -NodeKey GetKey(const onnxruntime::Node& node) { - ORT_ENFORCE(node.OutputDefs().size() > 0); - return node.OutputDefs()[0]->Name(); -} - -NodeKey GetKey(const onnxruntime::NodeArg* def) { - // NodeArg's name is unique. - ORT_ENFORCE(nullptr != def); - return def->Name(); -} - -bool IsRecurrentNode(const onnxruntime::Node& node) { - auto op_type = node.OpType(); - return (op_type == "LSTM" || op_type == "RNN" || op_type == "GRU" || - op_type == "Scan" || op_type == "Loop"); -} - -bool IsAliasNode(const onnxruntime::Node& node) { - auto op_type = node.OpType(); - if (op_type == "Transpose") { - // Treat Transpose (1,N) -> (N,1) as Alias - const auto shape = node.OutputDefs()[0]->Shape(); - if (shape != nullptr && shape->dim_size() == 2) { - for (int i = 0; i < 2; ++i) { - if (shape->dim(i).has_dim_value() && shape->dim(i).dim_value() == 1) { - return true; - } - } - } - return false; - } - - return (op_type == "Flatten" || op_type == "Identity" || op_type == "Reshape" || - op_type == "Squeeze" || op_type == "Unsqueeze"); -} - -std::string NormalizeCppName(const std::string& name) { - std::string normalized_name = name; - for (char c : {'.', ' ', '+', '-', '*', '/', '\\', '='}) - std::replace(normalized_name.begin(), normalized_name.end(), c, '_'); - return normalized_name; -} - -std::string NormalizeNodeArgName(const NodeArg* def) { - return NormalizeCppName(def->Name()); -} - -bool IsFusedNode(const Node& node) { - if (node.NodeType() == Node::Type::Fused) { - return true; - } - return false; -} - -// A unified API to get Subgraph -const Graph* GetSubgraph(const Node& node) { - if (node.NodeType() == Node::Type::Fused) { - return &(node.GetFunctionBody()->Body()); - } else if (node.OpType() == "Scan") { - return node.GetGraphAttribute("body"); - } - // return nullptr implying no subgraph - return nullptr; -} - -bool HasLoop(const Node& node) { - auto op_type = node.OpType(); - if (op_type == "LSTM" || - op_type == "GRU" || - op_type == "RNN" || - op_type == "Scan") { - return true; - } - return false; -} - -// Return the corresponding input node for the NodeArg of the given node -const onnxruntime::Node* GetInputNode(const Node& node, const NodeArg* def) { - const auto& input_name = def->Name(); - const onnxruntime::Node* input_node = nullptr; - // search input node set to see if input_name is in their outputs (weights are not from node) - for (auto iter = node.InputNodesBegin(); iter != node.InputNodesEnd(); ++iter) { - const onnxruntime::Node& p = *iter; - bool found = false; - ORT_THROW_IF_ERROR(p.ForEachWithIndex( - p.OutputDefs(), - [&found, &input_name](const onnxruntime::NodeArg& out_def, size_t) { - if (input_name == out_def.Name()) { - found = true; - } - return Status::OK(); - })); - if (found) - input_node = &p; - } - return input_node; -} - -// create capacity from subgraph -std::unique_ptr ToCapacity(const onnxruntime::GraphViewer& graph, - int fused_count, - std::unique_ptr& subgraph) { - auto meta_def = std::make_unique<::onnxruntime::IndexedSubGraph::MetaDef>(); - meta_def->name = "Fuse" + std::to_string(fused_count); - meta_def->domain = "Fuse"; - - std::set node_indices(subgraph->nodes.begin(), subgraph->nodes.end()); - - const auto& start_node_index = subgraph->nodes.front(); - const auto& start_node = *graph.GetNode(start_node_index); - const auto& end_node_index = subgraph->nodes.back(); - const auto& end_node = *graph.GetNode(end_node_index); - meta_def->name += start_node.OpType() + std::to_string(start_node_index); - meta_def->name += "_With" + std::to_string(subgraph->nodes.size()) + "Nodes_"; - meta_def->name += end_node.OpType() + std::to_string(end_node_index); - - InlinedHashSet real_output_names; - real_output_names.reserve(graph.GetOutputs().size()); - for (const auto* def : graph.GetOutputs()) { - real_output_names.insert(def->Name()); - } - - for (const auto& node_index : subgraph->nodes) { - const auto& node = *graph.GetNode(node_index); - auto process_input_fn = - [&meta_def, &node, &node_indices](const onnxruntime::NodeArg& def, size_t) { - const onnxruntime::Node* input_node = GetInputNode(node, &def); - bool input_from_subgraph = (input_node && node_indices.count(input_node->Index())); - if (!input_from_subgraph) { - // input is from weights or outside of graph - meta_def->inputs.push_back(def.Name()); - } - return Status::OK(); - }; - // handle current graph's inputs - ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.InputDefs(), process_input_fn)); - // nodes' implicit inputs also need to be collected. They need to - // be promoted to being explicit inputs for everything to work. - ORT_THROW_IF_ERROR(node.ForEachWithIndex(node.ImplicitInputDefs(), process_input_fn)); - - // Handle outouts - // two cases are considered as outputs - // 1. Output NodeArg is not used by any Node - // 2. Output NodeArg is used by at least one Node out of this subgraph. - // Note a NodeArg can be used by Nodes in and out of the subgraph at the same time. - // 3. Output NodeArg is one of real outputs of an Ort graph. - - auto InsertOutputToSubgraph = [&meta_def](const NodeArg* def) { - if (std::find(meta_def->outputs.begin(), meta_def->outputs.end(), def->Name()) == - meta_def->outputs.end()) { - meta_def->outputs.push_back(def->Name()); - } - }; - - InlinedHashSet input_names_from_the_output_node; - - for (auto o_iter = node.OutputEdgesBegin(); o_iter != node.OutputEdgesEnd(); ++o_iter) { - const auto& p = *o_iter; - const Node& out_node = p.GetNode(); - - // preprocess for the case 1 - ORT_THROW_IF_ERROR(out_node.ForEachWithIndex( - out_node.InputDefs(), - [&input_names_from_the_output_node](const onnxruntime::NodeArg& in_def, size_t) { - input_names_from_the_output_node.insert(in_def.Name()); - return Status::OK(); - })); - - // handle the case 2 - if (node_indices.count(out_node.Index()) == 0) { - const NodeArg* def = node.OutputDefs()[p.GetSrcArgIndex()]; - InsertOutputToSubgraph(def); - } - } - - // handle case 1 and 3 - ORT_THROW_IF_ERROR(node.ForEachWithIndex( - node.OutputDefs(), - [&](const onnxruntime::NodeArg& def, size_t) { - if (input_names_from_the_output_node.count(def.Name()) == 0 || - real_output_names.count(def.Name()) > 0) { - InsertOutputToSubgraph(&def); - } - return Status::OK(); - })); - } - - // Handle subgraph's initializers - const auto& all_initializers = graph.GetAllInitializedTensors(); - for (const auto& node_index : subgraph->nodes) { - const auto& node = *graph.GetNode(node_index); - // check whether it is an immediate nested subgraph - auto immediate_nested_subgraph = GetSubgraph(node); - // If so, copy the immediate nested subgraph's initializers to meta_def->inputs. - // Note we don't need recursion here, since Ort did recursion for us by handling subgraph early than the current graph. - // Therefore, the all inner nested subgraph's initializers should be already in the immediate nested subgraph's inputs. - if (nullptr != immediate_nested_subgraph) { - for (auto& n : immediate_nested_subgraph->Nodes()) { - auto add_input_fn = - [&meta_def, &all_initializers](const onnxruntime::NodeArg& def, size_t) { - auto iter = all_initializers.find(def.Name()); - if (iter != all_initializers.end()) { - meta_def->inputs.push_back(def.Name()); - } - return Status::OK(); - }; - ORT_THROW_IF_ERROR(n.ForEachWithIndex(n.InputDefs(), add_input_fn)); - ORT_THROW_IF_ERROR(n.ForEachWithIndex(n.ImplicitInputDefs(), add_input_fn)); - } - } - } - - meta_def->since_version = 1; - meta_def->status = ONNX_NAMESPACE::EXPERIMENTAL; - std::unique_ptr finished_subgraph(subgraph.release()); - finished_subgraph->SetMetaDef(std::move(meta_def)); - return std::make_unique(std::move(finished_subgraph)); -} - -int64_t ShapeRank(const NodeArg* def) { - ORT_ENFORCE_DEBUG(nullptr != def); - return gsl::narrow_cast(def->Shape()->dim_size()); -} - -bool ShapeHasValue(const NodeArg* def, int i) { - ORT_ENFORCE_DEBUG(nullptr != def); - ORT_ENFORCE_DEBUG(i >= 0); - ORT_ENFORCE_DEBUG(i < def->Shape()->dim_size()); - return utils::HasDimValue(def->Shape()->dim(i)); -} - -bool ShapeHasSymbol(const NodeArg* def, int i) { - ORT_ENFORCE_DEBUG(nullptr != def); - ORT_ENFORCE_DEBUG(i >= 0); - ORT_ENFORCE_DEBUG(i < def->Shape()->dim_size()); - return utils::HasDimParam(def->Shape()->dim(i)); -} - -int64_t ShapeValue(const NodeArg* def, int i) { - ORT_ENFORCE_DEBUG(ShapeHasValue(def, i)); - return def->Shape()->dim(i).dim_value(); -} - -const std::string& ShapeSymbol(const NodeArg* def, int i) { - ORT_ENFORCE_DEBUG(ShapeHasSymbol(def, i)); - return def->Shape()->dim(i).dim_param(); -} - -ONNX_NAMESPACE::TensorProto_DataType TensorProtoDataType(const NodeArg* def) { - ORT_ENFORCE_DEBUG(nullptr != def); - return static_cast(def->TypeAsProto()->tensor_type().elem_type()); -} - -// Convert GraphNodes to internal NodePtrs without check lifetime. -// Please use it only locally when GraphNodes still exist -InlinedVector ConvertGraphNodesToNodePtrs(const ConstGraphNodes& graph_nodes) { - InlinedVector nodes; - for (auto& node : graph_nodes) { - nodes.push_back(&node); - } - return nodes; -} - -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/common.h b/onnxruntime/core/codegen/common/common.h deleted file mode 100644 index 81b74daf6f711..0000000000000 --- a/onnxruntime/core/codegen/common/common.h +++ /dev/null @@ -1,153 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/common/inlined_containers.h" -#include "core/framework/compute_capability.h" -#include "core/framework/tensor.h" -#include "core/graph/graph_nodes.h" -#include "core/graph/graph_viewer.h" - -#ifndef NDEBUG -#define ORT_ENFORCE_DEBUG(...) ORT_ENFORCE(__VA_ARGS__) -#else -#define ORT_ENFORCE_DEBUG(...) -#endif // !NDEBUG - -// DYN_PROMOTE is a simplified llvm::dyn_cast, which does not need RTTI -// DYN_PROMOTE is faster than dynamic_cast and also has smaller binary size -// Please use DYN_PROMOTE in a critical path. -#define DYN_PROMOTE(BASE) \ - template \ - inline const ToType* Promote(const BASE* base) { \ - if (ToType::IsType(base)) \ - return static_cast(base); \ - return nullptr; \ - } \ - \ - template \ - inline ToType* Promote(BASE* base) { \ - if (ToType::IsType(base)) \ - return static_cast(base); \ - return nullptr; \ - } \ - \ - template \ - inline ToType* Promote(const std::unique_ptr& base) { \ - if (ToType::IsType(base.get())) \ - return static_cast(base); \ - return nullptr; \ - } \ - \ - template \ - inline ToType* Promote(const std::shared_ptr& base) { \ - if (ToType::IsType(base.get())) \ - return static_cast(base); \ - return nullptr; \ - } - -// DYN_PROMOTE_BASE is a macro inserted in the base class to support DYN_PROMOTE -// TYPE_ID is required for DYN_PROMOTE and TYPE_ID is a enum class -// TYPE_ID_VAR is a corresponding variable name for in the base class -#define DYN_PROMOTE_BASE(BASE, TYPE_ID, TYPE_ID_VAR) \ - inline const TYPE_ID TypeID() const { \ - return TYPE_ID_VAR; \ - } \ - \ - static inline bool IsType(const BASE*) { \ - return true; \ - } - -// DYN_PROMOTE_DERIVED is a macro inserted in a derived class to support DYN_PROMOTE -// TYPE_ID is required for DYN_PROMOTE and TYPE_ID is a enum class -// TYPE_ID_VALUE is corresponding TYPE_ID::value of a derived class. -#define DYN_PROMOTE_DERIVED(BASE, TYPE_ID, TYPE_ID_VALUE) \ - static inline bool IsType(const BASE* base) { \ - ORT_ENFORCE_DEBUG(nullptr != base); \ - return base->TypeID() == TYPE_ID::TYPE_ID_VALUE; \ - } - -// DYNAMIC_PROMOTE is a dynamic_cast needing RTTI -// DYNAMIC_PROMOTE is usually slower than than DYN_PROMOTE. -// Please use DYNAMIC_PROMOTE in a non-critical path. -#define DYNAMIC_PROMOTE(BASE) \ - template \ - inline const X* Promote(const BASE* base) { \ - auto derived = dynamic_cast(base); \ - ORT_ENFORCE(nullptr != derived); \ - return derived; \ - } \ - \ - template \ - inline X* Promote(BASE* base) { \ - auto derived = dynamic_cast(base); \ - ORT_ENFORCE(nullptr != derived); \ - return derived; \ - } \ - \ - template \ - inline X* Promote(const std::unique_ptr& base) { \ - auto derived = dynamic_cast(base.get()); \ - ORT_ENFORCE(nullptr != derived); \ - return derived; \ - } \ - \ - template \ - inline X* Promote(const std::shared_ptr& base) { \ - auto derived = dynamic_cast(base.get()); \ - ORT_ENFORCE(nullptr != derived); \ - return derived; \ - } - -namespace onnxruntime { - -// Nodekey is used as a key for maps -using NodeKey = std::string; - -NodeKey GetKey(const onnxruntime::Node* node); -NodeKey GetKey(const onnxruntime::Node& node); -NodeKey GetKey(const onnxruntime::NodeArg* def); - -bool IsRecurrentNode(const onnxruntime::Node& node); - -bool IsAliasNode(const onnxruntime::Node& node); - -// Helper function that creates ComputeCapability for subgraphs -std::unique_ptr ToCapacity(const onnxruntime::GraphViewer& graph, - int fused_count, - std::unique_ptr& subgraph); - -bool IsFusedNode(const Node& node); - -bool HasLoop(const Node& node); - -const Graph* GetSubgraph(const Node& node); - -std::string NormalizeCppName(const std::string& name); - -std::string NormalizeNodeArgName(const NodeArg* def); - -// Return the corresponding input node for the NodeArg of the given node -const onnxruntime::Node* GetInputNode(const Node& node, const NodeArg* def); - -int64_t ShapeRank(const NodeArg* def); - -bool ShapeHasValue(const NodeArg* def, int i); - -bool ShapeHasSymbol(const NodeArg* def, int i); - -int64_t ShapeValue(const NodeArg* def, int i); - -const std::string& ShapeSymbol(const NodeArg* def, int i); - -ONNX_NAMESPACE::TensorProto_DataType TensorProtoDataType(const NodeArg* def); - -// Convert ConstGraphNodes to internal NodePtrs without check lifetime. -// Please use it only locally when GraphNodes still exist -InlinedVector ConvertGraphNodesToNodePtrs(const ConstGraphNodes& graph_nodes); - -enum : int { - Dimension_Unknown = -1, -}; - -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/creator.h b/onnxruntime/core/codegen/common/creator.h deleted file mode 100644 index b31a12db4875b..0000000000000 --- a/onnxruntime/core/codegen/common/creator.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/common/dispatcher.h" - -// TODO rename this file to creator_base -namespace onnxruntime { -namespace codegen { - -// It is a base class for TVM Op IR builder, weight layout builder, TVM scheduler -// CreatorBase is a template class of compiler pass -// for 1) TVM IR builder -// 2) Weight layout transformer -// 3) TVM Scheduler, etc. -// CreatorBase is similor to OpXXCreate in llvm IR builder - -template -class CreatorBase { - public: - CreatorBase(const std::string& name) - : name_(name) {} - - virtual ~CreatorBase() = default; - - virtual RETURN_TYPE Evaluate(INPUT_TYPE, - NODE_TYPE, - CONTEXT_TYPE, - OUTPUT_TYPE) = 0; - - const std::string& Name() const { - return name_; - } - - protected: - std::string name_; - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CreatorBase); -}; - -// macro to stringize -#define STRINGIZE_NX(OP) #OP -#define STRINGIZE(OP) STRINGIZE_NX(OP) - -// macro returns class name -#define CREATOR_CLASS(OP, POSTFIX) \ - OP##POSTFIX - -// macro returns class name as string -#define CREATOR_STRING(OP, POSTFIX) \ - STRINGIZE(CREATOR_CLASS(OP, POSTFIX)) - -// macro returns class constructor name -#define CREATOR_CLASS_FUNC(OP, POSTFIX) \ - OP##POSTFIX() - -// macro declares a creator class inheriting the template class CreatorBase -// with corresponding template parameters -#define DECLARE_CREATOR_CLASS(OP, POSTFIX, INPUT, NODE, CONTEXT, OUTPUT, RETURN) \ - class CREATOR_CLASS(OP, POSTFIX) : public onnxruntime::codegen::CreatorBase { \ - public: \ - CREATOR_CLASS_FUNC(OP, POSTFIX) : CreatorBase(CREATOR_STRING(OP, POSTFIX)) {} \ - RETURN Evaluate(INPUT, \ - NODE, \ - CONTEXT, \ - OUTPUT) override; \ - \ - private: \ - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(CREATOR_CLASS(OP, POSTFIX)); \ - }; - -} // namespace codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/dispatcher.h b/onnxruntime/core/codegen/common/dispatcher.h deleted file mode 100644 index 80a854a06977c..0000000000000 --- a/onnxruntime/core/codegen/common/dispatcher.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/common/common.h" -#include -#include -#include - -namespace onnxruntime { -namespace codegen { - -// DispatcherBase is a customized unordered_map -// that provides all codegen-related functionality -// including 1) dispatching a pass -// 2) dump corresponding name -// DispatcherBase may or may not keep ownership, -// depending on the template parameter, CONTENT_TYPE. -// Note DispatcherBase has a protected destructor - -template -class DispatcherBase { - public: - DispatcherBase(const std::string& name) - : name_(name) {} - - const std::string& Name() const { - return name_; - } - - bool Contains(const std::string& name) const { - return contents_.count(name) > 0; - } - - void ForEach(std::function - func) { - for (auto& p : contents_) { - func(p.first, p.second); - } - } - - bool Register(const std::string& name, - CONTENT_TYPE op) { - if (!Contains(name)) { - contents_.emplace(name, op); - return true; - } - return false; - } - - CONTENT_TYPE Get(const std::string& key) const { - auto iter = contents_.find(key); - if (iter != contents_.end()) { - return iter->second; - } - return nullptr; - } - - const std::unordered_map GetContents() const { - return contents_; - } - - std::unordered_map GetMutableContents() { - return contents_; - } - - protected: - std::string name_; - std::unordered_map contents_; - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(DispatcherBase); - ~DispatcherBase() = default; -}; - -} // namespace codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/dump_array.h b/onnxruntime/core/codegen/common/dump_array.h deleted file mode 100644 index 8e51cd36d0087..0000000000000 --- a/onnxruntime/core/codegen/common/dump_array.h +++ /dev/null @@ -1,62 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include -#include -#include -#include - -namespace onnxruntime { - -template -void DumpArrayRecursive(const T1* data, int64_t& data_offset, const std::vector& shape, int idx) { - int dim = static_cast(shape.size()); - if (dim == 0) { - std::cout << "[]\n"; - return; - } - - assert(idx < dim); - int sz = shape[idx]; - - std::cout << "["; - if (idx < dim - 1) { - for (auto i = 0; i < sz; ++i) { - DumpArrayRecursive(data, data_offset, shape, idx + 1); - if (i < sz - 1) { - std::cout << ","; - // print multiple newlines after ',' when necessary - for (int j = idx + 1; j < dim; j++) - std::cout << "\n"; - // print leading spaces before "[" when necessary - for (int j = 0; j < idx + 1; ++j) - std::cout << " "; - } - } - } else { - for (auto i = 0; i < sz; ++i) { - if (std::is_same::value || std::is_same::value) - std::cout << std::setw(3) << static_cast(*(data + data_offset)); - else - std::cout << std::setw(12) << std::setprecision(8) << *(data + data_offset); - data_offset++; - if (i < sz - 1) - std::cout << ","; - } - } - std::cout << "]"; -} - -// A helper function to dump multidimensional arrays in a way similar to numpy -template -void DumpArray(const std::string& tag, const T1* data, const std::vector& shape) { - std::cout << tag << "\n"; - int64_t data_offset = 0; - DumpArrayRecursive(data, data_offset, shape, 0); - assert(data_offset == TotalSize(shape)); - std::cout << std::endl; -} - -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/handle.h b/onnxruntime/core/codegen/common/handle.h deleted file mode 100644 index 7caad27dcbe01..0000000000000 --- a/onnxruntime/core/codegen/common/handle.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/common/target_info.h" -#include -#include - -namespace onnxruntime { -namespace codegen { - -using DomainVersionLookupFunc = std::function; - -struct CodeGenHandle { - CodeGenTarget* codegen_target; - DomainVersionLookupFunc domain_version_lookup_func = - // by default, always uses the latest opset implemented - [](const std::string&) { return INT_MAX; }; -}; - -} // namespace codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/op_macro.h b/onnxruntime/core/codegen/common/op_macro.h deleted file mode 100644 index 04305c4aa47b0..0000000000000 --- a/onnxruntime/core/codegen/common/op_macro.h +++ /dev/null @@ -1,101 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -namespace onnxruntime { - -#define LIST_BINARY_OPS() \ - BINARY_OP(Add) \ - BINARY_OP(Div) \ - BINARY_OP(Mul) \ - BINARY_OP(PRelu) \ - BINARY_OP(Sub) - -#define LIST_BINARY_CMP_OPS() \ - BINARY_CMP_OP(Equal) \ - BINARY_CMP_OP(Greater) \ - BINARY_CMP_OP(Less) - -#define LIST_POOL_OPS() \ - POOL_OP(MaxPool) \ - POOL_OP(AveragePool) \ - POOL_OP(GlobalMaxPool) \ - POOL_OP(GlobalAveragePool) - -#define LIST_REDUCE_OPS() \ - REDUCE_INDEXED_OP(ArgMax) \ - REDUCE_INDEXED_OP(ArgMin) \ - REDUCE_OP(ReduceL1) \ - REDUCE_OP(ReduceL2) \ - REDUCE_OP(ReduceLogSum) \ - REDUCE_OP(ReduceLogSumExp) \ - REDUCE_OP(ReduceMax) \ - REDUCE_OP(ReduceMean) \ - REDUCE_OP(ReduceMin) \ - REDUCE_OP(ReduceProd) \ - REDUCE_OP(ReduceSum) \ - REDUCE_OP(ReduceSumSquare) - -#define LIST_UNARY_OPS() \ - UNARY_OP(Abs) \ - UNARY_OP(Affine) \ - UNARY_OP(Ceil) \ - UNARY_OP(Elu) \ - UNARY_OP(Exp) \ - UNARY_OP(Floor) \ - UNARY_OP(HardSigmoid) \ - UNARY_OP(LeakyRelu) \ - UNARY_OP(Log) \ - UNARY_OP(Neg) \ - UNARY_OP(ParametricSoftplus) \ - UNARY_OP(Reciprocal) \ - UNARY_OP(Relu) \ - UNARY_OP(ScaledTanh) \ - UNARY_OP(Selu) \ - UNARY_OP(Sigmoid) \ - UNARY_OP(Softplus) \ - UNARY_OP(Softsign) \ - UNARY_OP(Sqrt) \ - UNARY_OP(Tanh) \ - UNARY_OP(ThresholdedRelu) - -#define LIST_VARIADIC_OPS() \ - VARIADIC_OP(Max) \ - VARIADIC_OP(Min) \ - VARIADIC_OP(Sum) - -#define LIST_ALL_GENERIC_OPS() \ - LIST_BINARY_OPS() \ - LIST_BINARY_CMP_OPS() \ - LIST_REDUCE_OPS() \ - LIST_POOL_OPS() \ - LIST_UNARY_OPS() \ - LIST_VARIADIC_OPS() \ - ADD_OP_ITEM(Cast) \ - ADD_OP_ITEM(Clip) \ - ADD_OP_ITEM(Concat) \ - ADD_OP_ITEM(Conv) \ - ADD_OP_ITEM(Crop) \ - ADD_OP_ITEM(Dropout) \ - ADD_OP_ITEM(Expand) \ - ADD_OP_ITEM(Flatten) \ - ADD_OP_ITEM(Gather) \ - ADD_OP_ITEM(GatherElements) \ - ADD_OP_ITEM(Gemm) \ - ADD_OP_ITEM(Identity) \ - ADD_OP_ITEM(LogSoftmax) \ - ADD_OP_ITEM(LSTM) \ - ADD_OP_ITEM(MatMul) \ - ADD_OP_ITEM(MatMulInteger) \ - ADD_OP_ITEM(Pad) \ - ADD_OP_ITEM(Reshape) \ - ADD_OP_ITEM(Shape) \ - ADD_OP_ITEM(Slice) \ - ADD_OP_ITEM(Softmax) \ - ADD_OP_ITEM(Split) \ - ADD_OP_ITEM(Squeeze) \ - ADD_OP_ITEM(Transpose) \ - ADD_OP_ITEM(Unsqueeze) \ - ADD_OP_ITEM(Where) - -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/profile.h b/onnxruntime/core/codegen/common/profile.h deleted file mode 100644 index 31c9e764320d0..0000000000000 --- a/onnxruntime/core/codegen/common/profile.h +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -// uncomment this line or use -DCODEGEN_ENABLE_PROFILER in compiler options to enable profiler events in codegen -// #define CODEGEN_ENABLE_PROFILER - -#ifdef CODEGEN_ENABLE_PROFILER -#include "core/common/profiler.h" - -namespace onnxruntime { - -class ProfilerEvent { - public: - ProfilerEvent(const std::string& name) : name_(name) { - ts_ = profiling::Profiler::Instance().StartTime(); - } - - ~ProfilerEvent() { - profiling::Profiler::Instance().EndTimeAndRecordEvent(profiling::EventCategory::NODE_EVENT, name_, ts_); - } - - private: - TimePoint ts_; - const std::string name_; -}; - -} // namespace onnxruntime - -#define CODEGEN_PROFILER_EVENT(name) onnxruntime::ProfilerEvent profiler_event(name) - -#else - -#define CODEGEN_PROFILER_EVENT(name) - -#endif diff --git a/onnxruntime/core/codegen/common/registry.h b/onnxruntime/core/codegen/common/registry.h deleted file mode 100644 index c1642e76e2120..0000000000000 --- a/onnxruntime/core/codegen/common/registry.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/common/common.h" -#include -#include -#include - -namespace onnxruntime { -namespace codegen { - -// RegistryBase is a customized unordered_map -// that keep ownership of passes, -// including 1) IR builder passes -// 2) Weight layout transformer passes -// 3) Scheduler passses, etc. - -template -class RegistryBase { - public: - RegistryBase() = default; - - virtual ~RegistryBase() = default; - - bool Contains(const std::string& name) const { - return contents_.count(name) > 0; - } - - CONTENT_TYPE* Get(const std::string& name) const { - if (contents_.find(name) != contents_.end()) - return contents_.at(name).get(); - return nullptr; - } - - CONTENT_TYPE* RegisterOrGet( - const std::string& name, - std::unique_ptr&& ptr) { - if (!Contains(name)) - contents_.emplace(name, std::move(ptr)); - return Get(name); - } - - CONTENT_TYPE* RegisterOrGet( - std::unique_ptr&& ptr) { - return RegisterOrGet(ptr->Name(), std::move(ptr)); - } - - bool Register( - const std::string& name, - std::unique_ptr&& ptr) { - if (!Contains(name)) { - contents_.emplace(name, std::move(ptr)); - return true; - } - return false; - } - - bool Register( - std::unique_ptr&& ptr) { - return Register(ptr->Name(), std::move(ptr)); - } - - protected: - std::unordered_map> contents_; - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(RegistryBase); -}; - -// Put common Registry Management utilities if these is any - -} // namespace codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/settings.cc b/onnxruntime/core/codegen/common/settings.cc deleted file mode 100644 index 529cb654f922c..0000000000000 --- a/onnxruntime/core/codegen/common/settings.cc +++ /dev/null @@ -1,78 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/common/settings.h" - -#include "core/common/logging/logging.h" -#include -#include - -namespace onnxruntime { -namespace codegen { - -CodeGenSettings& CodeGenSettings::Instance() { - static CodeGenSettings settings; - return settings; -} - -CodeGenSettings::CodeGenSettings() {} - -void CodeGenSettings::InsertOptions(const std::map& options) { - for (const auto& option : options) { - const auto& key = option.first; - const auto& value = option.second; - - auto iter = options_.find(key); - // found existing ones - if (iter != options_.end()) { - if (iter->second != value) { - LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << "CodeGenSettings: option" - << key << " is overridded from: " - << iter->second << " to: " << value; - iter->second = value; - } - } else { - options_.insert(std::make_pair(key, value)); - } - } -} - -void CodeGenSettings::DumpOptions() const { - std::ostringstream stream; - stream << "CodeGenSettings: dump all options" << std::endl; - for (const auto& option : options_) { - stream << " " << option.first << " = " << option.second << std::endl; - } - LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << stream.str(); -} - -std::string CodeGenSettings::GetOptionValue(const std::string& key) const { - const auto& iter = options_.find(key); - if (iter == options_.end()) { - LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << "CodeGenSettings::GetOptionValue: unrecognized option" << key; - return ""; - } - return iter->second; -} - -bool CodeGenSettings::HasOption(const std::string& key) const { - return options_.count(key) > 0; -} - -bool CodeGenSettings::OptionMatches(const std::string& key, const std::string& value) const { - if (!HasOption(key)) - return false; - -#ifdef _WIN32 - return 0 == _stricmp(options_.at(key).c_str(), value.c_str()); -#else - return 0 == strcasecmp(options_.at(key).c_str(), value.c_str()); -#endif -} - -void CodeGenSettings::Clear() { - options_.clear(); -} - -} // namespace codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/settings.h b/onnxruntime/core/codegen/common/settings.h deleted file mode 100644 index e327b0e207cc2..0000000000000 --- a/onnxruntime/core/codegen/common/settings.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace codegen { - -// use log level warning as default to make sure logs are outputted -#define CODEGEN_SETTINGS_LOG_LEVEL WARNING - -// This stores codegen settings to control dumps, execution preference, etc. -// CodeGenSettings could come from command line options or environment variables -// Or could come from a static variables in source code -class CodeGenSettings { - public: - // generic built-in options - constexpr static const char* kDumpAllOptions = "dump_all_options"; - constexpr static const char* kCodeGenDumpModule = "codegen_dump_module"; // dump tvm module - constexpr static const char* kCodeGenDumpLower = "codegen_dump_lower"; // dump lowered func - constexpr static const char* kCodeGenDumpSchedule = "codegen_dump_schedule"; // dump scheduler - - void InsertOptions(const std::map& options); - void DumpOptions() const; - std::string GetOptionValue(const std::string& key) const; - bool HasOption(const std::string& key) const; - bool OptionMatches(const std::string& key, const std::string& value) const; - void Clear(); - static CodeGenSettings& Instance(); - - private: - CodeGenSettings(); - - std::map options_; -}; - -} // namespace codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/target_info.h b/onnxruntime/core/codegen/common/target_info.h deleted file mode 100644 index da063545f0a1e..0000000000000 --- a/onnxruntime/core/codegen/common/target_info.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { - -// CodeGenTarget holds meta info for backend code generation -// and will be lowered to a target of corresponding backend -// code generation, e.g. TVM's Target. -class CodeGenTarget { - public: - CodeGenTarget() {} - CodeGenTarget(const std::string& target_name) - : target_name_(target_name) {} - - virtual int NaturalVectorWidth(int /*bits*/) const { - return 1; - } - - const std::string& GetTargetName() const { - return target_name_; - } - - virtual ~CodeGenTarget() = default; - - private: - std::string target_name_{"unknown"}; // default name is unknown -}; - -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/utils.cc b/onnxruntime/core/codegen/common/utils.cc deleted file mode 100644 index f4140a411bddf..0000000000000 --- a/onnxruntime/core/codegen/common/utils.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/common/utils.h" -#include "core/common/cpuid_info.h" -#include "core/common/safeint.h" - -#include -#include - -namespace onnxruntime { - -std::unique_ptr GetEnv(const char* var) { - char* val = nullptr; -#if _MSC_VER - size_t len; - - if (_dupenv_s(&val, &len, var)) { - // Something went wrong, just return nullptr. - return nullptr; - } -#else - val = getenv(var); -#endif // _MSC_VER - - if (val == nullptr) { - return nullptr; - } - - // On windows, we will have to explicitly free val. Instead of returning val - // to its caller and make distinguish between windows and linux, we return - // a unique_ptr, and it will be destroyed automatically after the caller - // completes. - size_t len_val = strnlen(val, onnxruntime::kMaxStrLen) + 1; - auto p = std::make_unique(len_val); - // use explicit loop to get ride of VC's warning on unsafe copy - for (size_t i = 0; i < len_val; ++i) { - p[i] = val[i]; - } - return p; -} - -bool IsEnvVarDefined(const char* var) { - auto val = GetEnv(var); - return val != nullptr; -} - -int64_t TotalSize(const std::vector& shape) { - SafeInt total = 1; - for (auto s : shape) { - total *= s; - } - return total; -} - -// Return the strides for the input shape, i.e. the number of -// elements contained by a single element of current dimension. -// For example, for shape[3][4][5][6], strides will be -// [4*5*6, 5*6, 6, 1], i.e. [120, 30, 6, 1] -void GetStrides(const int64_t* shape, int ndim, std::vector& strides) { - strides.resize(ndim); - strides[ndim - 1] = 1; - for (int64_t i = ndim - 2; i >= 0; i--) { - strides[i] = strides[i + 1] * shape[i + 1]; - } -} - -// Common utils to get target option -TargetFeature GetTargetInfo(const codegen::CodeGenSettings& settings) { - TargetFeature feature; - - std::string target_str = ""; - - bool isAVX = false; - bool isAVX2 = false; - bool isAVX512 = false; - if (target_str == "avx") { - isAVX = true; - } else if (target_str == "avx2") { - isAVX = true; - isAVX2 = true; - } else if (target_str == "avx512") { - isAVX = true; - isAVX2 = true; - isAVX512 = true; - } else { - isAVX = CPUIDInfo::GetCPUIDInfo().HasAVX(); - isAVX2 = CPUIDInfo::GetCPUIDInfo().HasAVX2(); - isAVX512 = CPUIDInfo::GetCPUIDInfo().HasAVX512Skylake(); - } - - feature.hasAVX = isAVX; - feature.hasAVX2 = isAVX2; - feature.hasAVX512 = isAVX512; - - return feature; -} - -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/common/utils.h b/onnxruntime/core/codegen/common/utils.h deleted file mode 100644 index ef06b5b72dc2c..0000000000000 --- a/onnxruntime/core/codegen/common/utils.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/common/common.h" -#include -#include -#include - -namespace onnxruntime { - -// Holding utility functions that are not tied to TVM and ORT - -std::unique_ptr GetEnv(const char* var); - -// Check if an environment variable is set -bool IsEnvVarDefined(const char* var); - -int64_t TotalSize(const std::vector& shape); - -void GetStrides(const int64_t* shape, int ndim, std::vector& strides); - -struct TargetFeature { - bool hasAVX; - bool hasAVX2; - bool hasAVX512; -}; - -TargetFeature GetTargetInfo(const codegen::CodeGenSettings& setttings); - -// GCD (Greatest Common Divisor) -template -T GCD(T a, T b) { - ORT_ENFORCE(a >= 0); - ORT_ENFORCE(b >= 0); - if (a < b) std::swap(a, b); - if (b == 0) return a; - while (a % b != 0) { - a = a % b; - std::swap(a, b); - } - return b; -} - -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/common.h b/onnxruntime/core/codegen/mti/common.h deleted file mode 100644 index d71e740b9284a..0000000000000 --- a/onnxruntime/core/codegen/mti/common.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include -#include - -#define MTI_ASSERT(condition) \ - if (!(condition)) { \ - std::string error_msg = "Not satisfied: " #condition \ - ": line " + \ - std::to_string(__LINE__) + \ - " in file " + std::string(__FILE__) + "\n"; \ - throw std::runtime_error(error_msg); \ - } diff --git a/onnxruntime/core/codegen/mti/debug/tvm_print.cc b/onnxruntime/core/codegen/mti/debug/tvm_print.cc deleted file mode 100644 index 0491636032b47..0000000000000 --- a/onnxruntime/core/codegen/mti/debug/tvm_print.cc +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/debug/tvm_print.h" - -#include "core/codegen/common/utils.h" -#include "core/codegen/common/dump_array.h" -#include "core/codegen/mti/common.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.print") - .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* /*ret*/) { - DLTensor* X = args[0]; - DLTensor* Y = args[1]; - - DLDataType dtype = X->dtype; - std::vector shape; - int64_t total_size = 1; - for (int i = 0; i < X->ndim; ++i) { - shape.push_back(X->shape[i]); - total_size *= X->shape[i]; - } - - // pass X to Y - memcpy(static_cast(Y->data) + Y->byte_offset, - static_cast(X->data) + X->byte_offset, - total_size * dtype.bits / 8); - - if (tvm::runtime::TypeMatch(dtype, kDLFloat, 32)) { - float* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); - DumpArray("float tensor:", data, shape); - } else if (tvm::runtime::TypeMatch(dtype, kDLInt, 8)) { - int8_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); - DumpArray("int8 tensor:", data, shape); - } else if (tvm::runtime::TypeMatch(dtype, kDLInt, 16)) { - int16_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); - DumpArray("int16 tensor:", data, shape); - } else if (tvm::runtime::TypeMatch(dtype, kDLInt, 32)) { - int32_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); - DumpArray("int32 tensor:", data, shape); - } else if (tvm::runtime::TypeMatch(dtype, kDLUInt, 8)) { - uint8_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); - DumpArray("uint8 tensor:", data, shape); - } else if (tvm::runtime::TypeMatch(dtype, kDLUInt, 16)) { - uint16_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); - DumpArray("uint16 tensor:", data, shape); - } else if (tvm::runtime::TypeMatch(dtype, kDLUInt, 32)) { - uint32_t* data = reinterpret_cast(static_cast(X->data) + X->byte_offset); - DumpArray("uint32 tensor:", data, shape); - } else { - MTI_ASSERT(0 && "not implemented!"); - } - }); - -tvm::Array -PrintTVMTensorExtern(const tvm::Tensor& X, - const std::string& name) { - return topi::detail::make_extern( - {X->shape}, - {X->dtype}, - {X}, - [&](tvm::Array ins, tvm::Array outs) { - return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.print"), - topi::detail::pack_buffer(ins[0]), - topi::detail::pack_buffer(outs[0])}); - }, - name + "_print", "", {}); -} - -tvm::Tensor PrintImmutable(const tvm::Tensor& X) { - auto outputs = PrintTVMTensorExtern(X, X->op->name + "_print"); - return outputs[0]; -} - -void Print(tvm::Tensor& X) { - X = PrintImmutable(X); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/debug/tvm_print.h b/onnxruntime/core/codegen/mti/debug/tvm_print.h deleted file mode 100644 index 91a334785a2a4..0000000000000 --- a/onnxruntime/core/codegen/mti/debug/tvm_print.h +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Array PrintTVMTensorExtern( - const tvm::Tensor& X, - const std::string& name = "PrintTVM2DTensorExtern"); - -tvm::Tensor PrintImmutable(const tvm::Tensor& X); - -void Print(tvm::Tensor& X); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/binary_ops.cc b/onnxruntime/core/codegen/mti/math/binary_ops.cc deleted file mode 100644 index f3048799458f4..0000000000000 --- a/onnxruntime/core/codegen/mti/math/binary_ops.cc +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/math/binary_ops.h" - -#include "core/codegen/mti/math/unary_ops.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/cast_ops.h" -#include - -// Using namespace topi for override operator +-*/ -using namespace topi; - -namespace onnxruntime { -namespace tvm_codegen { - -#define TVM_BINARY_OP1(op, expr) \ - tvm::Tensor op(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { \ - return Rename(expr, name); \ - } \ - tvm::Tensor op(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name) { \ - return Rename(expr, name); \ - } - -#define TVM_BINARY_OP(op, expr) \ - TVM_BINARY_OP1(op, expr) \ - tvm::Tensor op(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name) { \ - return Rename(expr, name); \ - } - -TVM_BINARY_OP(Add, lhs + rhs); -TVM_BINARY_OP(Div, lhs / rhs); -TVM_BINARY_OP(Max, maximum(lhs, rhs)); -TVM_BINARY_OP(Min, minimum(lhs, rhs)); -TVM_BINARY_OP(Mul, lhs* rhs); -TVM_BINARY_OP1(PRelu, Relu(lhs) - rhs * Relu(0 - lhs)); -TVM_BINARY_OP(Sub, lhs - rhs); - -tvm::Tensor Equal(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { - return topi::equal(lhs, rhs, name); -} -tvm::Tensor Equal(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name) { - return topi::equal(lhs, rhs, name); -} -tvm::Tensor Equal(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name) { - return topi::equal(lhs, rhs, name); -} - -tvm::Tensor Greater(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { - return topi::greater(lhs, rhs, name); -} -tvm::Tensor Greater(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name) { - return topi::greater(lhs, rhs, name); -} -tvm::Tensor Greater(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name) { - return topi::greater(lhs, rhs, name); -} - -tvm::Tensor Less(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { - return topi::less(lhs, rhs, name); -} -tvm::Tensor Less(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name) { - return topi::less(lhs, rhs, name); -} -tvm::Tensor Less(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name) { - return topi::less(lhs, rhs, name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/binary_ops.h b/onnxruntime/core/codegen/mti/math/binary_ops.h deleted file mode 100644 index dd51ce5e7917d..0000000000000 --- a/onnxruntime/core/codegen/mti/math/binary_ops.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Add(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "add"); -tvm::Tensor Add(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "add"); -tvm::Tensor Add(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "add"); -tvm::Tensor Div(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "div"); -tvm::Tensor Div(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "div"); -tvm::Tensor Div(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "div"); -tvm::Tensor Equal(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "equal"); -tvm::Tensor Equal(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "equal"); -tvm::Tensor Equal(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "equal"); -tvm::Tensor Greater(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "greater"); -tvm::Tensor Greater(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "greater"); -tvm::Tensor Greater(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "greater"); -tvm::Tensor Less(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "less"); -tvm::Tensor Less(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "less"); -tvm::Tensor Less(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "less"); -tvm::Tensor Max(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "max"); -tvm::Tensor Max(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "max"); -tvm::Tensor Max(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "max"); -tvm::Tensor Min(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "min"); -tvm::Tensor Min(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "min"); -tvm::Tensor Min(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "min"); -tvm::Tensor Mul(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "mul"); -tvm::Tensor Mul(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "mul"); -tvm::Tensor Mul(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "mul"); -tvm::Tensor PRelu(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "prelu"); -tvm::Tensor PRelu(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "prelu"); -tvm::Tensor Sub(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name = "sub"); -tvm::Tensor Sub(const tvm::Tensor& lhs, const tvm::Expr& rhs, const std::string& name = "sub"); -tvm::Tensor Sub(const tvm::Expr& lhs, const tvm::Tensor& rhs, const std::string& name = "sub"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/gemm.cc b/onnxruntime/core/codegen/mti/math/gemm.cc deleted file mode 100644 index 7a79513ccaa97..0000000000000 --- a/onnxruntime/core/codegen/mti/math/gemm.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/math/gemm.h" - -#include "core/codegen/mti/math/matmul_ops.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include - -// Using namespace topi for override operator +-*/ -using namespace topi; - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Gemm(const tvm::Tensor& A, const tvm::Tensor& B, const tvm::Tensor& C, - bool trans_A, bool trans_B, float alpha, float beta, - const std::string& name) { - auto A_dot_B = MatMul2D(A, B, trans_A, trans_B, name + "_matmul2d"); - tvm::Expr alphaExpr = tvm::make_const(A->dtype, alpha); - if (beta != 0) { - tvm::Expr betaExpr = tvm::make_const(A->dtype, beta); - return Rename(alphaExpr * A_dot_B + (betaExpr * C), name); - } else { - return Rename(alphaExpr * A_dot_B, name); - } -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/gemm.h b/onnxruntime/core/codegen/mti/math/gemm.h deleted file mode 100644 index 3bb205c13fdc9..0000000000000 --- a/onnxruntime/core/codegen/mti/math/gemm.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Gemm(const tvm::Tensor& p_A, const tvm::Tensor& p_B, const tvm::Tensor& p_C, - bool trans_A, bool trans_B, float alpha, float beta, - const std::string& name = "gemm"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/logsoftmax.cc b/onnxruntime/core/codegen/mti/math/logsoftmax.cc deleted file mode 100644 index cd8c2edae6959..0000000000000 --- a/onnxruntime/core/codegen/mti/math/logsoftmax.cc +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/math/logsoftmax.h" - -#include "core/codegen/mti/tensor/reshape_ops.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor LogSoftmax(const tvm::Tensor& input, int64_t axis, const std::string& name) { - tvm::Tensor flatten_t = Flatten(input, axis, "logsoftmax_flatten"); - return Reshape(topi::nn::log_softmax(flatten_t, name), input->shape, "logsoftmax_reshape"); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/logsoftmax.h b/onnxruntime/core/codegen/mti/math/logsoftmax.h deleted file mode 100644 index 606a32806434b..0000000000000 --- a/onnxruntime/core/codegen/mti/math/logsoftmax.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor LogSoftmax(const tvm::Tensor& input, int64_t axis, const std::string& name = "logsoftmax"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/matmul_ops.cc b/onnxruntime/core/codegen/mti/math/matmul_ops.cc deleted file mode 100644 index 6ecf2f69a9c25..0000000000000 --- a/onnxruntime/core/codegen/mti/math/matmul_ops.cc +++ /dev/null @@ -1,161 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/math/matmul_ops.h" - -#include "core/codegen/mti/common.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B, bool trans_a, bool trans_b, const std::string& name) { - return topi::matmul(A, B, trans_a, trans_b, name); -} - -/* - * Generic Matrix Multiplication - * - * If both arguments are 2-D, they are multiplied like conventional matrices. - * - * If either argument is N-D and N > 2, it is treated as a stack of matrices residing in the last two indexes and broadcast accordingly. - * - * If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. - * After matrix multiplication the prepended 1 is removed. - * - * If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. - * After matrix multiplication the appended 1 is removed. - */ -tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string& name) { - int64_t a_rank = static_cast(A->shape.size()); - int64_t b_rank = static_cast(B->shape.size()); - const auto& A_shape = A->shape; - const auto& B_shape = B->shape; - if (a_rank == 2 && b_rank == 2) { - // 2-D X 2-D - return MatMul2D(A, B); - } else if (a_rank == 1 && b_rank == 1) { - // 1-D X 1-D - auto k = tvm::reduce_axis(tvm::Range(0, A_shape[0]), "k"); - - return tvm::compute( - {}, - [&](const tvm::Array& /*indices*/) { - return tvm::sum(A[k] * B[k], {k}); - }, - name); - } else if (a_rank == 1) { - // 1-D X n-D - auto k = tvm::reduce_axis(tvm::Range(0, A_shape[0]), "k"); - - auto l = [&](const tvm::Array& indices) { - auto ndims = indices.size(); - MTI_ASSERT(ndims >= 1); - tvm::Array b_indices; - for (size_t bi = 0; bi < ndims - 1; ++bi) { - b_indices.push_back(indices[bi]); - } - b_indices.push_back(k); - b_indices.push_back(indices[ndims - 1]); - return tvm::sum(A({k}) * B(b_indices), {k}); - }; - return tvm::compute(ConcatShapes(SliceShapeToDimension(B_shape, -2), SliceShapeFromDimension(B_shape, -1)), l, name); - } else if (b_rank == 1) { - // n-D X 1-D - auto k = tvm::reduce_axis(tvm::Range(0, B_shape[0]), "k"); - - auto l = [&](const tvm::Array& indices) { - tvm::Array a_indices(indices.begin(), indices.end()); - a_indices.push_back(k); - return tvm::sum(A(a_indices) * B({k}), {k}); - }; - return tvm::compute(SliceShapeToDimension(A->shape, -1), l, name); - } else { - // n-D X m-D - MTI_ASSERT(a_rank >= 2 && b_rank >= 2); - auto k = tvm::reduce_axis(tvm::Range(0, A_shape[a_rank - 1]), "k"); - - auto l = [&](const tvm::Array& indices) { - auto ndims = static_cast(indices.size()); - MTI_ASSERT(ndims > 2); - tvm::Array a_indices, b_indices; - - // handle broadcasting - int i = 0, a_idx = 0, b_idx = 0; - bool a_greater = a_rank > b_rank; - for (; i < std::abs(a_rank - b_rank); ++i) { - if (a_greater) { - a_indices.push_back(indices[i]); - a_idx++; - } else { - b_indices.push_back(indices[i]); - b_idx++; - } - } - for (; i < ndims - 2; ++i, ++a_idx, ++b_idx) { - auto tp = indices[i].type(); - if (IsOne(A_shape, a_idx)) { - a_indices.push_back(tvm::make_zero(tp)); - b_indices.push_back(indices[i]); - } else if (IsOne(B_shape, b_idx)) { - b_indices.push_back(tvm::make_zero(tp)); - a_indices.push_back(indices[i]); - } else { - a_indices.push_back(indices[i]); - b_indices.push_back(indices[i]); - } - } - - MTI_ASSERT(a_idx == a_rank - 2 && b_idx == b_rank - 2); - a_indices.push_back(indices[ndims - 2]); - a_indices.push_back(k); - - b_indices.push_back(k); - b_indices.push_back(indices[ndims - 1]); - - return tvm::sum(A(a_indices) * B(b_indices), {k}); - }; - - return tvm::compute(ComputeMatMulShape(A_shape, B_shape), l, name); - } -} - -tvm::Array -ComputeMatMulShape( - const tvm::Array& A_shape, - const tvm::Array& B_shape, - bool trans_a, - bool trans_b) { - auto a_rank = A_shape.size(); - auto b_rank = B_shape.size(); - tvm::Array output_shape; - int64_t output_rank = std::max(a_rank, b_rank); - MTI_ASSERT(a_rank > 0 && b_rank > 0); - if (a_rank == 1 && b_rank == 1) { - MTI_ASSERT(!trans_a && !trans_b); - // reduction, output shape is empty - } else if (a_rank == 1) { - MTI_ASSERT(!trans_a && !trans_b); - output_shape = SliceShapeToDimension(B_shape, b_rank - 2); - output_shape.push_back(B_shape[b_rank - 1]); - } else if (b_rank == 1) { - MTI_ASSERT(!trans_a && !trans_b); - output_shape = SliceShapeToDimension(A_shape, a_rank - 1); - } else { - for (int64_t i = 0; i < output_rank - 2; i++) { - tvm::Expr broadcasted_dim = tvm::make_const(HalideIR::Int(32), 1); - bool broadcasted = - BroadcastDim(A_shape, i, output_rank, broadcasted_dim) && - BroadcastDim(B_shape, i, output_rank, broadcasted_dim); - MTI_ASSERT(broadcasted); - output_shape.push_back(broadcasted_dim); - } - output_shape.push_back(A_shape[a_rank - (trans_a ? 1 : 2)]); - output_shape.push_back(B_shape[b_rank - (trans_b ? 2 : 1)]); - } - return output_shape; -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/matmul_ops.h b/onnxruntime/core/codegen/mti/math/matmul_ops.h deleted file mode 100644 index ab9986132d34a..0000000000000 --- a/onnxruntime/core/codegen/mti/math/matmul_ops.h +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Array -ComputeMatMulShape( - const tvm::Array& A_shape, - const tvm::Array& B_shape, - bool trans_a = false, - bool trans_b = false); - -tvm::Tensor MatMul2D(const tvm::Tensor& A, const tvm::Tensor& B, bool trans_a = false, bool trans_b = false, const std::string& name = "matmul2d"); - -tvm::Tensor MatMul(const tvm::Tensor& A, const tvm::Tensor& B, const std::string& name = "matmul"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/reduce_ops.cc b/onnxruntime/core/codegen/mti/math/reduce_ops.cc deleted file mode 100644 index 7d179e2b04316..0000000000000 --- a/onnxruntime/core/codegen/mti/math/reduce_ops.cc +++ /dev/null @@ -1,90 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/math/reduce_ops.h" - -#include "core/codegen/mti/math/binary_ops.h" -#include "core/codegen/mti/math/unary_ops.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor ArgMax(const tvm::Tensor& X, int64_t axis, bool keep_dims, const std::string& name) { - return Rename(topi::argmax(X, ToTvmArrayInt({axis}), keep_dims), name); -} - -tvm::Tensor ArgMin(const tvm::Tensor& X, int64_t axis, bool keep_dims, const std::string& name) { - return Rename(topi::argmin(X, ToTvmArrayInt({axis}), keep_dims), name); -} - -tvm::Tensor ReduceL1(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - return ReduceSum(Abs(X), axes, keep_dims, name); -} - -tvm::Tensor ReduceL2(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - return Sqrt(ReduceSumSquare(X, axes, keep_dims), name); -} - -tvm::Tensor ReduceLogSum(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - return Log(ReduceSum(X, axes, keep_dims), name); -} - -tvm::Tensor ReduceLogSumExp(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - tvm::Tensor reduce_max = ReduceMax(X, axes, true); - tvm::Tensor exp_delta = Exp(Sub(X, reduce_max)); - tvm::Tensor reduce_max_keep_dims = ReduceMax(X, axes, keep_dims); - return Add(ReduceLogSum(exp_delta, axes, keep_dims), reduce_max_keep_dims, name); -} - -tvm::Tensor ReduceMax(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - return Rename(topi::max(X, ToTvmArrayInt(axes), keep_dims), name); -} - -tvm::Tensor ReduceMean(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - tvm::Tensor reduce_sum = ReduceSum(X, axes, keep_dims); - tvm::Expr count = tvm::make_const(reduce_sum->dtype, 1.0f); - if (axes.empty()) { - for (const auto& dim : X->shape) - count = count * dim; - } else { - for (int64_t axis : axes) { - int64_t i = HandleNegativeAxis(axis, X->shape.size()); - count = count * X->shape[i]; - } - } - return tvm::compute( - reduce_sum->shape, - [&](const tvm::Array& i) { - return reduce_sum(i) / count; - }, - name); -} - -tvm::Tensor ReduceMin(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - return Rename(topi::min(X, ToTvmArrayInt(axes), keep_dims), name); -} - -tvm::Tensor ReduceProd(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - auto prod = [](tvm::Expr source, tvm::Array rdom) { - tvm::Var x("x", source.type()), y("y", source.type()); - tvm::Expr Rename_element = tvm::make_const(source.type(), 1.0f); - tvm::ir::CommReducer combiner = - tvm::ir::CommReducerNode::make({x}, {y}, {x * y}, {Rename_element}); - return tvm::ir::Reduce::make(combiner, {source}, rdom, tvm::make_const(tvm::Bool(1), true), 0); - }; - - return Rename(topi::CommReduce(X, ToTvmArrayInt(axes), prod, keep_dims, true), name); -} - -tvm::Tensor ReduceSum(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - return Rename(topi::sum(X, ToTvmArrayInt(axes), keep_dims), name); -} - -tvm::Tensor ReduceSumSquare(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name) { - return Rename(topi::sum(Mul(X, X), ToTvmArrayInt(axes), keep_dims), name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/reduce_ops.h b/onnxruntime/core/codegen/mti/math/reduce_ops.h deleted file mode 100644 index f782df5e6515f..0000000000000 --- a/onnxruntime/core/codegen/mti/math/reduce_ops.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor ArgMax(const tvm::Tensor& X, - int64_t axis, - bool keep_dims, - const std::string& name = "argmax"); - -tvm::Tensor ArgMin(const tvm::Tensor& X, - int64_t axis, - bool keep_dims, - const std::string& name = "argmin"); - -tvm::Tensor ReduceL1(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "reduce_l1"); - -tvm::Tensor ReduceL2(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "reduce_l2"); - -tvm::Tensor ReduceLogSum(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "reduce_log_sum"); - -tvm::Tensor ReduceLogSumExp(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "argmareduce_log_sum_exp"); - -tvm::Tensor ReduceMax(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "reduce_max"); - -tvm::Tensor ReduceMean(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "reduce_mean"); - -tvm::Tensor ReduceMin(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "reduce_min"); - -tvm::Tensor ReduceProd(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "reduce_prod"); - -tvm::Tensor ReduceSum(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "reduce_sum"); - -tvm::Tensor ReduceSumSquare(const tvm::Tensor& X, - const std::vector& axes, - bool keep_dims, - const std::string& name = "reduce_sum_square"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/softmax.cc b/onnxruntime/core/codegen/mti/math/softmax.cc deleted file mode 100644 index d7404137bb873..0000000000000 --- a/onnxruntime/core/codegen/mti/math/softmax.cc +++ /dev/null @@ -1,18 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/math/softmax.h" - -#include "core/codegen/mti/tensor/reshape_ops.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Softmax(const tvm::Tensor& input, int64_t axis, const std::string& name) { - tvm::Tensor flatten_t = Flatten(input, axis, "softmax_flatten"); - return Reshape(topi::nn::softmax(flatten_t, 1, name), input->shape, "softmax_reshape"); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/softmax.h b/onnxruntime/core/codegen/mti/math/softmax.h deleted file mode 100644 index fb16fbaeb56a2..0000000000000 --- a/onnxruntime/core/codegen/mti/math/softmax.h +++ /dev/null @@ -1,11 +0,0 @@ -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Softmax(const tvm::Tensor& input, int64_t axis, const std::string& name = "softmax"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/unary_ops.cc b/onnxruntime/core/codegen/mti/math/unary_ops.cc deleted file mode 100644 index ae732ea33e670..0000000000000 --- a/onnxruntime/core/codegen/mti/math/unary_ops.cc +++ /dev/null @@ -1,155 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/math/unary_ops.h" - -#include "core/codegen/common/settings.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include -#include -#include -#include - -// Using namespace topi for override operator +-*/ -using namespace topi; - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Abs(const tvm::Tensor& X, const std::string& name) { - return abs(X, name); -} - -tvm::Tensor Affine(const tvm::Tensor& X, float alpha, float beta, const std::string& name) { - tvm::Expr alphaExpr = tvm::make_const(X->dtype, alpha); - tvm::Expr betaExpr = tvm::make_const(X->dtype, beta); - return Rename(alphaExpr * X + betaExpr, name); -} - -tvm::Tensor Ceil(const tvm::Tensor& X, const std::string& name) { - return topi::ceil(X, name); -} - -tvm::Tensor Clip(const tvm::Tensor& X, tvm::Expr min_value, tvm::Expr max_value, const std::string& name) { - auto Y = tvm::compute( - X->shape, - [&](const tvm::Array& indices) { - return tvm::min(tvm::max(X(indices), min_value), max_value); - }, - name); - return Y; -} - -tvm::Tensor Elu(const tvm::Tensor& X, float alpha, const std::string& name) { - tvm::Expr alphaExpr = tvm::make_const(X->dtype, alpha); - return Rename(Relu(X) - alphaExpr * Relu(1 - Exp(X)), name); -} - -tvm::Tensor Exp(const tvm::Tensor& X, const std::string& name) { - return tvm::compute( - X->shape, - [&](const tvm::Array& indices) { - return tvm::exp(X(indices)); - }, - name); -} - -tvm::Tensor Floor(const tvm::Tensor& X, const std::string& name) { - return topi::floor(X, name); -} - -tvm::Tensor HardSigmoid(const tvm::Tensor& X, float alpha, float beta, const std::string& name) { - tvm::Expr alphaExpr = tvm::make_const(X->dtype, alpha); - tvm::Expr betaExpr = tvm::make_const(X->dtype, beta); - return maximum(0, minimum(1, alphaExpr * X + betaExpr), name); -} - -tvm::Tensor LeakyRelu(const tvm::Tensor& X, float alpha, const std::string& name) { - tvm::Expr alphaExpr = tvm::make_const(X->dtype, alpha); - return Rename(Relu(X) - alphaExpr * Relu(0 - X), name); -} - -tvm::Tensor Log(const tvm::Tensor& X, const std::string& name) { - return tvm::compute( - X->shape, - [&](const tvm::Array& indices) { - return tvm::log(X(indices)); - }, - name); -} - -tvm::Tensor Neg(const tvm::Tensor& X, const std::string& name) { - return negative(X, name); -} - -tvm::Tensor ParametricSoftplus(const tvm::Tensor& X, float alpha, float beta, const std::string& name) { - tvm::Expr alphaExpr = tvm::make_const(X->dtype, alpha); - tvm::Expr betaExpr = tvm::make_const(X->dtype, beta); - return Rename(alphaExpr * Softplus(betaExpr * X), name); -} - -tvm::Tensor Reciprocal(const tvm::Tensor& X, const std::string& name) { - return Rename(1 / X, name); -} - -tvm::Tensor Relu(const tvm::Tensor& X, const std::string& name) { - return maximum(X, 0, name); -} - -tvm::Tensor ScaledTanh(const tvm::Tensor& X, float alpha, float beta, const std::string& name) { - tvm::Expr alphaExpr = tvm::make_const(X->dtype, alpha); - tvm::Expr betaExpr = tvm::make_const(X->dtype, beta); - return Rename(alphaExpr * Tanh(betaExpr * X), name); -} - -tvm::Tensor Selu(const tvm::Tensor& X, float alpha, float gamma, const std::string& name) { - tvm::Expr alphaExpr = tvm::make_const(X->dtype, alpha); - tvm::Expr gammaExpr = tvm::make_const(X->dtype, gamma); - return Rename(gammaExpr * (-alphaExpr * Relu(1 - Exp(X)) + Relu(X)), name); -} - -tvm::Tensor Sigmoid(const tvm::Tensor& X, const std::string& name) { - return tvm::compute( - X->shape, - [&](const tvm::Array& indices) { - return tvm::ir::Select::make(X(indices) > 0, - 1 / (1 + tvm::exp(-X(indices))), - tvm::exp(X(indices)) / (tvm::exp(X(indices)) + 1)); - }, - name); -} - -tvm::Tensor SignNoZero(const tvm::Tensor& X, const std::string& name) { - return Rename(greater_equal(X, 0) * 2 - 1, name); -} - -tvm::Tensor Softplus(const tvm::Tensor& X, const std::string& name) { - return Rename(Log(1 + Exp(Neg(Abs(X)))) + Relu(X), name); -} - -tvm::Tensor Softsign(const tvm::Tensor& X, const std::string& name) { - return Rename(X / (1 + Abs(X)), name); -} - -tvm::Tensor Sqrt(const tvm::Tensor& X, const std::string& name) { - return sqrt(X, name); -} - -tvm::Tensor Tanh(const tvm::Tensor& X, const std::string& name) { - return tvm::compute( - X->shape, - [&](const tvm::Array& indices) { - return tvm::ir::Select::make(X(indices) < 0, - (tvm::exp(2 * X(indices)) - 1) / (tvm::exp(2 * X(indices)) + 1), - (1 - tvm::exp(-2 * X(indices))) / (1 + tvm::exp(-2 * X(indices)))); - }, - name); -} - -tvm::Tensor ThresholdedRelu(const tvm::Tensor& X, float alpha, const std::string& name) { - tvm::Expr alphaExpr = tvm::make_const(X->dtype, alpha); - return topi::where(greater(X, alphaExpr), X, topi::full_like(X, tvm::make_zero(X->dtype)), name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/math/unary_ops.h b/onnxruntime/core/codegen/mti/math/unary_ops.h deleted file mode 100644 index aeb336262e547..0000000000000 --- a/onnxruntime/core/codegen/mti/math/unary_ops.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Abs(const tvm::Tensor& X, const std::string& name = "abs"); -tvm::Tensor Affine(const tvm::Tensor& X, float alpha, float beta, const std::string& name = "affine"); -tvm::Tensor Ceil(const tvm::Tensor& X, const std::string& name = "ceil"); -tvm::Tensor Clip(const tvm::Tensor& X, tvm::Expr min_value, tvm::Expr max_value, const std::string& name = "clip"); -tvm::Tensor Elu(const tvm::Tensor& X, float alpha, const std::string& name = "elu"); -tvm::Tensor Exp(const tvm::Tensor& X, const std::string& name = "exp"); -tvm::Tensor Floor(const tvm::Tensor& X, const std::string& name = "floor"); -tvm::Tensor HardSigmoid(const tvm::Tensor& X, float alpha, float beta, const std::string& name = "hard_sigmoid"); -tvm::Tensor LeakyRelu(const tvm::Tensor& X, float alpha, const std::string& name = "leaky_relu"); -tvm::Tensor Log(const tvm::Tensor& X, const std::string& name = "log"); -tvm::Tensor Neg(const tvm::Tensor& X, const std::string& name = "neg"); -tvm::Tensor ParametricSoftplus(const tvm::Tensor& X, float alpha, float beta, const std::string& name = "parametric_softplus"); -tvm::Tensor Reciprocal(const tvm::Tensor& X, const std::string& name = "reciprocal"); -tvm::Tensor Relu(const tvm::Tensor& X, const std::string& name = "relu"); -tvm::Tensor ScaledTanh(const tvm::Tensor& X, float alpha, float beta, const std::string& name = "scaled_tanh"); -tvm::Tensor Selu(const tvm::Tensor& X, float alpha, float gamma, const std::string& name = "selu"); -tvm::Tensor Sigmoid(const tvm::Tensor& X, const std::string& name = "sigmoid"); -tvm::Tensor SignNoZero(const tvm::Tensor& X, const std::string& name = "sign_no_zero"); -tvm::Tensor Softplus(const tvm::Tensor& X, const std::string& name = "softplus"); -tvm::Tensor Softsign(const tvm::Tensor& X, const std::string& name = "softsign"); -tvm::Tensor Sqrt(const tvm::Tensor& X, const std::string& name = "sqrt"); -tvm::Tensor Tanh(const tvm::Tensor& X, const std::string& name = "tanh"); -tvm::Tensor ThresholdedRelu(const tvm::Tensor& X, float alpha, const std::string& name = "thresholded_relu"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/mti_tvm_utils.cc b/onnxruntime/core/codegen/mti/mti_tvm_utils.cc deleted file mode 100644 index 8e73629c05614..0000000000000 --- a/onnxruntime/core/codegen/mti/mti_tvm_utils.cc +++ /dev/null @@ -1,203 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/mti_tvm_utils.h" - -#include "core/codegen/common/settings.h" -#include "core/codegen/mti/tensor/reshape_ops.h" -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Array ToTvmArray(gsl::span shape) { - tvm::Array arr; - for (size_t i = 0; i < shape.size(); ++i) { - arr.push_back(tvm::Expr(static_cast(shape[i]))); - } - return arr; -} - -tvm::Array ToTvmArrayInt(gsl::span shape) { - tvm::Array arr; - for (size_t i = 0; i < shape.size(); ++i) { - arr.push_back(shape[i]); - } - return arr; -} - -tvm::Expr SizeToDimension(const tvm::Array& shape, int64_t axis) { - tvm::Expr size(1); - auto rank = shape.size(); - if (static_cast(axis) != rank) { - axis = HandleNegativeAxis(axis, rank); - } - for (size_t d = 0; d < std::min(rank, static_cast(axis)); ++d) - size = tvm::ir::Simplify(size * shape[d]); - return size; -} - -tvm::Expr SizeFromDimension(const tvm::Array& shape, int64_t axis) { - tvm::Expr size(1); - auto rank = shape.size(); - if (static_cast(axis) != rank) { - axis = HandleNegativeAxis(axis, rank); - } - for (size_t d = static_cast(axis); d < rank; ++d) - size = tvm::ir::Simplify(size * shape[d]); - return size; -} - -tvm::Expr RoundUp(tvm::Expr value, tvm::Expr alignment) { - return tvm::ir::Simplify((value + alignment - 1) / alignment * alignment); -} - -tvm::Array ConcatShapes( - const tvm::Array& shape1, - const tvm::Array& shape2) { - tvm::Array result; - for (size_t i = 0; i < shape1.size(); i++) - result.push_back(shape1[i]); - for (size_t i = 0; i < shape2.size(); i++) - result.push_back(shape2[i]); - return result; -} - -tvm::Tensor Rename(tvm::Tensor X, const std::string& name) { - const_cast(X->op->name) = name; - return X; -} - -tvm::Array SliceShape(const tvm::Array& shape, const std::vector& axes) { - tvm::Array new_shape; - for (auto axis : axes) { - CHECK(axis < static_cast(shape.size())); - new_shape.push_back(shape[axis]); - } - return new_shape; -} - -tvm::Array SliceShapeFromDimension(const tvm::Array& shape, int64_t axis) { - int64_t rank = static_cast(shape.size()); - axis = HandleNegativeAxis(axis, rank); - std::vector axes; - for (auto i = axis; i < rank; ++i) - axes.push_back(i); - return SliceShape(shape, axes); -} - -tvm::Array SliceShapeToDimension(const tvm::Array& shape, int64_t axis) { - int64_t rank = static_cast(shape.size()); - axis = HandleNegativeAxis(axis, rank); - std::vector axes; - for (auto i = 0; i < axis; ++i) - axes.push_back(i); - return SliceShape(shape, axes); -} - -bool IsOne(const tvm::Array& shape, int64_t axis) { - int64_t rank = static_cast(shape.size()); - axis = HandleNegativeAxis(axis, rank); - const auto& dim = shape[axis]; - auto* p = tvm::as_const_int(dim); - return p != nullptr && *p == 1; -} - -tvm::Tensor Promote(const tvm::Expr& expr, const tvm::Array& shape, const std::string& name) { - return tvm::compute( - shape, - [&](const tvm::Array&) { - return expr; - }, - name); -} - -void DumpTVMModuleToFile(const std::string& filename, tvm::runtime::Module& module) { - const codegen::CodeGenSettings& settings = codegen::CodeGenSettings::Instance(); - if (!settings.HasOption(codegen::CodeGenSettings::kCodeGenDumpModule)) - return; - - // ISSUE: note that all option values are converted to lower case. It doesn't cause - // any issue currently, because all supported formats (i.e. file exts) are of lower case. - // Just keep in mind that we might have issue if somehow we started to support dump - // formats with upper case, although it's quite unlikely. - std::string format = settings.GetOptionValue(codegen::CodeGenSettings::kCodeGenDumpModule); - std::string module_filename = filename + "." + format; - module->SaveToFile(module_filename, format); -} - -tvm::Tensor MakeZeroTensor(const tvm::Array& shape, - HalideIR::Type type, - const std::string& name) { - auto l = [&](const tvm::Array& /*indices*/) { - return tvm::make_zero(type); - }; - return tvm::compute(shape, l, name); -} - -bool BroadcastDim(const tvm::Array& shape, size_t i, size_t output_rank, tvm::Expr& dim) { - if (i >= output_rank - shape.size()) { - auto new_dim = shape[shape.size() - output_rank + i]; - if (tvm::ir::Equal(new_dim, dim)) - return true; - - const int64_t* p_new = tvm::as_const_int(new_dim); - if (p_new != nullptr && *p_new == 1) { - return true; - } else { - const int64_t* p_old = tvm::as_const_int(dim); - if (p_old != nullptr && *p_old == 1) { - dim = new_dim; - return true; - } - } - return false; - } - // auto broadcast to outer dims - return true; -} - -tvm::Array MakeInputsForExtern(const tvm::Array& inputs, const std::string& name) { - // note that currently TVM StorageFlatten creates strides like max(symbolic_dim, 1) - // which is not zero when checking symbolic_dim - max(symbolic_dim, 1) - // then triggers error like: Trying to bind compact buffer to strided one - // here's a workaround to reshape inputs to avoid that - tvm::Array fixed_inputs; - for (size_t idx_input = 0; idx_input < inputs.size(); ++idx_input) { - const auto& input = inputs[idx_input]; - tvm::Array fixed_shape; - if (input->shape.size() > 0) { - // stride compute does not use dim 0, so directly push to fixed_shape - fixed_shape.push_back(input->shape[0]); - bool need_fix = false; - for (size_t idx_dim = 1; idx_dim < input->shape.size(); ++idx_dim) { - const auto& dim = input->shape[idx_dim]; - if (tvm::as_const_int(dim) == nullptr) { - fixed_shape.push_back(tvm::max(dim, tvm::make_const(HalideIR::Int(32), 1))); - need_fix = true; - } else { - fixed_shape.push_back(dim); - } - } - if (need_fix) { - fixed_inputs.push_back(tvm_codegen::Reshape(input, fixed_shape, name + "_" + std::to_string(idx_input))); - continue; - } - } - // no fix needed - fixed_inputs.push_back(input); - } - return fixed_inputs; -} - -// Make sure idx is clamped in the range of [-bound, bound - 1] -tvm::Expr ClampIndex(const tvm::Expr& idx, const tvm::Expr& bound) { - // when idx >= 0, we take tvm::max(..., 0), because (idx < 0) is 0 - // when idx < 0, we take bound + tvm::max(...), because tvm::max(idx, 0) is 0 - return tvm::max(tvm::min(idx, bound - 1), 0) + - (idx < 0) * (bound + tvm::max(idx, -bound)); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/mti_tvm_utils.h b/onnxruntime/core/codegen/mti/mti_tvm_utils.h deleted file mode 100644 index c2a14106c1686..0000000000000 --- a/onnxruntime/core/codegen/mti/mti_tvm_utils.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include -#include -#include -#include -#include "core/codegen/mti/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Array ToTvmArray(gsl::span shape); - -tvm::Array ToTvmArrayInt(gsl::span shape); - -// Helper function to compute sub shape size to axis (not included) -tvm::Expr SizeToDimension(const tvm::Array& shape, int64_t axis); - -// Helper function to compute sub shape size from axis (included) -tvm::Expr SizeFromDimension(const tvm::Array& shape, int64_t axis); - -// Helper function to align -tvm::Expr RoundUp(tvm::Expr value, tvm::Expr alignment); - -tvm::Array ConcatShapes( - const tvm::Array& shape1, - const tvm::Array& shape2); - -// Helper function to rename tvm::Tensor -tvm::Tensor Rename(tvm::Tensor X, const std::string& name); - -// Helper function to slice TVM shape -tvm::Array SliceShape(const tvm::Array& shape, const std::vector& axes); - -// Helper function to slice TVM shape from axis (inclusive). -// Basically, this function returns the shape of [axis, shape.size()-1] -tvm::Array SliceShapeFromDimension(const tvm::Array& shape, int64_t axis); - -// this function returns the shape of [0, axis-1] -tvm::Array SliceShapeToDimension(const tvm::Array& shape, int64_t axis); - -// check if dimension is 1 -bool IsOne(const tvm::Array& shape, int64_t axis); - -// Helper function to convert tvm::Expr to tvm::Tensor -tvm::Tensor Promote(const tvm::Expr& expr, - const tvm::Array& shape, - const std::string& name = "PromoteExpr"); - -tvm::Tensor MakeZeroTensor(const tvm::Array& shape, HalideIR::Type type, const std::string& name); - -void DumpTVMModuleToFile(const std::string& filename, tvm::runtime::Module& module); - -bool BroadcastDim(const tvm::Array& shape, size_t i, size_t output_rank, tvm::Expr& dim); - -inline int64_t HandleNegativeAxis(int64_t axis, int64_t rank) { - MTI_ASSERT(axis >= -rank && axis <= rank - 1); - return axis = axis < 0 ? (axis + rank) : axis; -} - -// Make sure idx is clamped in the range of [-bound, bound - 1] -tvm::Expr ClampIndex(const tvm::Expr& idx, const tvm::Expr& bound); - -// Helper function to workaround tvm ExternOp issue when input has symbolic dimensions -tvm::Array MakeInputsForExtern(const tvm::Array& inputs, const std::string& name = "make_inputs_for_extern"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/conv_ops.cc b/onnxruntime/core/codegen/mti/nn/conv_ops.cc deleted file mode 100644 index e2d4acc8843ad..0000000000000 --- a/onnxruntime/core/codegen/mti/nn/conv_ops.cc +++ /dev/null @@ -1,193 +0,0 @@ -#include "core/codegen/mti/nn/conv_ops.h" - -#include "core/codegen/mti/math/matmul_ops.h" -#include "core/codegen/mti/tensor/pad_ops.h" -#include "core/codegen/mti/tensor/reshape_ops.h" -#include "core/codegen/mti/tensor/transpose.h" - -namespace onnxruntime { -namespace tvm_codegen { - -static tvm::Tensor PadTensor1D(const tvm::Tensor& input, - const tvm::Array& padding, - size_t width_axis, - const std::string& name) { - auto pad_left = padding[0]; - auto pad_right = padding[1]; - - tvm::Array pad_before(std::vector(input->shape.size(), 0)); - pad_before.Set(width_axis, pad_left); - tvm::Array pad_after(std::vector(input->shape.size(), 0)); - pad_after.Set(width_axis, pad_right); - - const int64_t* padding_w0 = tvm::as_const_int(pad_left); - const int64_t* padding_w1 = tvm::as_const_int(pad_right); - - const bool do_pad = ((padding_w0 != nullptr && *padding_w0) || - (padding_w1 != nullptr && *padding_w1)); - - return do_pad ? Pad(input, pad_before, pad_after, - 0, "constant", name + "_input_padded") - : input; -} - -tvm::Tensor Conv1D(const tvm::Tensor& input, - const tvm::Tensor& filter, - const tvm::Array& out_shape, - const tvm::Array& stride, - const tvm::Array& padding, - const std::string& name) { - size_t channel_axis = 1; - size_t width_axis = 2; - - auto stride_width = stride[width_axis - 2]; - - auto input_padded = PadTensor1D(input, padding, width_axis, name); - auto rc = tvm::reduce_axis((tvm::Range(0, filter->shape[1])), "rc"); - auto rx = tvm::reduce_axis((tvm::Range(0, filter->shape[2])), "rx"); - - return tvm::compute( - out_shape, - [&](const tvm::Array& output) { - tvm::Array indices; - for (const tvm::Var& var : output) { - indices.push_back(var); - } - indices.Set(channel_axis, rc); - indices.Set(width_axis, output[width_axis] * stride_width + rx); - - return tvm::sum(input_padded(indices) * filter({output[1], rc, rx}), - {rc, rx}); - }, - name); -} - -tvm::Tensor Conv2D(const tvm::Tensor& input, - const tvm::Tensor& filter, - const tvm::Array& output_shape, - const tvm::Array& stride, - const tvm::Array& padding, - const std::string& name) { - return Conv2D_native(input, filter, output_shape, stride, padding); -} - -static tvm::Tensor PadTensor2D(const tvm::Tensor& input, - const tvm::Array& padding, - size_t height_axis, - size_t width_axis, - const std::string& name) { - auto pad_top = padding[0]; - auto pad_left = padding[1]; - auto pad_bottom = padding[2]; - auto pad_right = padding[3]; - - tvm::Array pad_before(std::vector(input->shape.size(), 0)); - pad_before.Set(height_axis, pad_top); - pad_before.Set(width_axis, pad_left); - - tvm::Array pad_after(std::vector(input->shape.size(), 0)); - pad_after.Set(height_axis, pad_bottom); - pad_after.Set(width_axis, pad_right); - - const int64_t* padding_h0 = tvm::as_const_int(pad_top); - const int64_t* padding_w0 = tvm::as_const_int(pad_left); - const int64_t* padding_h1 = tvm::as_const_int(pad_bottom); - const int64_t* padding_w1 = tvm::as_const_int(pad_right); - - const bool do_pad = ((padding_h0 != nullptr && *padding_h0) || - (padding_w0 != nullptr && *padding_w0)) || - ((padding_h1 != nullptr && *padding_h1) || - (padding_w1 != nullptr && *padding_w1)); - - return do_pad ? Pad(input, pad_before, pad_after, - 0, "constant", name + "_input_padded") - : input; -} - -tvm::Tensor Conv2D_native(const tvm::Tensor& input, - const tvm::Tensor& filter, - const tvm::Array& out_shape, - const tvm::Array& stride, - const tvm::Array& padding, - const std::string& name) { - size_t channel_axis = 1; - size_t height_axis = 2; - size_t width_axis = 3; - - auto stride_height = stride[height_axis - 2]; - auto stride_width = stride[width_axis - 2]; - - auto input_padded = PadTensor2D(input, padding, height_axis, width_axis, name); - - auto rc = tvm::reduce_axis((tvm::Range(0, filter->shape[1])), "rc"); - auto ry = tvm::reduce_axis((tvm::Range(0, filter->shape[2])), "ry"); - auto rx = tvm::reduce_axis((tvm::Range(0, filter->shape[3])), "rx"); - - return tvm::compute( - out_shape, - [&](const tvm::Array& output) { - tvm::Array indices; - for (const tvm::Var& var : output) { - indices.push_back(var); - } - indices.Set(channel_axis, rc); - indices.Set(height_axis, output[height_axis] * stride_height + ry); - indices.Set(width_axis, output[width_axis] * stride_width + rx); - - return tvm::sum(input_padded(indices) * filter({output[1], rc, ry, rx}), - {rc, ry, rx}); - }, - name); -} - -tvm::Tensor Conv2D_gemm(const tvm::Tensor& input, - const tvm::Tensor& filter, - const tvm::Array& out_shape, - const tvm::Array& stride, - const tvm::Array& padding, - const std::string& name) { - size_t height_axis = 2; - size_t width_axis = 3; - - auto stride_height = stride[height_axis - 2]; - auto stride_width = stride[width_axis - 2]; - - auto input_padded = PadTensor2D(input, padding, height_axis, width_axis, name); - - tvm::Array img_col_tmp(std::vector(6, 0)); - img_col_tmp.Set(0, out_shape[0]); - img_col_tmp.Set(1, out_shape[2]); - img_col_tmp.Set(2, out_shape[3]); - img_col_tmp.Set(3, filter->shape[1]); - img_col_tmp.Set(4, filter->shape[2]); - img_col_tmp.Set(5, filter->shape[3]); - - auto img_col = tvm::compute( - img_col_tmp, - [&](const tvm::Array& output) { - tvm::Array indices; - indices.push_back(output[0]); - indices.push_back(output[3]); - indices.push_back(output[1] * stride_height + output[4]); - indices.push_back(output[2] * stride_width + output[5]); - return input_padded(indices); - }, - name); - - tvm::Array input_col_shape(std::vector(2, 0)); - input_col_shape.Set(0, img_col_tmp[1] * img_col_tmp[2]); - input_col_shape.Set(1, img_col_tmp[3] * img_col_tmp[4] * img_col_tmp[5]); - auto input_col = Reshape(img_col, input_col_shape); - - tvm::Array filter_row_shape(std::vector(2, 0)); - filter_row_shape.Set(0, filter->shape[0]); - filter_row_shape.Set(1, filter->shape[1] * filter->shape[2] * filter->shape[3]); - auto filter_row = Reshape(filter, filter_row_shape, name); - - auto Y = MatMul2D(input_col, filter_row, false, true, name); - auto Y_T = Transpose(Y, /*axes=*/{}, name); - return Reshape(Y_T, out_shape, name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/conv_ops.h b/onnxruntime/core/codegen/mti/nn/conv_ops.h deleted file mode 100644 index 1396c216865a7..0000000000000 --- a/onnxruntime/core/codegen/mti/nn/conv_ops.h +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Conv1D(const tvm::Tensor& input, - const tvm::Tensor& filter, - const tvm::Array& output_shape, - const tvm::Array& stride, - const tvm::Array& padding, - const std::string& name = "conv1d"); - -tvm::Tensor Conv2D(const tvm::Tensor& input, - const tvm::Tensor& filter, - const tvm::Array& output_shape, - const tvm::Array& stride, - const tvm::Array& padding, - const std::string& name = "conv2d"); - -tvm::Tensor Conv2D_native(const tvm::Tensor& input, - const tvm::Tensor& filter, - const tvm::Array& output_shape, - const tvm::Array& stride, - const tvm::Array& padding, - const std::string& name = "conv2d_native"); - -tvm::Tensor Conv2D_gemm(const tvm::Tensor& input, - const tvm::Tensor& filter, - const tvm::Array& output_shape, - const tvm::Array& stride, - const tvm::Array& padding, - const std::string& name = "conv2d_gemm"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/lstm.cc b/onnxruntime/core/codegen/mti/nn/lstm.cc deleted file mode 100644 index 1148b0924e869..0000000000000 --- a/onnxruntime/core/codegen/mti/nn/lstm.cc +++ /dev/null @@ -1,140 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/nn/lstm.h" - -#include "core/codegen/mti/math/binary_ops.h" -#include "core/codegen/mti/math/unary_ops.h" -#include "core/codegen/mti/math/matmul_ops.h" -#include "core/codegen/mti/math/reduce_ops.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/reshape_ops.h" -#include "core/codegen/mti/tensor/split.h" - -namespace onnxruntime { -namespace tvm_codegen { - -/* -`X` - input tensor -`i` - input gate -`o` - output gate -`f` - forget gate -`c` - cell gate -`t` - time step (t-1 means previous time step) - -`W[iofc]` - W parameter weight matrix for input, output, forget, and cell gates -`R[iofc]` - R recurrence weight matrix for input, output, forget, and cell gates -`Wb[iofc]` - W bias vectors for input, output, forget, and cell gates -`Rb[iofc]` - R bias vectors for input, output, forget, and cell gates -`P[iof]` - P peephole weight vector for input, output, and forget gates -`WB[iofc]` - W parameter weight matrix for backward input, output, forget, and cell gates -`RB[iofc]` - R recurrence weight matrix for backward input, output, forget, and cell gates -`WBb[iofc]` - W bias vectors for backward input, output, forget, and cell gates -`RBb[iofc]` - R bias vectors for backward input, output, forget, and cell gates -`PB[iof]` - P peephole weight vector for backward input, output, and forget gates - -`H` - Hidden state -`num_directions` - 2 if direction == bidirectional else 1 - -Equations (Default: f=Sigmoid, g=Tanh, h=Tanh): - it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi) - ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf) - ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc) - Ct = ft (.) Ct-1 + it (.) ct - ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) - Ht = ot (.) h(Ct) -*/ - -void LSTM_cell( - const LSTMAttributes& lstm_attrs, - const tvm::Tensor& X, - const tvm::Tensor& W, - const tvm::Tensor& R, - const tvm::Tensor& B, - bool has_B, - const tvm::Tensor& prev_H, - const tvm::Tensor& prev_C, - const tvm::Tensor& P, - bool has_P, - tvm::Tensor& Y_h, - tvm::Tensor& Y_c) { - // Input projection: Xt*(W[iofc]^T) for forward direction or Xt*(WB[iofc]^T) for reverse direction - // (batch_size, input_size) * trans(4 * hidden_size, input_size) => (batch_size, 4 * hidden_size) - tvm::Tensor input_proj = MatMul2D(X, W, /*trans_a*/ false, /*trans_b*/ true); - - // Hidden projection: Ht-1*(R[iofc]^T) for forward direction or Ht-1*(RB[iofc]^T) for reverse direction - // (batch_size, hidden_size) * trans(4 * hidden_size, hidden_size) => (batch_size, 4 * hidden_size) - tvm::Tensor hidden_proj = MatMul2D(prev_H, R, /*trans_a*/ false, /*trans_b*/ true); - - // (batch_size, 4 * hidden_size) - tvm::Tensor sum_proj = Add(input_proj, hidden_proj); - - // Concatenation of [Wb[iofc], Rb[iofc]] or [WBb[iofc], RBb[iofc]] - if (has_B) { - // (8 * hidden_size) -> (2, 4 * hidden_size) -> (1, 4 * hidden_size), should be done in const folding - tvm::Tensor reduce_B = - ReduceSum(Reshape(B, {2, 4 * static_cast(lstm_attrs.hidden_size)}), {0}, /*keep_dims*/ true); - // (batch_size, 4 * hidden_size) via broadcasting reduce_B - sum_proj = Add(sum_proj, reduce_B); - } - - std::vector iofc_sum_split_sizes(4, lstm_attrs.hidden_size); - // Split sum_proj into iofc, where each gate proj is of (batch_size, hidden_size) - tvm::Array iofc_sum_projs = Split(sum_proj, ToTvmArray(iofc_sum_split_sizes), /*axis*/ 1); - MTI_ASSERT(iofc_sum_projs.size() == 4); - tvm::Tensor i_proj = iofc_sum_projs[0], - o_proj = iofc_sum_projs[1], - f_proj = iofc_sum_projs[2], - c_proj = iofc_sum_projs[3]; - - tvm::Tensor P_i, P_o, P_f; - if (has_P) { - std::vector iof_p_split_sizes(3, lstm_attrs.hidden_size); - // Split P into P_i, P_o, P_f, in const pre-processing (P_i, P_f might be merged?) - // where each P_[iof] has the shape of (hidden_size) - tvm::Array iof_P_projs = Split(P, ToTvmArray(iof_p_split_sizes), /*axis*/ 0); - MTI_ASSERT(iof_P_projs.size() == 3); - P_i = iof_P_projs[0], - P_o = iof_P_projs[1], - P_f = iof_P_projs[2]; - - // (batch_size, hidden_size) via broadcasting P_[if] - i_proj = Add(i_proj, Mul(P_i, prev_C)); - f_proj = Add(f_proj, Mul(P_f, prev_C)); - } - - // TODO: handle more general cases for activations f, h, g and activation_alpha and - // activation_beta. We may consider to move some code such as ActivationInfo from deep_cpu_lstm - // into a common header file, because the code can be used here. - - // Note that by default f = Sigmoid, g = Tanh, h = Tanh - - // it = f(Xt*(Wi^T) + Ht-1*(Ri^T) + Pi (.) Ct-1 + Wbi + Rbi) - // shape: (batch_size, hidden_size) - tvm::Tensor i_t = Sigmoid(i_proj); - // ft = f(Xt*(Wf^T) + Ht-1*(Rf^T) + Pf (.) Ct-1 + Wbf + Rbf) - // shape: (batch_size, hidden_size) - tvm::Tensor f_t = Sigmoid(f_proj); - // ct = g(Xt*(Wc^T) + Ht-1*(Rc^T) + Wbc + Rbc) - // shape: (batch_size, hidden_size) - tvm::Tensor c_t = Tanh(c_proj); - - // Ct = ft (.) Ct-1 + it (.) ct - // shape: (batch_size, hidden_size) - Y_c = Add(Mul(f_t, prev_C), Mul(i_t, c_t), Y_c->op->name); - - // ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) - // shape: (batch_size, hidden_size) - if (has_P) { - o_proj = Add(o_proj, Mul(P_o, Y_c)); - } - // ot = f(Xt*(Wo^T) + Ht-1*(Ro^T) + Po (.) Ct + Wbo + Rbo) - // shape: (batch_size, hidden_size) - o_proj = Sigmoid(o_proj); - // Ht = ot (.) h(Ct) - // shape: (batch_size, hidden_size) - Y_h = Mul(o_proj, Tanh(Y_c), Y_h->op->name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/lstm.h b/onnxruntime/core/codegen/mti/nn/lstm.h deleted file mode 100644 index 851fa880c4427..0000000000000 --- a/onnxruntime/core/codegen/mti/nn/lstm.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -// A bubble now. But don't remove it -// TODO: refactor the LSTMcell building to a tvm function -// and move it here - -namespace onnxruntime { -namespace tvm_codegen { - -struct LSTMAttributes { - LSTMAttributes(int64_t hidden_size_p) : hidden_size(hidden_size_p) {} - int64_t hidden_size; -}; - -void LSTM_cell( - const LSTMAttributes& lstm_attrs, - const tvm::Tensor& X, - const tvm::Tensor& W, - const tvm::Tensor& R, - const tvm::Tensor& B, - bool has_B, - const tvm::Tensor& prev_H, - const tvm::Tensor& prev_C, - const tvm::Tensor& P, - bool has_P, - tvm::Tensor& Y_h, - tvm::Tensor& Y_c); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/pool_ops.cc b/onnxruntime/core/codegen/mti/nn/pool_ops.cc deleted file mode 100644 index 868a14748cabc..0000000000000 --- a/onnxruntime/core/codegen/mti/nn/pool_ops.cc +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/nn/pool_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/mlas/inc/mlas.h" -#include "core/providers/cpu/nn/pool_attributes.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// TODO: topi only support 2d-pool, MaxPool1d and MaxPool3d will need to be added if necessary. -// only support version < 8 for topi doesn't come with implementation to output index tensor -tvm::Tensor MaxPool(const tvm::Tensor& input, - const PoolAttributes& pool_attrs, - const tvm::Array& /*output_shape*/, - const std::string& /*name*/) { - return topi::nn::pool(input, - ToTvmArray(pool_attrs.kernel_shape), - ToTvmArray(pool_attrs.strides), - ToTvmArray(pool_attrs.pads), - /*pool_type*/ topi::nn::kMaxPool, - /*ceil_mode*/ false, - /*layout*/ pool_attrs.storage_order == 0 ? "NCWH" : "NCHW", - pool_attrs.count_include_pad); -} - -tvm::Tensor AveragePool(const tvm::Tensor& input, - const PoolAttributes& pool_attrs, - const tvm::Array& /*output_shape*/, - const std::string& /*name*/) { - return topi::nn::pool(input, - ToTvmArray(pool_attrs.kernel_shape), - ToTvmArray(pool_attrs.strides), - ToTvmArray(pool_attrs.pads), - /*pool_type*/ topi::nn::kAvgPool, - /*ceil_mode*/ false, - /*layout*/ "NCHW", - pool_attrs.count_include_pad); -} - -tvm::Tensor GlobalMaxPool(const tvm::Tensor& input, - const PoolAttributes& /*pool_attrs*/, - const tvm::Array& /*output_shape*/, - const std::string& /*name*/) { - return topi::nn::global_pool(input, - /*pool_type*/ topi::nn::kMaxPool, - /*layout*/ "NCHW"); -} - -tvm::Tensor GlobalAveragePool(const tvm::Tensor& input, - const PoolAttributes& /*pool_attrs*/, - const tvm::Array& /*output_shape*/, - const std::string& /*name*/) { - return topi::nn::global_pool(input, - /*pool_type*/ topi::nn::kAvgPool, - /*layout*/ "NCHW"); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/nn/pool_ops.h b/onnxruntime/core/codegen/mti/nn/pool_ops.h deleted file mode 100644 index d381f9ddff859..0000000000000 --- a/onnxruntime/core/codegen/mti/nn/pool_ops.h +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { - -// Forward declaration -struct PoolAttributes; - -namespace tvm_codegen { - -tvm::Tensor MaxPool(const tvm::Tensor& input, - const PoolAttributes& pool_attrs, - const tvm::Array& output_shape, - const std::string& name = "max_pool"); - -tvm::Tensor AveragePool(const tvm::Tensor& input, - const PoolAttributes& pool_attrs, - const tvm::Array& output_shape, - const std::string& name = "average_pool"); - -tvm::Tensor GlobalMaxPool(const tvm::Tensor& input, - const PoolAttributes& pool_attrs, - const tvm::Array& output_shape, - const std::string& name = "global_max_pool"); - -tvm::Tensor GlobalAveragePool(const tvm::Tensor& input, - const PoolAttributes& pool_attrs, - const tvm::Array& output_shape, - const std::string& name = "global_average_pool"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/cast_ops.cc b/onnxruntime/core/codegen/mti/tensor/cast_ops.cc deleted file mode 100644 index a8fc86488d82b..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/cast_ops.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/cast_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Cast(const tvm::Tensor& X, tvm::Type type, const std::string& name) { - return topi::cast(X, type, name); -} - -// handle cases where bool is reprented as uint8 (e.g. in ONNX). -tvm::Tensor CastToUInt8Bool(const tvm::Tensor& X, const std::string& name) { - return tvm::compute( - X->shape, - [&](const tvm::Array& indices) { - auto val = X(indices); - // A special cast from float16 to bool, first cast up to float32, - // to workaround a float16 bug in many TVM backends. - // Intel Skylake is one of them. https://github.com/dmlc/tvm/issues/2959 - // TODO: remove it, after TVM is fixed - if (X->dtype == HalideIR::Float(16)) - val = tvm::cast(HalideIR::Float(32), val); - return tvm::ir::Select::make(topi::equal(val, tvm::make_zero(val.type())), - tvm::make_zero(HalideIR::UInt(8)), - tvm::make_const(HalideIR::UInt(8), 1)); - }, - name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/cast_ops.h b/onnxruntime/core/codegen/mti/tensor/cast_ops.h deleted file mode 100644 index 02f6f9cb1fde7..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/cast_ops.h +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Cast(const tvm::Tensor& X, tvm::Type type, const std::string& name = "cast"); -tvm::Tensor CastToUInt8Bool(const tvm::Tensor& X, const std::string& name = "cast_uint8_bool"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/concat_ops.cc b/onnxruntime/core/codegen/mti/tensor/concat_ops.cc deleted file mode 100644 index 3394d5b7e00a2..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/concat_ops.cc +++ /dev/null @@ -1,83 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/concat_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Concat(const tvm::Array& inputs, - int64_t axis, - const std::string& name) { - return ConcatSafe(inputs, axis, name); -} - -// Note topi's implementation requires control flow within iterations to avoid out-of-bound access. -// Therefore, MTI implements a ConcatSafe that does not have out-of-bound access, -// and does not requires control or predicate. -tvm::Tensor ConcatSafe(const tvm::Array& inputs, - int64_t axis, - const std::string& name) { - axis = HandleNegativeAxis(axis, gsl::narrow(inputs[0]->shape.size())); - MTI_ASSERT(axis < gsl::narrow(inputs[0]->shape.size()) && "axis out of bounds"); - - tvm::Array axis_sizes; - for (auto t : inputs) { - axis_sizes.push_back(t->shape[axis]); - } - - tvm::Expr join_size = axis_sizes[0]; - for (size_t i = 1; i < axis_sizes.size(); ++i) { - join_size += axis_sizes[i]; - } - join_size = tvm::ir::Simplify(join_size); - tvm::Array out_shape; - for (size_t i = 0; i < inputs[0]->shape.size(); ++i) { - out_shape.push_back(i == gsl::narrow(axis) ? join_size : inputs[0]->shape[i]); - } - - return tvm::compute( - out_shape, [&](const tvm::Array& ovars) { - tvm::Array indices; - - // preset - tvm::Expr min = 0; - tvm::Expr extent = axis_sizes[0]; - tvm::Expr offset = 0; - tvm::Expr ret; - - // input i = 0 - for (size_t j = 0; j < ovars.size(); ++j) { - if (j == gsl::narrow(axis)) { - tvm::Expr ivar = ovars[j]; - indices.push_back(tvm::max(tvm::min(ivar, min + extent - 1), min)); - } else { - indices.push_back(ovars[j]); - } - } - ret = inputs[0](indices); - - for (size_t i = 1; i < inputs.size(); ++i) { - offset += extent; - tvm::Expr min = 0; - extent = axis_sizes[i]; - auto j = gsl::narrow(axis); - tvm::Expr ivar = ovars[j] - offset; - indices.Set(j, tvm::max(tvm::min(ivar, min + extent - 1), min)); - - ret = tvm::ir::Select::make(ivar >= 0, - inputs[i](indices), - ret); - } - - return ret; - }, - name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/concat_ops.h b/onnxruntime/core/codegen/mti/tensor/concat_ops.h deleted file mode 100644 index 153afebb44615..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/concat_ops.h +++ /dev/null @@ -1,15 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Concat(const tvm::Array& inputs, int64_t axis, const std::string& name = "concat"); -tvm::Tensor ConcatSafe(const tvm::Array& inputs, int64_t axis, const std::string& name = "concat_safe"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/crop.cc b/onnxruntime/core/codegen/mti/tensor/crop.cc deleted file mode 100644 index 3fe569100df12..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/crop.cc +++ /dev/null @@ -1,58 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/crop.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Crop(const tvm::Tensor& t, - const tvm::Array& border, - const tvm::Array& scale, - const std::string& name) { - MTI_ASSERT(t->shape.size() == 4); - tvm::Expr N = t->shape[0]; - tvm::Expr C = t->shape[1]; - tvm::Expr H = t->shape[2]; - tvm::Expr W = t->shape[3]; - - MTI_ASSERT(border.size() == 4); - tvm::Expr leftBorder = border[0]; - tvm::Expr topBorder = border[1]; - tvm::Expr rightBorder = border[2]; - tvm::Expr bottomBorder = border[3]; - - tvm::Expr bottomLimit = H - bottomBorder; - tvm::Expr rightLimit = W - rightBorder; - - if (!scale.empty()) { - CHECK_EQ(scale.size(), 2); - bottomLimit = topBorder + scale[0]; - rightLimit = leftBorder + scale[1]; - } - - tvm::Array output_shape; - output_shape.push_back(tvm::ir::Simplify(N)); - output_shape.push_back(tvm::ir::Simplify(C)); - output_shape.push_back(tvm::ir::Simplify(bottomLimit - topBorder)); - output_shape.push_back(tvm::ir::Simplify(rightLimit - leftBorder)); - - auto l = [&](const tvm::Array& ovars) { - tvm::Array indices; - - indices.push_back(tvm::min(ovars[0], output_shape[0] - 1)); - indices.push_back(tvm::min(ovars[1], output_shape[1] - 1)); - indices.push_back(tvm::min(topBorder + ovars[2], topBorder + output_shape[2] - 1)); - indices.push_back(tvm::min(leftBorder + ovars[3], leftBorder + output_shape[3] - 1)); - - return t(indices); - }; - - return tvm::compute(output_shape, l, name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/crop.h b/onnxruntime/core/codegen/mti/tensor/crop.h deleted file mode 100644 index ffb6a05c70504..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/crop.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Crop(const tvm::Tensor& t, - const tvm::Array& border, - const tvm::Array& scale = {}, - const std::string& name = "crop"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/expand.cc b/onnxruntime/core/codegen/mti/tensor/expand.cc deleted file mode 100644 index cdac4f56e1f9f..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/expand.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/expand.h" -#include "core/codegen/mti/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Expand(const tvm::Tensor& X, const tvm::Array& new_shape, const std::string& name) { - MTI_ASSERT(new_shape.size() >= X->shape.size()); - return tvm::compute( - new_shape, - [&](const tvm::Array& out_indices) { - tvm::Array indices; - size_t broadcasted_rank = new_shape.size() - X->shape.size(); - for (size_t d = broadcasted_rank; d < new_shape.size(); ++d) { - if (tvm::is_const_int(X->shape[d - broadcasted_rank], 1)) { - indices.push_back(tvm::make_zero(HalideIR::Int(32))); - } else { - indices.push_back(out_indices[d]); - } - } - return X(indices); - }, - name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/expand.h b/onnxruntime/core/codegen/mti/tensor/expand.h deleted file mode 100644 index d66d41aeb0194..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/expand.h +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Expand(const tvm::Tensor& X, const tvm::Array& new_shape, const std::string& name = "expand"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/gather.cc b/onnxruntime/core/codegen/mti/tensor/gather.cc deleted file mode 100644 index 152b3981f1623..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/gather.cc +++ /dev/null @@ -1,55 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/gather.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Gather(const tvm::Tensor& t, - int64_t axis, - const tvm::Tensor& indices, - const std::string& name) { - // handle negative axis - axis = HandleNegativeAxis(axis, gsl::narrow(t->shape.size())); - size_t axis_t = gsl::narrow(axis); - - tvm::Array output_shape; - for (size_t i = 0; i < axis_t; ++i) - output_shape.push_back(t->shape[i]); - - for (size_t i = 0; i < indices->shape.size(); ++i) - output_shape.push_back(indices->shape[i]); - - for (size_t i = axis_t + 1; i < t->shape.size(); ++i) - output_shape.push_back(t->shape[i]); - - tvm::Expr idx_upper_bound = t->shape[axis_t]; - auto l = [&](const tvm::Array& ovars) { - tvm::Array ivars; - for (size_t i = 0; i < t->shape.size(); ++i) { - if (i < axis_t) { - ivars.push_back(ovars[i]); - } else if (i == axis_t) { - tvm::Array idx_vars; - for (size_t d = 0; d < indices->shape.size(); ++d) - idx_vars.push_back(ovars[axis_t + d]); - // make sure idx is clamped in the range of [-idx_upper_bound, idx_upper_bound - 1] - tvm::Expr real_idx = tvm_codegen::ClampIndex(indices(idx_vars), idx_upper_bound); - ivars.push_back(tvm::cast(tvm::Int(32), real_idx)); // tvm indices must be Int32 - } else { - ivars.push_back(ovars[i - 1 + indices->shape.size()]); - } - } - return t(ivars); - }; - - return tvm::compute(output_shape, l, name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/gather.h b/onnxruntime/core/codegen/mti/tensor/gather.h deleted file mode 100644 index a44bf3e4127d5..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/gather.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Gather(const tvm::Tensor& t, - int64_t axis, - const tvm::Tensor& indices, - const std::string& name = "gather"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/gather_elements.cc b/onnxruntime/core/codegen/mti/tensor/gather_elements.cc deleted file mode 100644 index 12d2983335890..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/gather_elements.cc +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/gather_elements.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor GatherElements(const tvm::Tensor& t, - int64_t axis, - const tvm::Tensor& indices, - const std::string& name) { - tvm::Array output_shape; - int64_t indices_rank = static_cast(indices->shape.size()); - // output shape is the same as indices - for (int64_t i = 0; i < indices_rank; ++i) - output_shape.push_back(indices->shape[i]); - - tvm::Expr idx_upper_bound = t->shape[axis]; - auto l = [&](const tvm::Array& ovars) { - tvm::Array ivars; - for (int64_t i = 0; i < indices_rank; i++) { - if (i == axis) { - tvm::Array idx_vars; - for (int64_t j = 0; j < indices_rank; j++) - idx_vars.push_back(ovars[j]); - // make sure idx is clamped in the range of [-idx_upper_bound, idx_upper_bound - 1] - tvm::Expr real_idx = tvm_codegen::ClampIndex(indices(idx_vars), idx_upper_bound); - // tvm idx must be of Int(32) - ivars.push_back(tvm::cast(tvm::Int(32), real_idx)); - } else { - ivars.push_back(ovars[i]); - } - } - return t(ivars); - }; - - return tvm::compute(output_shape, l, name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/gather_elements.h b/onnxruntime/core/codegen/mti/tensor/gather_elements.h deleted file mode 100644 index 650086f0f2e87..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/gather_elements.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor GatherElements(const tvm::Tensor& t, - int64_t axis, - const tvm::Tensor& indices, - const std::string& name = "gather_elements"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/pad_ops.cc b/onnxruntime/core/codegen/mti/tensor/pad_ops.cc deleted file mode 100644 index 2f688290d109e..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/pad_ops.cc +++ /dev/null @@ -1,121 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/pad_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// Note topi::pad does not support modes {edge, reflect} -// Therefore, MTI implements a generic Pad -tvm::Tensor Pad(const tvm::Tensor& t, - const tvm::Array& pad_before, - const tvm::Array& pad_after, - float pad_value, - const std::string& mode, - const std::string& name) { - MTI_ASSERT(pad_before.size() >= 1); - MTI_ASSERT(pad_before.size() == pad_after.size()); - MTI_ASSERT(pad_before.size() == t->shape.size()); - - tvm::Array output_shape; - for (size_t i = 0; i < t->shape.size(); ++i) { - output_shape.push_back( - tvm::ir::Simplify(t->shape[i] + pad_before[i] + pad_after[i])); - } - - auto l = [&](const tvm::Array& ovars) { - tvm::Array conds; - tvm::Array indices; - tvm::Array coords; - - for (size_t i = 0; i < t->shape.size(); ++i) { - tvm::Expr ivar = ovars[i] - pad_before[i]; - tvm::Expr min = 0; - tvm::Expr extent = t->shape[i]; - - conds.push_back(ivar < min); - conds.push_back(ivar >= min + extent); - indices.push_back(tvm::max(tvm::min(ivar, min + extent - 1), min)); - - if (mode == "reflect") { - // calculate indices for reflect mode - tvm::Expr limit = extent - 1; - tvm::Expr coord = ivar - min; - // Avoid mod zero when tensor shape has 1, - // e.g. input shape is [1, 3, 3] instead of [3, 3] - auto* p_limit = tvm::as_const_int(limit); - if (p_limit != nullptr && *p_limit != 0) - coord = (coord + 2 * limit) % (2 * limit); // avoid negative value - coord = coord - limit; - coord = tvm::abs(coord); - coord = limit - coord; - coord = coord + min; - coords.push_back(coord); - } - } - - if (mode == "reflect") { - return tvm::ir::Select::make(topi::detail::Map(conds, tvm::ir::Or::make), - t(coords), t(indices)); - } else if (mode == "constant") { - return tvm::ir::Select::make(topi::detail::Map(conds, tvm::ir::Or::make), - tvm::make_const(t->dtype, pad_value), t(indices)); - } - - // default mode is edge - return t(indices); - }; - - return tvm::compute(output_shape, l, name); -} - -tvm::Tensor Pad(const tvm::Tensor& t, - const tvm::Array& output_shape, - const tvm::Expr& pad_value, - const std::string& name) { - MTI_ASSERT(t->dtype == pad_value.type()); - - auto l = [&](const tvm::Array& ovars) { - tvm::Array conds; - tvm::Array indices; - - for (size_t i = 0; i < t->shape.size(); ++i) { - tvm::Expr ivar = ovars[i]; - tvm::Expr min = 0; - tvm::Expr extent = t->shape[i]; - - conds.push_back(ivar < min); - conds.push_back(ivar >= min + extent); - indices.push_back(tvm::max(tvm::min(ivar, min + extent - 1), min)); - } - - return tvm::ir::Select::make(topi::detail::Map(conds, tvm::ir::Or::make), - pad_value, t(indices)); - }; - - return tvm::compute(output_shape, l, name); -} - -tvm::Tensor PadLastDim(const tvm::Tensor& t, - const int32_t align_size, - const tvm::Expr& pad_value, - const std::string& name) { - auto input_shape = t->shape; - tvm::Array out_shape; - size_t input_shape_rank = input_shape.size(); - for (size_t i = 0; i < input_shape_rank - 1; ++i) { - out_shape.push_back(input_shape[i]); - } - out_shape.push_back( - (input_shape[input_shape_rank - 1] + align_size - 1) / - align_size * align_size); - - return Pad(t, out_shape, pad_value, name + "_pad"); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/pad_ops.h b/onnxruntime/core/codegen/mti/tensor/pad_ops.h deleted file mode 100644 index 6e8e350d71e97..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/pad_ops.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// ONNX Pad semantics -tvm::Tensor Pad(const tvm::Tensor& t, - const tvm::Array& pad_before, - const tvm::Array& pad_after, - float pad_value = 0.0f, - const std::string& mode = "constant", - const std::string& name = "pad"); - -// Other common Pad interfaces -// Pad for a given shape -tvm::Tensor Pad(const tvm::Tensor& t, - const tvm::Array& output_shape, - const tvm::Expr& pad_value, - const std::string& name = "pad"); - -// Pad for the last dim only. -// This is widely used for weight layout to guard alignment -tvm::Tensor PadLastDim(const tvm::Tensor& t, - const int32_t align_size, - const tvm::Expr& pad_value, - const std::string& name = "pad_last_dim"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/reshape_ops.cc b/onnxruntime/core/codegen/mti/tensor/reshape_ops.cc deleted file mode 100644 index 817fb32c2837a..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/reshape_ops.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/reshape_ops.h" - -#include "core/codegen/mti/common.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Flatten(const tvm::Tensor& X, int64_t axis, const std::string& name) { - const auto& input_shape = X->shape; - return Reshape(X, {SizeToDimension(input_shape, axis), SizeFromDimension(input_shape, axis)}, name); -} - -tvm::Tensor Identity(const tvm::Tensor& X, const std::string& name) { - return Reshape(X, X->shape, name); -} - -tvm::Tensor Reshape(const tvm::Tensor& X, const tvm::Array& new_shape, const std::string& name) { - if (new_shape.size() > 0) { - auto X_dim = SizeToDimension(X->shape, X->shape.size()); - auto new_dim = SizeToDimension(new_shape, new_shape.size()); - auto* pX_dim = tvm::as_const_int(X_dim); - auto* pNew_dim = tvm::as_const_int(new_dim); - - if (pX_dim != nullptr && pNew_dim != nullptr) { - MTI_ASSERT(*pX_dim == *pNew_dim); - } - return topi::reshape(X, new_shape, name); - } else { - // generate empty dim tensor with origial input data value - tvm::Array tmp_shape; - tmp_shape.push_back(1); - auto tmp_tensor = topi::reshape(X, tmp_shape); - return tvm::compute( - new_shape, - [&](const tvm::Array&) { - return tmp_tensor[0]; - }, - name); - } -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/reshape_ops.h b/onnxruntime/core/codegen/mti/tensor/reshape_ops.h deleted file mode 100644 index e23d62e4c57b0..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/reshape_ops.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Flatten(const tvm::Tensor& X, int64_t axis, const std::string& name = "flatten"); -tvm::Tensor Identity(const tvm::Tensor& X, const std::string& name = "identity"); -tvm::Tensor Reshape(const tvm::Tensor& X, const tvm::Array& new_shape, const std::string& name = "reshape"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/shape_op.cc b/onnxruntime/core/codegen/mti/tensor/shape_op.cc deleted file mode 100644 index b51bd67a8b2dc..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/shape_op.cc +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/shape_op.h" - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Shape(const tvm::Tensor& X, const std::string& name) { - int ndim = static_cast(X->shape.size()); - tvm::Array out_shape{ndim}; - return tvm::compute( - out_shape, [&](const tvm::Array& indices) { - auto idx = indices[0]; - tvm::Expr ret = 0; - for (int i = 0; i < ndim; ++i) { - ret = tvm::ir::Select::make(idx == i, X->shape[i], ret); - } - return tvm::cast(HalideIR::Int(64), ret); - }, - name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/shape_op.h b/onnxruntime/core/codegen/mti/tensor/shape_op.h deleted file mode 100644 index 67ee2de50eca9..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/shape_op.h +++ /dev/null @@ -1,14 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Shape(const tvm::Tensor& X, const std::string& name = "shape"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/slice.cc b/onnxruntime/core/codegen/mti/tensor/slice.cc deleted file mode 100644 index 6cbab43584d4b..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/slice.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/slice.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include -#include -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// local constexpr for INT_MAX -constexpr int64_t max_range = INT_MAX; - -tvm::Expr position(const tvm::Expr& dim, const tvm::Integer& offset, bool allow_out_of_bound = false) { - if (offset->value >= max_range) { - return allow_out_of_bound ? dim : dim - 1; - } else if (offset->value <= -max_range) { - return tvm::make_const(HalideIR::Int(32), allow_out_of_bound ? -1 : 0); - } else { - if (offset->value >= 0) { - return tvm::ir::Simplify(tvm::ir::Min::make(offset, dim + (allow_out_of_bound ? 0 : -1))); - } else { - return tvm::ir::Simplify(dim + tvm::ir::Max::make(offset, -dim + (allow_out_of_bound ? -1 : 0))); - } - } -} - -tvm::Tensor Slice(const tvm::Tensor& X, - const std::vector& starts, - const std::vector& ends, - const std::vector& axes1, - const std::vector& steps, - const std::string& name) { - MTI_ASSERT(starts.size() == ends.size()); - MTI_ASSERT(starts.size() == axes1.size()); - MTI_ASSERT(starts.size() == steps.size()); - - std::vector axes; - for (const auto& i : axes1) { - axes.push_back(HandleNegativeAxis(i, X->shape.size())); - } - - tvm::Array output_shape; - bool empty = false; - for (int64_t i = 0; i < gsl::narrow(X->shape.size()); ++i) { - auto axes_iter = std::find(axes.begin(), axes.end(), i); - if (axes_iter != axes.end()) { - auto axis = axes_iter - axes.begin(); - tvm::Expr start = position(X->shape[i], starts[axis]); - tvm::Expr end = position(X->shape[i], ends[axis], /*allow_out_of_bound*/ true); - auto dim = tvm::ir::Simplify((end - start + tvm::Integer(steps[axis] + (steps[axis] < 0 ? 1 : -1))) / tvm::Integer(steps[axis])); - auto int_dim = tvm::as_const_int(dim); - if (int_dim && *int_dim <= 0) { - output_shape.push_back(0); - empty = true; - } else { - output_shape.push_back(dim); - } - } else { - output_shape.push_back(X->shape[i]); - } - } - - if (empty) { - return MakeZeroTensor(output_shape, X->dtype, name); - } - - return tvm::compute( - output_shape, - [&](const tvm::Array& ovars) { - tvm::Array ivars; - for (size_t i = 0; i < X->shape.size(); ++i) { - auto axes_iter = std::find(axes.begin(), axes.end(), i); - if (axes_iter != axes.end()) { - auto axis = axes_iter - axes.begin(); - ivars.push_back(tvm::ir::Simplify(ovars[i] * tvm::Integer(steps[axis]) + position(X->shape[i], starts[axis]))); - } else { - ivars.push_back(ovars[i]); - } - } - return X(ivars); - }, - name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/slice.h b/onnxruntime/core/codegen/mti/tensor/slice.h deleted file mode 100644 index ac5c9437791f6..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/slice.h +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Slice(const tvm::Tensor& X, - const std::vector& starts, - const std::vector& ends, - const std::vector& axes, - const std::vector& steps, - const std::string& name = "slice"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/split.cc b/onnxruntime/core/codegen/mti/tensor/split.cc deleted file mode 100644 index 6ee366314858f..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/split.cc +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/split.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// Similar to numpy, topi::split takes split indices rather than the -// sizes of the splits. Thus we implement our own. -tvm::Array Split(const tvm::Tensor& X, - const tvm::Array& split_sizes, - int64_t axis, - const std::string& name) { - MTI_ASSERT(axis < gsl::narrow(X->shape.size())); - size_t axis_t = gsl::narrow(axis); - - tvm::Array> output_shapes; - int num_splits = gsl::narrow(split_sizes.size()); - for (auto& s : split_sizes) { - tvm::Array shape; - for (size_t i = 0; i < axis_t; i++) { - shape.push_back(X->shape[i]); - } - shape.push_back(s); - for (size_t i = axis_t + 1; i < X->shape.size(); i++) { - shape.push_back(X->shape[i]); - } - output_shapes.push_back(shape); - } - - tvm::Array res; - int idx = 0; - for (int i_split = 0; i_split < num_splits; ++i_split) { - tvm::Expr s = split_sizes[i_split]; - auto l = [&](const tvm::Array& indices) { - tvm::Array new_indices; - for (size_t i = 0; i < axis_t; i++) { - new_indices.push_back(indices[i]); - } - new_indices.push_back(indices[axis_t] + idx); - for (size_t i = axis_t + 1; i < X->shape.size(); i++) { - new_indices.push_back(indices[i]); - } - MTI_ASSERT(topi::detail::IsConstInt(s)); - MTI_ASSERT(new_indices.size() == X->shape.size()); - int size = topi::detail::GetConstInt(s); - idx += size; - return X(new_indices); - }; - res.push_back(tvm::compute(output_shapes[i_split], l, name)); - } - - MTI_ASSERT(topi::detail::IsConstInt(X->shape[axis_t])); - int size_of_splitted_axis = static_cast(topi::detail::GetConstInt(X->shape[axis_t])); - MTI_ASSERT(idx == size_of_splitted_axis); - return res; -} - -tvm::Array SplitWithIndices(const tvm::Tensor& X, - const tvm::Array& split_sizes, - int64_t axis, - const std::string& name) { - return topi::split(X, split_sizes, gsl::narrow(axis), name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/split.h b/onnxruntime/core/codegen/mti/tensor/split.h deleted file mode 100644 index bcb9c47d936dd..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/split.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// ONNX Split semantics -tvm::Array Split(const tvm::Tensor& X, - const tvm::Array& split_sizes, - int64_t axis, - const std::string& name = "split"); - -// Another common Split interface -// Split with chunck indices -tvm::Array SplitWithIndices(const tvm::Tensor& X, - const tvm::Array& split_sizes, - int64_t axis, - const std::string& name = "split_with_indices"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/tile.cc b/onnxruntime/core/codegen/mti/tensor/tile.cc deleted file mode 100644 index 2fef86adcbaea..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/tile.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/tile.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Tile(const tvm::Tensor& t, - const std::vector& repeats, - const std::string& name) { - MTI_ASSERT(repeats.size() == t->shape.size()); - tvm::Array output_shape; - - bool repeats_zero = false; - for (size_t i = 0; i < t->shape.size(); ++i) { - if (repeats[i] == 0) - repeats_zero = true; - output_shape.push_back(t->shape[i] * gsl::narrow(repeats[i])); - } - - auto l = [&](const tvm::Array& ovars) { - if (repeats_zero) - return tvm::make_zero(t->dtype); - - tvm::Array ivars; - for (size_t i = 0; i < t->shape.size(); ++i) { - tvm::Expr ovar = ovars[i]; - ivars.push_back(ovar % t->shape[i]); - } - return t(ivars); - }; - - return tvm::compute(output_shape, l, name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/tile.h b/onnxruntime/core/codegen/mti/tensor/tile.h deleted file mode 100644 index 7ce331fb5ea95..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/tile.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Tile(const tvm::Tensor& t, - const std::vector& repeats, - const std::string& name = "tile"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/transpose.cc b/onnxruntime/core/codegen/mti/tensor/transpose.cc deleted file mode 100644 index 873ff8d7f1708..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/transpose.cc +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/tensor/transpose.h" - -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Transpose(const tvm::Tensor& X, const tvm::Array& axes, const std::string& name) { - return topi::transpose(X, axes, name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/transpose.h b/onnxruntime/core/codegen/mti/tensor/transpose.h deleted file mode 100644 index a2a98fedf1e79..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/transpose.h +++ /dev/null @@ -1,16 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Transpose(const tvm::Tensor& X, - const tvm::Array& axes, - const std::string& name = "transpose"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/where.cc b/onnxruntime/core/codegen/mti/tensor/where.cc deleted file mode 100644 index 2bdac3cae7ef5..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/where.cc +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/where.h" - -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Where(const tvm::Tensor& B, - const tvm::Tensor& X, - const tvm::Tensor& Y, - const std::string& name) { - size_t rank = std::max(std::max(B->shape.size(), X->shape.size()), Y->shape.size()); - tvm::Array output_shape; - for (size_t i = 0; i < rank; ++i) { - tvm::Expr dim = tvm::make_const(HalideIR::Int(32), 1); - bool broadcasted = - BroadcastDim(B->shape, i, rank, dim) && - BroadcastDim(X->shape, i, rank, dim) && - BroadcastDim(Y->shape, i, rank, dim); - MTI_ASSERT(broadcasted); - output_shape.push_back(dim); - } - - return topi::where(topi::broadcast_to(B, output_shape), - topi::broadcast_to(X, output_shape), - topi::broadcast_to(Y, output_shape), - name); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/mti/tensor/where.h b/onnxruntime/core/codegen/mti/tensor/where.h deleted file mode 100644 index 68c5288eb3580..0000000000000 --- a/onnxruntime/core/codegen/mti/tensor/where.h +++ /dev/null @@ -1,17 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Where(const tvm::Tensor& B, - const tvm::Tensor& X, - const tvm::Tensor& Y, - const std::string& name = "where"); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/all_ops.h b/onnxruntime/core/codegen/passes/op_ir_creator/all_ops.h deleted file mode 100644 index 1463e50bd72fb..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/all_ops.h +++ /dev/null @@ -1,47 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/passes/utils/codegen_context.h" -#include "core/codegen/common/op_macro.h" -#include "core/codegen/passes/op_ir_creator/tvm_op_creator.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// This macro declares a TVM IR builder -// based on ORT OP type with postfix DefaultTVM -#define DECLARE_GENERIC_OP_IR_CREATOR_CLASS(OP) \ - DECLARE_OP_IR_CREATOR_CLASS(OP, DefaultTVM) - -// This macro returns a TVM IR builder class name -// based ORT OP type with postfix DefaultTVM -#define GENERIC_OP_IR_CREATOR_CLASS(OP) \ - CREATOR_CLASS(OP, DefaultTVM##IRCreator) - -#define GENERIC_OP_IR_CREATOR_STRING(OP) \ - STRINGIZE(GENERIC_OP_IR_CREATOR_CLASS(OP)) - -// define all ops for DefaultTVM -#define ADD_OP_ITEM(OP) DECLARE_GENERIC_OP_IR_CREATOR_CLASS(OP) -#define BINARY_OP(OP) ADD_OP_ITEM(OP) -#define BINARY_CMP_OP(OP) ADD_OP_ITEM(OP) -#define POOL_OP(OP) ADD_OP_ITEM(OP) -#define UNARY_OP(OP) ADD_OP_ITEM(OP) -#define VARIADIC_OP(OP) ADD_OP_ITEM(OP) -#define REDUCE_INDEXED_OP(OP) ADD_OP_ITEM(OP) -#define REDUCE_OP(OP) ADD_OP_ITEM(OP) - -LIST_ALL_GENERIC_OPS() - -#undef ADD_OP_ITEM -#undef BINARY_OP -#undef BINARY_CMP_OP -#undef POOL_OP -#undef REDUCE_OP -#undef REDUCE_INDEXED_OP -#undef UNARY_OP -#undef VARIADIC_OP - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/binary_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/binary_ops.cc deleted file mode 100644 index 9452146621ac7..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/binary_ops.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/common/op_macro.h" -#include "core/codegen/mti/math/binary_ops.h" -#include "core/codegen/mti/tensor/cast_ops.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// helper local macro defines Evaluate of BINARY_OP OpIRCreators -#define BINARY_OP(name) \ - Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext&, \ - tvm::Array& outputs) { \ - tvm::Tensor Y = name(inputs[0], inputs[1], node.Name()); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -LIST_BINARY_OPS() - -#undef BINARY_OP - -// helper local macro defines Evaluate of BINARY_CMP_OP OpIRCreators -#define BINARY_CMP_OP(name) \ - Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext&, \ - tvm::Array& outputs) { \ - tvm::Tensor Y = Cast(name(inputs[0], inputs[1], node.Name()), HalideIR::UInt(8), "cast_bool_" #name); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -LIST_BINARY_CMP_OPS() - -#undef BINARY_CMP_OP - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/clip.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/clip.cc deleted file mode 100644 index bb33e6e70accf..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/clip.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/math/unary_ops.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Clip OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Clip)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - - int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); - tvm::Expr min_value, max_value; - if (version < 11) { - float max_v, min_v; - info.GetAttrOrDefault("min", &min_v, std::numeric_limits::lowest()); - info.GetAttrOrDefault("max", &max_v, std::numeric_limits::max()); - min_value = tvm::make_const(tvm::Float(32), min_v); - max_value = tvm::make_const(tvm::Float(32), max_v); - } else { - // for op_version >= 11, max and min are optional inputs - min_value = tvm::make_const(tvm::Float(32), std::numeric_limits::lowest()); - max_value = tvm::make_const(tvm::Float(32), std::numeric_limits::max()); - auto num_inputs = inputs.size(); - if (num_inputs >= 2 && inputs[1].defined()) { - min_value = inputs[1](); - } - if (num_inputs == 3 && inputs[2].defined()) { - max_value = inputs[2](); - } - } - - tvm::Tensor Y = Clip(inputs[0], min_value, max_value, node.Name() + "_Clip"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/gemm.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/gemm.cc deleted file mode 100644 index 64f995076e1bb..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/gemm.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/math/gemm.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Gemm OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Gemm)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& /*ctx_codegen*/, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - tvm::Tensor A = inputs[0]; - tvm::Tensor B = inputs[1]; - tvm::Tensor C = inputs[2]; - - int64_t trans_A, trans_B; - ORT_RETURN_IF_ERROR(attrs.GetAttr("transA", &trans_A)); - ORT_RETURN_IF_ERROR(attrs.GetAttr("transB", &trans_B)); - - float alpha, beta; - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha).IsOK()); - ORT_ENFORCE(attrs.GetAttr("beta", &beta).IsOK()); - - tvm::Tensor Y = Gemm(A, B, C, trans_A != 0, trans_B != 0, alpha, beta, node.Name() + "_Gemm"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/logsoftmax.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/logsoftmax.cc deleted file mode 100644 index cb09518bf63d1..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/logsoftmax.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/math/logsoftmax.h" -#include "core/framework/op_kernel_info.h" -#include "core/providers/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of LogSoftmax OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(LogSoftmax)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - - int64_t axis_i64; - ORT_RETURN_IF_ERROR(info.GetAttr("axis", &axis_i64)); - axis_i64 = HandleNegativeAxis(axis_i64, gsl::narrow_cast(inputs[0]->shape.size())); - - tvm::Tensor Y = LogSoftmax(inputs[0], axis_i64, node.Name() + "_LogSoftmax"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/matmul.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/matmul.cc deleted file mode 100644 index ab1ac237bfa5d..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/matmul.cc +++ /dev/null @@ -1,23 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/math/matmul_ops.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of MatMul OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(MatMul)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - tvm::Tensor Y = MatMul(inputs[0], inputs[1], node.Name() + "_MatMul"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/quantize/matmul_integer.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/quantize/matmul_integer.cc deleted file mode 100644 index 6f66b1f1a2afb..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/quantize/matmul_integer.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/math/binary_ops.h" -#include "core/codegen/mti/math/matmul_ops.h" -#include "core/codegen/mti/tensor/cast_ops.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of MatMulInteger OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(MatMulInteger)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - const auto& A = inputs[0]; - const auto& B = inputs[1]; - auto& name = node.Name(); - - // A generic path, cast to int32 - // Support skipped trailing inputs - auto A_Int32 = (node.InputDefs().size() >= 3 && node.InputDefs()[2]->Exists()) - ? Sub(Cast(A, HalideIR::Int(32)), Cast(inputs[2], HalideIR::Int(32))) - : Cast(A, HalideIR::Int(32)); - auto B_Int32 = (node.InputDefs().size() >= 4 && node.InputDefs()[3]->Exists()) - ? Sub(Cast(B, HalideIR::Int(32)), Cast(inputs[3], HalideIR::Int(32))) - : Cast(B, HalideIR::Int(32)); - tvm::Tensor Y = MatMul(A_Int32, B_Int32, name + "_MatMulInteger"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/reduce_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/reduce_ops.cc deleted file mode 100644 index f29a3f3e7cdf7..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/reduce_ops.cc +++ /dev/null @@ -1,111 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/common/op_macro.h" -#include "core/codegen/mti/math/reduce_ops.h" -#include "core/codegen/mti/tensor/cast_ops.h" -#include "core/codegen/mti/tensor/reshape_ops.h" -#include "core/framework/op_kernel_info.h" -#include "core/providers/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -using ReduceIndexedFunc = tvm::Tensor (*)(const tvm::Tensor& X, int64_t axis, bool keep_dims, const std::string& name); -using ReduceFunc = tvm::Tensor (*)(const tvm::Tensor& X, const std::vector& axes, bool keep_dims, const std::string& name); - -// helper class for for REDUCE_INDEXED_OP -class FuncReduceIndexed { - public: - FuncReduceIndexed(const Node& node, ReduceIndexedFunc func, const std::string& name) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - axis_ = info.GetAttrOrDefault("axis", 0); - int64_t keepdims_i = 1; - ORT_ENFORCE(info.GetAttr("keepdims", &keepdims_i).IsOK()); - keep_dims_ = (keepdims_i == 1); - func_ = func; - name_ = name; - } - - tvm::Tensor operator()(const tvm::Tensor& X) const { - auto axis = HandleNegativeAxis(axis_, gsl::narrow_cast(X->shape.size())); - tvm::Tensor index32 = func_(X, axis, keep_dims_, name_); - return Cast(index32, tvm::Int(64)); - } - - private: - int64_t axis_; - bool keep_dims_; - ReduceIndexedFunc func_; - std::string name_; -}; - -// helper class for REDUCE_OP -class FuncReduce { - public: - FuncReduce(const Node& node, ReduceFunc func, const std::string& name) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - axes_ = info.GetAttrsOrDefault("axes"); - int64_t keepdims_i = 1; - ORT_ENFORCE(info.GetAttr("keepdims", &keepdims_i).IsOK()); - keep_dims_ = (keepdims_i == 1); - func_ = func; - name_ = name; - } - - tvm::Tensor operator()(const tvm::Tensor& X) const { - std::vector axes; - for (auto i : axes_) - axes.push_back(HandleNegativeAxis(i, gsl::narrow_cast(X->shape.size()))); - - return func_(X, axes, keep_dims_, name_); - } - - private: - std::vector axes_; - bool keep_dims_; - ReduceFunc func_; - std::string name_; -}; - -// helper macro defines Evaluate of REDUCE_OP OpIRCreators -#define REDUCE_OP(name) \ - Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext&, \ - tvm::Array& outputs) { \ - tvm::Tensor Y; \ - if (ShapeRank(node.OutputDefs()[0]) == 0) { \ - tvm::Tensor temp = FuncReduce(node, &name, #name)(inputs[0]); \ - Y = Reshape(temp, {}); \ - } else { \ - Y = FuncReduce(node, &name, #name)(inputs[0]); \ - } \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -// helper macro defines Evaluate of REDUCE_INDEXED_OP OpIRCreators -#define REDUCE_INDEXED_OP(name) \ - Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext&, \ - tvm::Array& outputs) { \ - tvm::Tensor Y = FuncReduceIndexed(node, &name, #name)(inputs[0]); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -LIST_REDUCE_OPS() - -#undef REDUCE_OP -#undef REDUCE_INDEXED_OP - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/softmax.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/softmax.cc deleted file mode 100644 index 7b13de5a94e48..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/softmax.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/math/softmax.h" -#include "core/framework/op_kernel_info.h" -#include "core/providers/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Softmax OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Softmax)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - - int64_t axis_i64; - ORT_RETURN_IF_ERROR(info.GetAttr("axis", &axis_i64)); - - axis_i64 = HandleNegativeAxis(axis_i64, gsl::narrow_cast(inputs[0]->shape.size())); - tvm::Tensor Y = Softmax(inputs[0], axis_i64, node.Name() + "_Softmax"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h deleted file mode 100644 index 29e6519af0ef1..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_funcs.h +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { -// helper class for unary_ops with alpha -class FuncWithAlpha { - public: - FuncWithAlpha(const Node& node) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); - } - - protected: - float alpha_; -}; - -// helper class for unary_ops with alpha and beta -class FuncWithAlphaBeta { - public: - FuncWithAlphaBeta(const Node& node) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); - ORT_ENFORCE(attrs.GetAttr("beta", &beta_).IsOK()); - } - - protected: - float alpha_; - float beta_; -}; - -// helper class for unary_ops with alpha and gamma -class FuncWithAlphaGamma { - public: - FuncWithAlphaGamma(const Node& node) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - ORT_ENFORCE(attrs.GetAttr("alpha", &alpha_).IsOK()); - ORT_ENFORCE(attrs.GetAttr("gamma", &gamma_).IsOK()); - } - - protected: - float alpha_; - float gamma_; -}; -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc deleted file mode 100644 index 0407c0a06abf6..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/unary_ops.cc +++ /dev/null @@ -1,93 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/common/op_macro.h" -#include "core/codegen/mti/math/unary_ops.h" -#include "core/codegen/passes/op_ir_creator/math/unary_funcs.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// helper macro declares unary_ops helper class without attribute -#define FuncClass(name) \ - class Func##name { \ - public: \ - Func##name(const Node&) {} \ - tvm::Tensor operator()(const tvm::Tensor& X) const { \ - return name(X); \ - } \ - } - -// helper macro declares unary_ops helper class with alpha -#define FuncClassAlpha(name) \ - class Func##name : public FuncWithAlpha { \ - public: \ - Func##name(const Node& node) : FuncWithAlpha(node) {} \ - tvm::Tensor operator()(const tvm::Tensor& X) const { \ - return name(X, alpha_); \ - } \ - } - -// helper macro declares unary_ops helper class with alpha and beta -#define FuncClassAlphaBeta(name) \ - class Func##name : public FuncWithAlphaBeta { \ - public: \ - Func##name(const Node& node) : FuncWithAlphaBeta(node) {} \ - tvm::Tensor operator()(const tvm::Tensor& X) const { \ - return name(X, alpha_, beta_); \ - } \ - } - -// helper macro declares unary_ops helper class with alpha and gamma -#define FuncClassAlphaGamma(name) \ - class Func##name : public FuncWithAlphaGamma { \ - public: \ - Func##name(const Node& node) : FuncWithAlphaGamma(node) {} \ - tvm::Tensor operator()(const tvm::Tensor& X) const { \ - return name(X, alpha_, gamma_); \ - } \ - } - -FuncClass(Abs); -FuncClassAlphaBeta(Affine); -FuncClass(Ceil); -FuncClassAlpha(Elu); -FuncClass(Exp); -FuncClass(Floor); -FuncClassAlphaBeta(HardSigmoid); -FuncClassAlpha(LeakyRelu); -FuncClass(Log); -FuncClass(Neg); -FuncClassAlphaBeta(ParametricSoftplus); -FuncClass(Reciprocal); -FuncClass(Relu); -FuncClassAlphaBeta(ScaledTanh); -FuncClassAlphaGamma(Selu); -FuncClass(Sigmoid); -FuncClass(Softplus); -FuncClass(Softsign); -FuncClass(Sqrt); -FuncClass(Tanh); -FuncClassAlpha(ThresholdedRelu); - -// helper macro defines Evaluate of UNARY_OP OpIRCreators -#define UNARY_OP(name) \ - Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext&, \ - tvm::Array& outputs) { \ - tvm::Tensor Y = Func##name(node)(inputs[0]); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -// helper local macros to replace some calls in LIST_UNARY_OPS -LIST_UNARY_OPS() - -#undef UNARY_OP - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/math/variadic_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/math/variadic_ops.cc deleted file mode 100644 index 9559a713c2876..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/math/variadic_ops.cc +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/math/binary_ops.h" -#include "core/codegen/mti/tensor/reshape_ops.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -tvm::Tensor Sum(const tvm::Tensor& lhs, const tvm::Tensor& rhs, const std::string& name) { - return Add(lhs, rhs, name); -} - -// helper local macro defines Evaluate of BINARY_OP OpIRCreators -#define VARIADIC_OP(name) \ - Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext&, \ - tvm::Array& outputs) { \ - tvm::Tensor Y = Identity(inputs[0], node.Name() + "0"); \ - for (size_t i = 1; i < inputs.size(); ++i) \ - Y = name(Y, inputs[i], node.Name() + std::to_string(i)); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -LIST_VARIADIC_OPS() - -#undef VARIADIC_OP - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/nn/conv.cc b/onnxruntime/core/codegen/passes/op_ir_creator/nn/conv.cc deleted file mode 100644 index 19545d1554405..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/nn/conv.cc +++ /dev/null @@ -1,131 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/nn/conv_ops.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/concat_ops.h" -#include "core/codegen/mti/tensor/split.h" -#include "core/codegen/passes/utils/ort_tvm_utils.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -Status GENERIC_OP_IR_CREATOR_CLASS(Conv)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - - // Attributes - int64_t group; - std::string auto_pad; - std::vector kernel_shape, strides, dilations, pads; - - info.GetAttrOrDefault("group", &group, 1); - info.GetAttrOrDefault("auto_pad", &auto_pad, "NOTSET"); - - ORT_THROW_IF_ERROR(info.GetAttrs("kernel_shape", kernel_shape)); - ORT_ENFORCE(kernel_shape.size() <= 2, "Only support 1D/2D convolution currently!"); - ORT_THROW_IF_ERROR(info.GetAttrs("strides", strides)); - - dilations = info.GetAttrs("dilations", dilations).IsOK() ? dilations : std::vector(kernel_shape.size(), 1); - ORT_ENFORCE(dilations == std::vector(kernel_shape.size(), 1), "Only support dilation is 1 currently"); - - pads = info.GetAttrs("pads", pads).IsOK() ? pads : std::vector(kernel_shape.size() * 2, 0); - - // auto_pad - if (auto_pad != "NOTSET") { - auto rank = inputs[0]->shape.size() - 2; - ORT_ENFORCE(rank > 0); - for (uint64_t i = 0; i < rank; i++) { - if (auto_pad == "VALID") { - pads[i] = 0; - pads[i + rank] = 0; - } else if (auto_pad == "SAME_UPPER" || auto_pad == "SAME_LOWER") { - // TODO: handle symbolic dim - ORT_ENFORCE(ShapeHasValue(node.InputDefs()[0], 2 + i)); - - int64_t input_dim_value = ShapeValue(node.InputDefs()[0], 2 + i); - int64_t output_dim_value = (input_dim_value + strides[i] - 1) / strides[i]; - int64_t pad_needed = (output_dim_value - 1) * strides[i] + kernel_shape[i] - input_dim_value; - - pads[i] = auto_pad == "SAME_LOWER" ? (pad_needed + 1) / 2 : pad_needed / 2; - pads[i + rank] = pad_needed - pads[i]; - } else { - ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Unknown auto_pad value"); - } - } - } - - // Inputs - tvm::Tensor X = inputs[0]; - tvm::Tensor W = inputs[1]; - // Outputs - tvm::Tensor Y; - tvm::Array Y_shape = ShapeToTvmArray(node.OutputDefs()[0], ctx_codegen); - - // 1-D convolution - if (kernel_shape.size() == 1) { - Y = Conv1D(X, W, Y_shape, ToTvmArray(strides), ToTvmArray(pads), node.Name() + "_Conv1D"); - } - // 2-D convolution - else if (kernel_shape.size() == 2) { - if (group == 1) { - Y = Conv2D(X, W, Y_shape, ToTvmArray(strides), ToTvmArray(pads), node.Name() + "_Conv2D"); - } else { - int64_t channel_out = ShapeValue(node.InputDefs()[1], 0); - int64_t channel_in = ShapeValue(node.InputDefs()[1], 1); - ORT_ENFORCE(channel_out % group == 0); - - int64_t cout_group = channel_out / group; - Y_shape.Set(1, Y_shape[1] / gsl::narrow_cast(group)); - - tvm::Array split_index0; - tvm::Array split_index1; - - for (int i = 1; i < group; i++) { - split_index0.push_back(i * channel_in); - split_index1.push_back(i * cout_group); - } - - auto input_groups = SplitWithIndices(X, split_index0, 1); - auto weight_groups = SplitWithIndices(W, split_index1, 0); - - // FIXME: This will trigger a llvm buffer overflow when group is too large - // TODO: fix this change it to batched gemm/conv - tvm::Array output_tensors; - for (int i = 0; i < group; i++) { - auto output_tensor = Conv2D(input_groups[i], - weight_groups[i], - Y_shape, - ToTvmArray(strides), - ToTvmArray(pads), - node.Name() + "_Conv2D"); - output_tensors.push_back(output_tensor); - } - Y = Concat(output_tensors, 1); - } - } - - // Add bias if provided - // Support skipped trailing inputs - if (node.InputDefs().size() > 2 && node.InputDefs()[2]->Exists()) { - tvm::Tensor B = inputs[2]; - Y = tvm::compute( - Y_shape, - [&](const tvm::Array& indices) { - return Y(indices) + B(indices[1]); - }); - } - - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc b/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc deleted file mode 100644 index 88170bb56dd2d..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/nn/lstm.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/nn/lstm.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// In the cell computation, we don't have the "direction" dimension and sequence dimension, -// which have been processed outside of the cell. -// Here we implement an LTSM cell. -// For those args (inputs/outputs) of hidden states we put AFTER regular args (inputs/outputs) -// with a pre-defined order -// In a LSTM, the order is H and then C. -// Ouputs of LSTM is Y_h and then Y_c -Status GENERIC_OP_IR_CREATOR_CLASS(LSTM)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - std::string direction_attr; - ORT_RETURN_IF_ERROR(attrs.GetAttr("direction", &direction_attr)); - int64_t hidden_size; - ORT_RETURN_IF_ERROR(attrs.GetAttr("hidden_size", &hidden_size)); - - // input tensor with shape [seq_length, batch_size, input_size] - const tvm::Tensor& X = inputs[0]; // input tensor with shape [seq_length, batch_size, input_size] - const tvm::Tensor& W = inputs[1]; // weights tensor with shape [4*hidden_size, input_size] - const tvm::Tensor& R = inputs[2]; // recurrence tensor with shape [4*hidden_size, hidden_size] - const tvm::Tensor& B = inputs[3]; // optional bias tensor with shape [8*hidden_size] - bool has_B = node.InputDefs()[3]->Exists(); - - // Unsupported the 4th inputs - // optional tensor specifying sequence lengths in a batch, shape: [batch_size] - // const tvm::Tensor* seq_len = inputs[4] ? &inputs[4]->tensor : nullptr; - - const tvm::Tensor& prev_H = inputs[5]; // optional initial H, shape: [batch_size, hidden_size] - const tvm::Tensor& prev_C = inputs[6]; // optional initial C, shape: [batch_size, hidden_size] - - const tvm::Tensor& P = inputs[7]; // optional peepholes tensor with shape [3*hidde_size] - bool has_P = node.InputDefs()[7]->Exists(); - - tvm::Tensor Y_h; // shape: [batch_size, hidden_size] - tvm::Tensor Y_c; // shape: [batch_size, hidden_size] - LSTMAttributes lstm_attrs(hidden_size); - LSTM_cell(lstm_attrs, X, W, R, B, has_B, prev_H, prev_C, P, has_P, Y_h, Y_c); - - // Since we only generate lstm cell, lstm's states need to be always outputs, - // regardless whethere they are skipped or not. - // The skipped trailing outputs need to be handled by Execution - outputs.push_back(Y_h); - outputs.push_back(Y_c); - - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc deleted file mode 100644 index 84d3b7c1e0f79..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/nn/pool_ops.cc +++ /dev/null @@ -1,51 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/nn/pool_ops.h" -#include "core/framework/op_kernel_info.h" -#include "core/providers/cpu/nn/pool_attributes.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// A local macro to create Pool Ops - -// helper macro defines Evaluate of of POOL_OP OpIRCreators -#define POOL_OP(name) \ - Status GENERIC_OP_IR_CREATOR_CLASS(name)::Evaluate( \ - const tvm::Array& inputs, \ - const Node& node, \ - CodeGenContext& ctx_codegen, \ - tvm::Array& outputs) { \ - ORT_RETURN_IF_NOT(outputs.size() == 1, "multiple outputs are not supported yet!"); \ - ProtoHelperNodeContext ctx(node); \ - OpNodeProtoHelper info(&ctx); \ - int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); \ - PoolAttributes pool_attrs(info, #name, version); \ - for (auto n : pool_attrs.dilations) { \ - ORT_RETURN_IF_NOT(n <= 1, "dilations are not supported yet!"); \ - } \ - if (pool_attrs.global_pooling) { \ - if (inputs[0]->shape.size() != 4) { \ - ORT_NOT_IMPLEMENTED(gsl::narrow_cast(inputs[0]->shape.size()) - 2, "d global pooling is not implementated"); \ - } \ - } else { \ - if (pool_attrs.kernel_shape.size() != 2) { \ - ORT_NOT_IMPLEMENTED(pool_attrs.kernel_shape.size(), "d pooling is not implementated"); \ - } \ - } \ - tvm::Array dummy_output_shape; \ - tvm::Tensor Y = name(inputs[0], pool_attrs, dummy_output_shape); \ - outputs.push_back(Y); \ - return Status::OK(); \ - } - -LIST_POOL_OPS() - -#undef POOL_OP - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/cast.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/cast.cc deleted file mode 100644 index bd324fd359edf..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/cast.cc +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/tensor/cast_ops.h" -#include "core/codegen/passes/utils/ort_tvm_utils.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Cast OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Cast)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - int64_t to; - ORT_RETURN_IF_ERROR(attrs.GetAttr("to", &to)); - auto to_type_proto = gsl::narrow_cast(to); - - tvm::Tensor X = inputs[0]; - tvm::Tensor Y; - if (to_type_proto == ONNX_NAMESPACE::TensorProto_DataType_BOOL) { - // special case for bool as ONNX bool is uint8, while in tvm it's uint1 - Y = CastToUInt8Bool(X, node.Name() + "_Cast"); - } else { - Y = Cast(X, ToTvmType(to_type_proto), node.Name() + "_Cast"); - } - - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/concat.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/concat.cc deleted file mode 100644 index 418296889419e..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/concat.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/tensor/concat_ops.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Concat OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Concat)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - - int64_t axis; - ORT_RETURN_IF_ERROR(info.GetAttr("axis", &axis)); - - tvm::Tensor Y = Concat(inputs, axis, node.Name() + "_Concat"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/crop.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/crop.cc deleted file mode 100644 index 3b6a9a76f0723..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/crop.cc +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/crop.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Crop OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Crop)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - if (inputs[0]->shape.size() != 4) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Input is expected to have four dimensions corresponding to [N,C,H,W]"); - } - - std::vector border; - std::vector scale; - - ORT_ENFORCE(attrs.GetAttrs("border", border).IsOK()); - // scale is optional and status is false when omit - bool is_ok = attrs.GetAttrs("scale", scale).IsOK(); - ORT_UNUSED_PARAMETER(is_ok); - - if (border.size() != 4) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, - "Attribute border needs to be specified with four border elements"); - } - - tvm::Tensor Y = Crop(inputs[0], ToTvmArray(border), ToTvmArray(scale), node.Name() + "_Crop"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/expand.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/expand.cc deleted file mode 100644 index 0f0e0cf0987b3..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/expand.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/expand.h" -#include "core/codegen/passes/utils/ort_tvm_utils.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Expand OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Expand)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - tvm::Tensor Y = Expand(inputs[0], ShapeToTvmArray(node.OutputDefs()[0], ctx_codegen), node.Name() + "_Expand"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather.cc deleted file mode 100644 index 3a5d801b6839f..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/tensor/gather.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Gather OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Gather)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - int64_t axis; - ORT_ENFORCE(attrs.GetAttr("axis", &axis).IsOK()); - - tvm::Tensor Y = Gather(inputs[0], axis, inputs[1], node.Name() + "_Gather"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather_elements.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather_elements.cc deleted file mode 100644 index 0b71506cceed3..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/gather_elements.cc +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/tensor/gather_elements.h" -#include "core/framework/op_kernel_info.h" -#include "core/providers/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of GatherElements OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(GatherElements)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - int64_t axis; - ORT_ENFORCE(attrs.GetAttr("axis", &axis).IsOK()); - axis = HandleNegativeAxis(axis, gsl::narrow_cast(inputs[0]->shape.size())); - - tvm::Tensor Y = GatherElements(inputs[0], axis, inputs[1], node.Name() + "_GatherElements"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/pad.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/pad.cc deleted file mode 100644 index e9e20e8a43998..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/pad.cc +++ /dev/null @@ -1,49 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/pad_ops.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Pad OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Pad)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - std::string mode; - std::vector pads; - float value; - - ORT_THROW_IF_ERROR(attrs.GetAttr("mode", &mode)); - ORT_THROW_IF_ERROR(attrs.GetAttrs("pads", pads)); - ORT_THROW_IF_ERROR(attrs.GetAttr("value", &value)); - - if (mode != "constant" && mode != "edge" && mode != "reflect") - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Pad: Unsupported padding mode!"); - - if (pads.size() != 2 * inputs[0]->shape.size()) - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Pad: pads rank does not match inputs rank!"); - - std::vector pad_before, pad_after; - size_t offset = pads.size() / 2; - for (size_t i = 0; i < offset; i++) { - pad_before.push_back(pads[i]); - pad_after.push_back(pads[i + offset]); - } - - tvm::Tensor Y = Pad(inputs[0], ToTvmArray(pad_before), ToTvmArray(pad_after), value, mode, node.Name() + "_Pad"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/reshape_ops.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/reshape_ops.cc deleted file mode 100644 index a83f598bc8ad1..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/reshape_ops.cc +++ /dev/null @@ -1,99 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/reshape_ops.h" -#include "core/codegen/passes/utils/ort_tvm_utils.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Dropout OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Dropout)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - tvm::Tensor Y = Identity(inputs[0]); - outputs.push_back(Y); - - // optional mask - // Support skipped trailing outputs - if (node.OutputDefs().size() > 1 && node.OutputDefs()[1]->Exists()) { - // A fake mask with all ones - auto l = [&](const tvm::Array& /*indices*/) { - return tvm::make_const(tvm::UInt(8), 1); - }; - tvm::Tensor mask = tvm::compute(inputs[0]->shape, l, "mask"); - outputs.push_back(mask); - } - - return Status::OK(); -} - -// Evaluate of Flatten OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Flatten)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - int64_t axis; - ORT_RETURN_IF_ERROR(attrs.GetAttr("axis", &axis)); - - tvm::Tensor Y = Flatten(inputs[0], axis, node.Name() + "_Flatten"); - outputs.push_back(Y); - return Status::OK(); -} - -// Evaluate of Identity OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Identity)::Evaluate( - const tvm::Array& inputs, - const Node&, - CodeGenContext&, - tvm::Array& outputs) { - tvm::Tensor Y = Identity(inputs[0]); - outputs.push_back(Y); - return Status::OK(); -} - -// Evaluate of Reshape OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Reshape)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - tvm::Tensor Y = Reshape(inputs[0], ShapeToTvmArray(node.OutputDefs()[0], ctx_codegen), node.Name() + "_Reshape"); - outputs.push_back(Y); - return Status::OK(); -} - -// Evaluate of Squeeze OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Squeeze)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - tvm::Tensor Y = Reshape(inputs[0], ShapeToTvmArray(node.OutputDefs()[0], ctx_codegen), node.Name() + "_Squeeze"); - outputs.push_back(Y); - return Status::OK(); -} - -// Evaluate of Unsqueeze OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Unsqueeze)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - tvm::Tensor Y = Reshape(inputs[0], ShapeToTvmArray(node.OutputDefs()[0], ctx_codegen), node.Name() + "_Unsqueeze"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/shape_op.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/shape_op.cc deleted file mode 100644 index 84761ecac1397..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/shape_op.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/shape_op.h" -#include "core/codegen/passes/utils/ort_tvm_utils.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Expand OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Shape)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - tvm::Tensor Y = Shape(inputs[0], node.Name() + "_Expand"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc deleted file mode 100644 index 6a016580c41e4..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/slice.cc +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" -#include "core/codegen/passes/utils/ort_tvm_utils.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/slice.h" -#include "core/framework/op_kernel_info.h" -#include "core/framework/tensorprotoutils.h" - -#include - -namespace onnxruntime { -namespace tvm_codegen { - -Status SliceCommon(const tvm::Array& inputs, - const Node& node, - tvm::Array& outputs, - const std::vector& starts, - const std::vector& ends, - const std::vector& axes1, - const std::vector& steps1) { - ORT_RETURN_IF_NOT(nullptr != node.InputDefs()[0], "nullptr == node.InputDefs()[0]"); - - std::vector axes; - if (axes1.size() == 0) { - for (size_t i = 0; i < starts.size(); ++i) { - axes.push_back(gsl::narrow_cast(i)); - } - } else { - axes = axes1; - } - - std::vector steps; - if (steps1.size() == 0) { - steps.resize(starts.size(), 1); - } else { - steps = steps1; - } - - tvm::Tensor Y = Slice(inputs[0], starts, ends, axes, steps, node.Name() + "_Slice"); - outputs.push_back(Y); - return Status::OK(); -} - -// Evaluate of Slice OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Slice)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - - // NOTE that in opset 10, Slice has changed starts/ends/axes from attribute to input - // which may lead to dynamic output shape. - int version = ctx_codegen.GetCodeGenHandle()->domain_version_lookup_func(node.Domain()); - ORT_RETURN_IF_NOT(version <= 9, "Dynamic Slice is not supported yet"); - - std::vector starts, ends, steps; - ORT_RETURN_IF_ERROR(info.GetAttrs("starts", starts)); - ORT_RETURN_IF_ERROR(info.GetAttrs("ends", ends)); - ORT_RETURN_IF_NOT(starts.size() == ends.size(), "starts.size() != ends.size()"); - - auto axes = info.GetAttrsOrDefault("axes"); - - return SliceCommon(inputs, node, outputs, starts, ends, axes, steps); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc deleted file mode 100644 index ec52d98b5bf96..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/split.cc +++ /dev/null @@ -1,66 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/split.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Split OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Split)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper info(&ctx); - - int64_t axis; - ORT_RETURN_IF_ERROR(info.GetAttr("axis", &axis)); - axis = HandleNegativeAxis(axis, gsl::narrow_cast(inputs[0]->shape.size())); - std::vector split_sizes; - - int64_t split_size_sum = 0; - if (info.GetAttrs("split", split_sizes).IsOK()) { - // optional - split_size_sum = std::accumulate(split_sizes.cbegin(), split_sizes.cend(), 0LL); - ORT_RETURN_IF_NOT(std::all_of(split_sizes.cbegin(), split_sizes.cend(), [](int64_t value) { return value > 0; }), - "Invalid value in 'split' attribute. All values must be > 0"); - - // check split sizes - for (size_t i = 0; i < node.OutputDefs().size(); ++i) { - ORT_RETURN_IF_NOT(split_sizes[i] == ShapeValue(node.OutputDefs()[i], gsl::narrow(axis)), - "split_sizes[i] != ShapeValue(node.OutputDefs()[i], axis)"); - } - - } else { - for (size_t i = 0; i < node.OutputDefs().size(); ++i) { - split_sizes.push_back(ShapeValue(node.OutputDefs()[i], gsl::narrow(axis))); - split_size_sum += split_sizes[i]; - } - } - - // check total size - if (ShapeHasValue(node.InputDefs()[0], axis)) { - int64_t input_axis_dim = ShapeValue(node.InputDefs()[0], axis); - if (split_size_sum != input_axis_dim) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, - "Cannot split using values in 'split' attribute. Axis=", axis, - " Dim being splitted=", input_axis_dim, - " Sum of sizes in 'split' (must equal size of selected axis) was ", split_size_sum); - } - } - - tvm::Array output_tensors = Split(inputs[0], ToTvmArray(split_sizes), axis, node.Name() + "_Split"); - for (size_t i = 0; i < node.OutputDefs().size(); ++i) { - outputs.push_back(output_tensors[i]); - } - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/transpose.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/transpose.cc deleted file mode 100644 index 43999ebd1f465..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/transpose.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/transpose.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Transpose OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Transpose)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - size_t input_0_shape_rank = inputs[0]->shape.size(); - std::vector permute; - bool is_ok = attrs.GetAttrs("perm", permute).IsOK(); - if (permute.size() != 0 && permute.size() != input_0_shape_rank) - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Transpose: Incorrect permute size"); - - std::vector default_permute; - const std::vector* perm; - // either we don't have perm attribute or the perm attribute is empty - bool use_default_perm = !is_ok || permute.size() == 0; - if (use_default_perm) { - default_permute.resize(input_0_shape_rank); - for (size_t i = 0; i < input_0_shape_rank; ++i) { - default_permute[i] = gsl::narrow(input_0_shape_rank - 1 - i); - } - perm = &default_permute; - } else { - perm = &permute; - } - - tvm::Tensor Y = Transpose(inputs[0], ToTvmArrayInt(*perm), node.Name() + "_Transpose"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/where.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tensor/where.cc deleted file mode 100644 index 9d6df7c1c430d..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tensor/where.cc +++ /dev/null @@ -1,28 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/all_ops.h" - -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/mti/tensor/where.h" -#include "core/framework/op_kernel_info.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Evaluate of Transpose OpIRCreator -Status GENERIC_OP_IR_CREATOR_CLASS(Where)::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext&, - tvm::Array& outputs) { - ProtoHelperNodeContext ctx(node); - OpNodeProtoHelper attrs(&ctx); - - tvm::Tensor Y = Where(inputs[0], inputs[1], inputs[2], node.Name() + "_Where"); - outputs.push_back(Y); - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.cc deleted file mode 100644 index 7889e2add755e..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.cc +++ /dev/null @@ -1,125 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/tvm_ir_builder.h" - -#include "core/codegen/common/op_macro.h" -#include "core/codegen/passes/op_ir_creator/all_ops.h" -#include "core/common/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -TVMIRBuilder::TVMIRBuilder(const std::string& name) - : name_(name) {} - -const std::string& TVMIRBuilder::Name() const { - return name_; -} - -void TVMIRBuilder::InsertDispatcher(std::unique_ptr&& ptr) { - dispatchers_.push_back(std::move(ptr)); -} - -void TVMIRBuilder::ClearAllDispatchers() { - dispatchers_.clear(); -} - -void TVMIRBuilder::DumpAllOpIRCreators() const { - int count = 0; - for (auto& d : dispatchers_) { - std::cout << "************ TVM OpIRDispatcher " - << count << " : " - << d->Name() - << " ************" << std::endl; - - d->ForEach([](const std::string& key, OpIRCreator* builder) { - std::cout << "Key " << key - << ", Creator " << builder->Name() << std::endl; - }); - - ++count; - } -} - -// Evaluate finds ONE proper OpIRCreator and build the corresponding OpIR -// If a TVMIRBuilder has more than one OpIRCreator for an ORT Op, -// the first one will be used. -// Please adjust registration order and dispatcher in TVMIRBuilder -// to make sure the proper OpIRCreator is called. -Status TVMIRBuilder::Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx_codegen, - tvm::Array& outputs) { - OpIRCreator* candidate = nullptr; - for (auto& d : dispatchers_) { - candidate = d->Find(node); - if (nullptr != candidate) - break; - } - - if (nullptr == candidate) { - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Not implemented: ", node.OpType()); - } - - ORT_RETURN_IF_ERROR(candidate->Evaluate(inputs, node, ctx_codegen, outputs)); - - return Status::OK(); -} - -// BEGIN: Generic IR creator classes -#define ADD_OP_ITEM(name) \ - op_ir_registry->Register(std::make_unique()); - -#define BINARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) -#define POOL_OP(name) ADD_OP_ITEM(name) -#define REDUCE_OP(name) ADD_OP_ITEM(name) -#define REDUCE_INDEXED_OP(name) ADD_OP_ITEM(name) -#define UNARY_OP(name) ADD_OP_ITEM(name) -#define VARIADIC_OP(name) ADD_OP_ITEM(name) - -void RegisterAllGenericOpIRCreators(OpIRRegistry* op_ir_registry) { - LIST_ALL_GENERIC_OPS(); -} - -#undef ADD_OP_ITEM -#undef BINARY_OP -#undef BINARY_CMP_OP -#undef POOL_OP -#undef REDUCE_OP -#undef REDUCE_INDEXED_OP -#undef UNARY_OP -#undef VARIADIC_OP - -// BEGIN: Plugin Generic IR creator classes -#define ADD_OP_ITEM(name) \ - dispatcher->Register(#name, registry->Get(GENERIC_OP_IR_CREATOR_STRING(name))); - -#define BINARY_OP(name) ADD_OP_ITEM(name) -#define BINARY_CMP_OP(name) ADD_OP_ITEM(name) -#define POOL_OP(name) ADD_OP_ITEM(name) -#define REDUCE_OP(name) ADD_OP_ITEM(name) -#define REDUCE_INDEXED_OP(name) ADD_OP_ITEM(name) -#define UNARY_OP(name) ADD_OP_ITEM(name) -#define VARIADIC_OP(name) ADD_OP_ITEM(name) - -void RegisterGenericOrtOpTypeDispatcher(const std::shared_ptr& builder, - const OpIRRegistry* registry) { - auto dispatcher = std::make_unique("GenericOrtOpTypeOpIRCreators"); - LIST_ALL_GENERIC_OPS() - builder->InsertDispatcher(std::move(dispatcher)); -} - -#undef ADD_OP_ITEM -#undef BINARY_OP -#undef BINARY_CMP_OP -#undef POOL_OP -#undef REDUCE_OP -#undef REDUCE_INDEXED_OP -#undef UNARY_OP -// END: Generic IR creators classes - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.h b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.h deleted file mode 100644 index c80056e619d6d..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_ir_builder.h +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/passes/utils/codegen_context.h" -#include "core/codegen/passes/op_ir_creator/tvm_op_creator.h" -#include "core/common/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// TVMIRBuilder contains all applicable TVM OpIRCreators -// OpIRCreators are stored in multiple dispatchers -// that check different conditions of an ORT Node. - -// If an ORT Node satisfies more than one OpIRCreators, -// the first dispatched pass will be applied. - -class TVMIRBuilder { - public: - TVMIRBuilder(const std::string& name); - ~TVMIRBuilder() = default; - - // A debug dumps all existing in this TVMIRBuilders - void DumpAllOpIRCreators() const; - - // Evaluates an OpIRCreator that first satisfies condtions of all dispatchers - Status Evaluate( - const tvm::Array& inputs, - const Node& node, - CodeGenContext& ctx, - tvm::Array& outputs); - - // Inserts a dispatcher and move its ownership to this TVMIRBuilder - void InsertDispatcher(std::unique_ptr&& ptr); - - // Clears all dispatchers in this TVMIRBuilder - void ClearAllDispatchers(); - - // Dumps the name of this TVMIRBuilder - const std::string& Name() const; - - private: - std::vector> dispatchers_; - std::string name_; - - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TVMIRBuilder); -}; - -// Utility function to register all builtin generic OpIRCreators into an OpIRRegistry. -// It creates instances of all generic OpIRCreators -// and registers them to op_ir_registry -void RegisterAllGenericOpIRCreators(OpIRRegistry* op_ir_registry); - -// Utility function to bind all builtin generic OpIRCreators to a TVMIRBuilder. -// It creates an instance of a Dispatcher that contains all generic OpIRCreators created above -// and uses OrtOpType to dispatch OpIRCreators. -// Then, it registers the created Dispatcher to a TVMIRBuilder, builder. -void RegisterGenericOrtOpTypeDispatcher(const std::shared_ptr& builder, - const OpIRRegistry* registry); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.cc b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.cc deleted file mode 100644 index 992272753f5a4..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.cc +++ /dev/null @@ -1,37 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/op_ir_creator/tvm_op_creator.h" - -#include "core/codegen/common/common.h" -#include "core/codegen/common/dispatcher.h" -#include "core/codegen/passes/utils/codegen_context.h" - -namespace onnxruntime { -namespace codegen { -// Explicit instantiation for OpIRCreator -template class CreatorBase&, - const Node&, - tvm_codegen::CodeGenContext&, - tvm::Array&, - Status>; - -// Explicit instantiation for OpIRCreators' dispatcher -template class DispatcherBase; - -} // namespace codegen - -namespace tvm_codegen { - -// One dispatcher is based on ORT OpType -OpIRCreator* OP_IR_DISPATCHER_CLASS(OpType)::Find(const Node& node) { - return DispatcherBase::Get(node.OpType()); -} - -// Another dispatcher is based ORT NodeArg name (GetKey) -OpIRCreator* OP_IR_DISPATCHER_CLASS(NodeName)::Find(const Node& node) { - return DispatcherBase::Get(GetKey(&node)); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.h b/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.h deleted file mode 100644 index e29c4a9f20767..0000000000000 --- a/onnxruntime/core/codegen/passes/op_ir_creator/tvm_op_creator.h +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/common/creator.h" -#include "core/codegen/common/dispatcher.h" -#include "core/codegen/common/registry.h" -#include "core/graph/graph.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -class CodeGenContext; - -// OpIRCreator lowers an Ort Node to its corresponding TVM IRs -using OpIRCreator = codegen::CreatorBase< - const tvm::Array&, - const Node&, - CodeGenContext&, - tvm::Array&, - Status>; - -// OpIRDispatcher is the base dispatcher for TVM IR Builder -// It checks whether an Ort Node satisfying a criteria (in Find) -// and dispatches a corresponding OpIRCreator. -class OpIRDispatcher : public codegen::DispatcherBase { - public: - OpIRDispatcher(const std::string& name) - : DispatcherBase(name) {} - - virtual ~OpIRDispatcher() = default; - - virtual OpIRCreator* Find(const Node&) = 0; - - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OpIRDispatcher); -}; - -// Macro returns an OpIRCreators' dispatcher's name -#define OP_IR_DISPATCHER_CLASS(OP) \ - TVM##OP##IRCreator - -// Macro declares an OpIRCreators' dispatcher -#define DECLARE_OP_IR_DISPATCHER_CLASS(OP) \ - class OP_IR_DISPATCHER_CLASS(OP) : public OpIRDispatcher { \ - public: \ - TVM##OP##IRCreator(const std::string& name) \ - : OpIRDispatcher(name) {} \ - ~TVM##OP##IRCreator() = default; \ - OpIRCreator* Find(const Node&) override; \ - \ - private: \ - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(OP_IR_DISPATCHER_CLASS(OP)); \ - }; - -// Declare two common dispatchers for TVM Op IR builders -// One dispatcher is based on Ort OpType -DECLARE_OP_IR_DISPATCHER_CLASS(OpType) -// Another dispatcher is based Ort NodeArg name -DECLARE_OP_IR_DISPATCHER_CLASS(NodeName) - -// OpIRCreator Registry is a registry holds all OpIRCreators -using OpIRRegistry = codegen::RegistryBase; - -// Macro declares an OpIRCreator -#define DECLARE_OP_IR_CREATOR_CLASS(OP, PREFIX) \ - DECLARE_CREATOR_CLASS(OP, PREFIX##IRCreator, \ - const tvm::Array&, \ - const Node&, \ - tvm_codegen::CodeGenContext&, \ - tvm::Array&, \ - Status) - -// Macro returns an OpIRCreator's name with prefix -#define OP_IR_CREATOR_CLASS_EX(OP, PREFIX, ARCH) \ - CREATOR_CLASS(OP, PREFIX##ARCH##IRCreator) - -// Macro declares an OpIRCreator with prefix and arch -#define DECLARE_OP_IR_CREATOR_CLASS_EX(OP, PREFIX, ARCH) \ - DECLARE_OP_IR_CREATOR_CLASS(OP, PREFIX##ARCH) - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/all_schedules.h b/onnxruntime/core/codegen/passes/scheduler/all_schedules.h deleted file mode 100644 index fe4be90f9fc84..0000000000000 --- a/onnxruntime/core/codegen/passes/scheduler/all_schedules.h +++ /dev/null @@ -1,20 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/passes/scheduler/tvm_scheduler.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// AlwaysRoot is for debug purpose -DECLARE_TVM_SCHEDULER_CLASS(AlwaysRoot, GenericTVMRule) -// Create schedule for TVM Rule -DECLARE_TVM_SCHEDULER_CLASS(Extern, GenericTVMRule) -DECLARE_TVM_SCHEDULER_CLASS(Reduce, GenericTVMRule) - -// Crete scheduler for ORT OpType, Softmax -DECLARE_TVM_SCHEDULER_CLASS(Softmax, GenericOrtOpType) - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/ort_type_schedule.cc b/onnxruntime/core/codegen/passes/scheduler/ort_type_schedule.cc deleted file mode 100644 index 59f492d164b14..0000000000000 --- a/onnxruntime/core/codegen/passes/scheduler/ort_type_schedule.cc +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/scheduler/all_schedules.h" - -#include "core/codegen/passes/scheduler/schedule_utils.h" - -namespace onnxruntime { -namespace tvm_codegen { - -bool TVM_SCHEDULER_CLASS(Softmax, GenericOrtOpType)::Evaluate( - const tvm::Tensor& tensor, - const Node*, - CodeGenContext&, - ScheduleContext& ctx_sched) { - // compute root the exp since it is reused more than once - auto& tensor_exp = tensor->op->InputTensors()[0]; - return InsertRootSchedule(tensor_exp, ctx_sched); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc deleted file mode 100644 index 76c2ad509c401..0000000000000 --- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.cc +++ /dev/null @@ -1,178 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/common/utils.h" -#include "core/codegen/passes/scheduler/schedule_utils.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// Check the schedule of tensor -// If it has no compute_root, Insert compute_root to tensor, and record it to ctx.scheduled_tensors -bool InsertRootSchedule( - const tvm::Tensor& tensor, - ScheduleContext& ctx) { - auto it = ctx.scheduled_tensors.find(tensor->op.get()); - if (it != ctx.scheduled_tensors.end()) { - if (it->second == ScheduleType::ScheduleClosure || - it->second == ScheduleType::ScheduleRoot) { - return false; - } - it->second = ScheduleType::ScheduleRoot; - } else { - ctx.scheduled_tensors.insert(std::make_pair(tensor->op.get(), ScheduleType::ScheduleRoot)); - } - ctx.schedule[tensor->op].compute_root(); - return true; -} - -// Check the schedule of tensor -// If it is not labeled as closure, lable it. -bool InsertClosure(const tvm::Tensor& tensor, - ScheduleContext& ctx) { - auto it = ctx.scheduled_tensors.find(tensor->op.get()); - if (it != ctx.scheduled_tensors.end()) { - if (it->second == ScheduleType::ScheduleClosure) - return false; - it->second = ScheduleType::ScheduleClosure; - } else { - ctx.scheduled_tensors.insert(std::make_pair(tensor->op.get(), ScheduleType::ScheduleClosure)); - } - return true; -} - -// Combination of InsertRootSchedule and InsertClosure -bool InsertRootScheduleAndClosure( - const tvm::Tensor& tensor, - ScheduleContext& ctx) { - auto it = ctx.scheduled_tensors.find(tensor->op.get()); - if (it != ctx.scheduled_tensors.end()) { - if (it->second == ScheduleType::ScheduleClosure) { - return false; - } - it->second = ScheduleType::ScheduleClosure; - } else { - ctx.scheduled_tensors.insert(std::make_pair(tensor->op.get(), ScheduleType::ScheduleClosure)); - } - ctx.schedule[tensor->op].compute_root(); - return true; -} - -// Check precondition for vectorize schedule -bool ShouldTryVectorization( - const tvm::Tensor& tensor, - ScheduleContext& ctx) { - auto it = ctx.scheduled_tensors.find(tensor->op.get()); - if (it != ctx.scheduled_tensors.end()) { - if (it->second > ScheduleType::ScheduleInline) { - return false; - } - } - return true; -} - -// Check the schedule of tensor -// If it is not scheduled, try to vectorize it. -// Note TryVectorization has to use with compute_root. -// Therefore, there is a safety check of tensor's schedule -bool TryVectorization( - const tvm::Tensor& tensor, - int64_t natural_vector_size, - ScheduleContext& ctx) { - if (!ShouldTryVectorization(tensor, ctx)) - return false; - - auto shape = tensor->shape; - auto rank = shape.size(); - if (rank < 1) { - return false; - } - const int64_t* tail_dim = as_const_int(shape[rank - 1]); - - if (nullptr != tail_dim) { - auto extern_op = tensor->op.as(); - if (nullptr != extern_op) { - return false; - } - - auto compute_op = tensor->op.as(); - - if (nullptr != compute_op) { - auto axis = compute_op->axis; - tvm::IterVar x = axis[rank - 1]; - if ((*tail_dim) > natural_vector_size) { - if ((*tail_dim) % natural_vector_size != 0) { - natural_vector_size = GCD(natural_vector_size, (*tail_dim)); - } - - if (natural_vector_size > 1) { - tvm::IterVar xi, xo; - ctx.schedule[tensor->op].split(x, static_cast(natural_vector_size), &xo, &xi); - ctx.schedule[tensor->op].vectorize(xi); - return true; - } - } else if (*tail_dim > 0) { - // don't vectorize if dim is 0 - ctx.schedule[tensor->op].vectorize(x); - return true; - } - } - } - return false; -} - -// Check the schedule of tensor -// If it is not scheduled, try to add compute_inline on it. -// Note TryInlineSchedule cannot be used with compute_root. -// Therefore, there is a safety check of tensor's schedule. -bool TryInlineSchedule( - const tvm::Tensor& tensor, - ScheduleContext& ctx) { - auto it = ctx.scheduled_tensors.find(tensor->op.get()); - if (it != ctx.scheduled_tensors.end()) { - if ((int)it->second < (int)ScheduleType::ScheduleInline) { - ctx.schedule[tensor->op].compute_inline(); - it->second = ScheduleType::ScheduleInline; - return true; - } else { - return false; - } - } - ctx.schedule[tensor->op].compute_inline(); - ctx.scheduled_tensors.insert(std::make_pair(tensor->op.get(), ScheduleType::ScheduleInline)); - return true; -} - -// Check the schedule of tensor's inputs, and call InsertRootSchedule for each of them -bool InputRootSchedule( - const tvm::Tensor& tensor, - ScheduleContext& ctx) { - bool status = false; - for (auto& t : tensor->op->InputTensors()) { - if (t->op->InputTensors().size() > 0) { - bool status_root = InsertRootSchedule(t, ctx); - status = status || status_root; - } - } - return status; -} - -// Check the schedule of tensor's inputs, -// and call InsertRootSchedule and TryVectorization for each of them -bool InputRootScheduleWithVectorization( - const tvm::Tensor& tensor, - int64_t natural_vector_size, - ScheduleContext& ctx) { - bool status = false; - for (auto& t : tensor->op->InputTensors()) { - if (t->op->InputTensors().size() > 0) { - bool status_vec = TryVectorization(t, natural_vector_size, ctx); - bool status_root = InsertRootSchedule(t, ctx); - status = status || status_root || status_vec; - } - } - return status; -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h b/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h deleted file mode 100644 index 4a0781f94d385..0000000000000 --- a/onnxruntime/core/codegen/passes/scheduler/schedule_utils.h +++ /dev/null @@ -1,65 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// Check the schedule of tensor -// If it has no compute_root, Insert compute_root to tensor, -// and record it to ctx.scheduled_tensors -bool InsertRootSchedule( - const tvm::Tensor& tensor, - ScheduleContext& ctx); - -// Check the schedule of tensor -// If it is not labeled as closure, lable it. -bool InsertClosure( - const tvm::Tensor& tensor, - ScheduleContext& ctx); - -// Combination of InsertRootSchedule and InsertClosure -bool InsertRootScheduleAndClosure( - const tvm::Tensor& tensor, - ScheduleContext& ctx); - -// Check precondition for vectorize schedule -bool ShouldTryVectorization( - const tvm::Tensor& tensor, - ScheduleContext& ctx); - -// Check the schedule of tensor -// If it is not scheduled, try to vectorize it. -// Note TryVectorization has to use with compute_root. -// Therefore, there is a safety check of tensor's schedule -bool TryVectorization( - const tvm::Tensor& tensor, - int64_t natural_vector_size, - ScheduleContext& ctx); - -// Check the schedule of tensor -// If it is not scheduled, try to add compute_inline on it. -// Note TryInlineSchedule cannot be used with compute_root. -// Therefore, there is a safety check of tensor's schedule. -bool TryInlineSchedule( - const tvm::Tensor& tensor, - ScheduleContext& ctx); - -// Check the schedule of tensor's inputs, -// and call InsertRootSchedule for each of them -bool InputRootSchedule( - const tvm::Tensor& tensor, - ScheduleContext& ctx); - -// Check the schedule of tensor's inputs, -// and call InsertRootSchedule and TryVectorization for each of them -bool InputRootScheduleWithVectorization( - const tvm::Tensor& tensor, - int64_t natural_vector_size, - ScheduleContext& ctx); - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_rule_schedule.cc b/onnxruntime/core/codegen/passes/scheduler/tvm_rule_schedule.cc deleted file mode 100644 index 33162deddc983..0000000000000 --- a/onnxruntime/core/codegen/passes/scheduler/tvm_rule_schedule.cc +++ /dev/null @@ -1,41 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/scheduler/all_schedules.h" - -#include "core/codegen/passes/scheduler/schedule_utils.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// This is for debug -bool TVM_SCHEDULER_CLASS(AlwaysRoot, GenericTVMRule)::Evaluate( - const tvm::Tensor& tensor, - const Node*, - CodeGenContext&, - ScheduleContext& ctx_sched) { - return InsertRootSchedule(tensor, ctx_sched); -} - -// For External tvm::Tensor -bool TVM_SCHEDULER_CLASS(Extern, GenericTVMRule)::Evaluate( - const tvm::Tensor& tensor, - const Node*, - CodeGenContext&, - ScheduleContext& ctx_sched) { - bool status = InsertRootScheduleAndClosure(tensor, ctx_sched); - bool status_input = InputRootSchedule(tensor, ctx_sched); - return status || status_input; -} - -// For Reduce Compute tvm::Tensor -bool TVM_SCHEDULER_CLASS(Reduce, GenericTVMRule)::Evaluate( - const tvm::Tensor& tensor, - const Node*, - CodeGenContext&, - ScheduleContext& ctx_sched) { - return InsertRootScheduleAndClosure(tensor, ctx_sched); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc deleted file mode 100644 index 2c8250198fa5f..0000000000000 --- a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.cc +++ /dev/null @@ -1,104 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/scheduler/tvm_schedule_builder.h" - -#include "core/codegen/common/op_macro.h" -#include "core/codegen/common/settings.h" -#include "core/common/common.h" -#include "core/common/logging/logging.h" - -namespace onnxruntime { -namespace tvm_codegen { - -TVMScheduleBuilder::TVMScheduleBuilder(const std::string& name) - : name_(name) { -} - -const std::string& TVMScheduleBuilder::Name() const { - return name_; -} - -void TVMScheduleBuilder::InsertDispatcher(std::unique_ptr&& ptr) { - dispatchers_.push_back(std::move(ptr)); -} - -void TVMScheduleBuilder::ClearDispatcher() { - dispatchers_.clear(); -} - -void TVMScheduleBuilder::DumpAllSchedulers() const { - std::ostringstream stream; - int count = 0; - stream << "[CODEGEN_DUMP_SCHEDULE]" << std::endl; - for (auto& d : dispatchers_) { - stream << "************ TVM Scheduler Dispatcher " - << count << " : " - << d->Name() - << " ************" << std::endl; - - d->ForEach([&stream](const std::string& key, Scheduler* op) { - stream << "Key " << key - << ", Creator " << op->Name() << std::endl; - }); - - ++count; - } - - LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << stream.str(); -} - -Status TVMScheduleBuilder::Evaluate( - const tvm::Tensor& tensor, - const Node* node, - CodeGenContext& ctx_codegen, - ScheduleContext& sched) { - Scheduler* candidate = nullptr; - - for (auto& d : dispatchers_) { - candidate = d->Find(tensor, node, ctx_codegen); - if (nullptr != candidate) - break; - } - - bool enable_dump_schedule = codegen::CodeGenSettings::Instance().HasOption(codegen::CodeGenSettings::kCodeGenDumpSchedule); - - if (nullptr == candidate) { - if (nullptr != node) - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Not implemented: ", node->OpType()); - return ORT_MAKE_STATUS(ONNXRUNTIME, FAIL, "Not implemented an internal tvm::Tensor: ", tensor->op->name); - } - - bool status = candidate->Evaluate(tensor, node, ctx_codegen, sched); - - if (enable_dump_schedule) { - std::ostringstream stream; - if (nullptr != node) { - stream << std::endl; - stream << "[CODEGEN_DUMP_SCHEDULE] " - << "Schedule Node: " << node->Name() << std::endl; - } else { - stream << std::endl; - } - - if (status) { - stream << "[CODEGEN_DUMP_SCHEDULE] " - << "Schedule tvm::Tesnor " - << tensor->op->name - << " with " - << candidate->Name() << std::endl; - } else { - stream << "[CODEGEN_DUMP_SCHEDULE] " - << "Schedule tvm::Tesnor " - << tensor->op->name - << " is suppressed " << std::endl; - } - - LOGS_DEFAULT(CODEGEN_SETTINGS_LOG_LEVEL) << stream.str(); - } - - return Status::OK(); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.h b/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.h deleted file mode 100644 index 9f0a1b3ef45c2..0000000000000 --- a/onnxruntime/core/codegen/passes/scheduler/tvm_schedule_builder.h +++ /dev/null @@ -1,46 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/passes/scheduler/tvm_scheduler.h" -#include "core/common/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -// TVMScheduleBuilder contains all applicable TVM scheduler passes. -// Scheduler passes are stored in multiple dispatchers -// that check different conditions of a tvm::Tensor. - -// If a tvm::Tensor satisfies more than one TVM scheduler passes, -// the first dispatched pass will be applied. - -class TVMScheduleBuilder { - public: - // TODO: add more parameter in consructor to support different target - TVMScheduleBuilder(const std::string& name); - ~TVMScheduleBuilder() = default; - - void DumpAllSchedulers() const; - - Status Evaluate( - const tvm::Tensor& tensor, - const Node* node, - CodeGenContext& ctx, - ScheduleContext& sched); - - void InsertDispatcher(std::unique_ptr&& ptr); - void ClearDispatcher(); - - const std::string& Name() const; - - private: - std::vector> dispatchers_; - std::string name_; - - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TVMScheduleBuilder); -}; - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.cc b/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.cc deleted file mode 100644 index 071200a234e33..0000000000000 --- a/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.cc +++ /dev/null @@ -1,79 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/scheduler/tvm_scheduler.h" - -#include "core/codegen/common/common.h" -#include "core/codegen/common/dispatcher.h" -#include "core/codegen/passes/utils/codegen_context.h" - -namespace onnxruntime { -namespace codegen { -// explicit instantiation -template class CreatorBase; - -template class DispatcherBase; - -} // namespace codegen - -namespace tvm_codegen { - -static const std::string TMVOpRuleKey_Extern("TVMOpRule_Extern"); -static const std::string TMVOpRuleKey_ComputeReduce("TVMOpRule_ComputeReduce"); -static const std::string TMVOpRuleKey_ComputeRegular("TVMOpRule_ComputeRegular"); -static const std::string TMVOpRuleKey_AlwaysRoot("TMVOpRuleKey_AlwaysRoot"); -static const std::string TMVOpRuleKey_NoRule("TVMOpRule_NoRule"); - -const std::string& GetTVMOpRule(TVMOpRuleType rule) { - if (rule == TVMOpRuleType::Extern) { - return TMVOpRuleKey_Extern; - } else if (rule == TVMOpRuleType::ComputeReduce) { - return TMVOpRuleKey_ComputeReduce; - } else if (rule == TVMOpRuleType::AlwaysRoot) { - return TMVOpRuleKey_AlwaysRoot; - } - return TMVOpRuleKey_NoRule; -} - -const std::string& GetTVMOpRule(const tvm::Tensor& tensor) { - auto extern_op = tensor->op.as(); - - if (nullptr != extern_op) { - return TMVOpRuleKey_Extern; - } - - auto compute_op = tensor->op.as(); - if (nullptr != compute_op) { - if (compute_op->reduce_axis.size() > 0) { - return TMVOpRuleKey_ComputeReduce; - } - } - - return TMVOpRuleKey_NoRule; -} - -Scheduler* SCHEDULE_DISPATCHER_CLASS(OrtOpType):: - Find(const tvm::Tensor&, const Node* node, tvm_codegen::CodeGenContext&) { - if (nullptr == node) - return nullptr; - return DispatcherBase::Get(node->OpType()); -} - -Scheduler* SCHEDULE_DISPATCHER_CLASS(TVMOpRule):: - Find(const tvm::Tensor& tensor, const Node*, tvm_codegen::CodeGenContext&) { - return DispatcherBase::Get(GetTVMOpRule(tensor)); -} - -Scheduler* SCHEDULE_DISPATCHER_CLASS(OrtOpName):: - Find(const tvm::Tensor&, const Node* node, tvm_codegen::CodeGenContext&) { - if (nullptr == node) - return nullptr; - return DispatcherBase::Get(GetKey(node)); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.h b/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.h deleted file mode 100644 index d022497c77f7e..0000000000000 --- a/onnxruntime/core/codegen/passes/scheduler/tvm_scheduler.h +++ /dev/null @@ -1,128 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/common/common.h" -#include "core/codegen/common/creator.h" -#include "core/codegen/common/registry.h" -#include "core/codegen/passes/utils/codegen_context.h" -#include "core/graph/graph.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// These are current generic TVMOpRule we used. -enum class TVMOpRuleType : int { - Extern = 0, - ComputeReduce = 1, - ComputeRegular = 2, - AlwaysRoot = 3, // for debug - NoRule, -}; - -const std::string& GetTVMOpRule(const tvm::Tensor& tensor); -const std::string& GetTVMOpRule(TVMOpRuleType rule); - -// These are current generic ScheduleType in tvm_codegen -enum class ScheduleType : int { - ScheduleNone = 0, - ScheduleInline = 1, - ScheduleAt = 2, - ScheduleRoot = 3, - ScheduleClosure = 4, -}; - -// Data struct to bundle tvm::Schedule and scheduled tensor -struct ScheduleContext { - ScheduleContext(const tvm::Array& ops) { - schedule = tvm::create_schedule(ops); - } - tvm::Schedule schedule; - std::map scheduled_tensors; -}; - -// Scheduler inserts a tvm::Schedule content to a tvm::Tensor -using Scheduler = codegen::CreatorBase< - const tvm::Tensor&, - const Node*, - tvm_codegen::CodeGenContext&, - ScheduleContext&, - bool>; - -// TVMScheduleDispatcher is the base dispatcher for TVM Schedule Builder -// It checks whether a pair of {tvm::Tensor, Ort Node} satisfying a criteria (in Find) -// and dispatches a corresponding Scheduler. -class TVMScheduleDispatcher : public codegen::DispatcherBase { - public: - TVMScheduleDispatcher(const std::string& name) - : DispatcherBase(name) {} - - virtual ~TVMScheduleDispatcher() = default; - - virtual Scheduler* Find(const tvm::Tensor&, - const Node*, - tvm_codegen::CodeGenContext&) = 0; - - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TVMScheduleDispatcher); -}; - -// Macro returns an Schedulers' dispatcher's name -#define SCHEDULE_DISPATCHER_CLASS(TYPE) \ - TVM##TYPE##Schedulers - -// Macro declares an Schedulers' dispatcher -#define DECLARE_SCHEDULE_DISPATCHER_CLASS(TYPE) \ - class SCHEDULE_DISPATCHER_CLASS(TYPE) : public tvm_codegen::TVMScheduleDispatcher { \ - public: \ - TVM##TYPE##Schedulers(const std::string& name) \ - : TVMScheduleDispatcher(name) {} \ - ~TVM##TYPE##Schedulers() = default; \ - tvm_codegen::Scheduler* Find(const tvm::Tensor&, \ - const Node*, \ - tvm_codegen::CodeGenContext&) override; \ - \ - private: \ - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(TVM##TYPE##Schedulers); \ - }; - -// Common dispatchers are listed here -// For a special pattern, it can be created later. -// One dispatcher is based on Ort OpType -DECLARE_SCHEDULE_DISPATCHER_CLASS(OrtOpType) -// One dispatcher is based on TVMOpRule -DECLARE_SCHEDULE_DISPATCHER_CLASS(TVMOpRule) -// One dispatcher is based Ort NodeArg name -DECLARE_SCHEDULE_DISPATCHER_CLASS(OrtOpName) - -// Scheduler Registry is a registry holds all Schedulers -using TVMScheduleRegistry = codegen::RegistryBase; - -// Macro declares TVM scheduler class -#define DECLARE_TVM_SCHEDULER_CLASS(OP, PRETFIX) \ - DECLARE_CREATOR_CLASS(OP, PRETFIX##Scheduler, \ - const tvm::Tensor&, \ - const Node*, \ - tvm_codegen::CodeGenContext&, \ - tvm_codegen::ScheduleContext&, \ - bool) - -// Macro returns TVM scheduler's name with prefix -#define TVM_SCHEDULER_CLASS(OP, PREFIX) \ - CREATOR_CLASS(OP, PREFIX##Scheduler) - -// Macro returns TVM scheduler's name as string -#define TVM_SCHEDULER_STRING(OP, PREFIX) \ - STRINGIZE(TVM_SCHEDULER_CLASS(OP, PREFIX)) - -// Macro returns TVM scheduler's name with prefix and arch -#define TVM_SCHEDULER_CLASS_EX(OP, PREFIX, ARCH) \ - CREATOR_CLASS(OP, PREFIX##ARCH##Scheduler) - -// Macro declares TVM scheduler class with prefix and arch -#define DECLARE_TVM_SCHEDULER_CLASS_EX(OP, PREFIX, ARCH) \ - DECLARE_TVM_SCHEDULER_CLASS(OP, PREFIX##ARCH) - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/utils/codegen_context.cc b/onnxruntime/core/codegen/passes/utils/codegen_context.cc deleted file mode 100644 index 2f1a59b4a92eb..0000000000000 --- a/onnxruntime/core/codegen/passes/utils/codegen_context.cc +++ /dev/null @@ -1,27 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/utils/codegen_context.h" - -#include "core/codegen/common/common.h" - -namespace onnxruntime { -namespace tvm_codegen { - -CodeGenContext::CodeGenContext( - const codegen::CodeGenHandle* handle) - : handle_(handle), unname_symbol_counter_(0) {} - -tvm::Var CodeGenContext::GetOrCreateDynamicDim(const std::string& name) { - if (dynamic_dims_.count(name) == 0) - dynamic_dims_.emplace(name, tvm::Var(name)); - - return dynamic_dims_.at(name); -} - -std::string CodeGenContext::CreateUnnamedSymbol() { - return "unnamed_" + std::to_string(unname_symbol_counter_++); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/utils/codegen_context.h b/onnxruntime/core/codegen/passes/utils/codegen_context.h deleted file mode 100644 index 641552bd3b2e8..0000000000000 --- a/onnxruntime/core/codegen/passes/utils/codegen_context.h +++ /dev/null @@ -1,44 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/common/handle.h" -#include "core/codegen/common/common.h" -#include "core/common/common.h" -#include "core/framework/data_types.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// CodeGenContext is a data structure involving across passes -// Compiler developers can use it to store meta data -// to support fine-grained control of code generation -class CodeGenContext { - public: - CodeGenContext(const codegen::CodeGenHandle* handle); - - virtual ~CodeGenContext() = default; - - // returns tvm::Var for the dynamic dim - tvm::Var GetOrCreateDynamicDim(const std::string& name); - - const codegen::CodeGenHandle* GetCodeGenHandle() const { - return handle_; - } - - std::string CreateUnnamedSymbol(); - - protected: - std::unordered_map dynamic_dims_; - - const codegen::CodeGenHandle* handle_; - - int unname_symbol_counter_; -}; - -// Add Promote for CodeGenContext -DYNAMIC_PROMOTE(CodeGenContext) - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc b/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc deleted file mode 100644 index 55892974aa33f..0000000000000 --- a/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.cc +++ /dev/null @@ -1,194 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/utils/ort_tvm_utils.h" - -#include "core/codegen/common/profile.h" -#include "core/codegen/passes/utils/codegen_context.h" -#include "core/framework/tensorprotoutils.h" -#include "core/providers/common.h" -#include - -#include - -namespace onnxruntime { -namespace tvm_codegen { - -#define RETURN_DLDATATYPE_IF_MATCH(type_enum, type, type_code) \ - case type_enum: \ - return {type_code, sizeof(type) * 8, 1}; \ - break; - -// DLDataType: {DLDataTypeCode, bits, lanes} -DLDataType ToTvmDLDataType(MLDataType ml_type) { - if (ml_type->IsTensorType()) { - ml_type = ml_type->AsTensorType()->GetElementType(); - } - auto prim_type = ml_type->AsPrimitiveDataType(); - if (prim_type == nullptr) { - ORT_NOT_IMPLEMENTED("converting MLDataType ", ml_type, " to tvm DLDataType is not implemented"); - } - - switch (prim_type->GetDataType()) { - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_INT8, int8_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_UINT8, uint8_t, kDLUInt); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_INT16, int16_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_UINT16, uint16_t, kDLUInt); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_INT32, int32_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_UINT32, uint32_t, kDLUInt); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_INT64, int64_t, kDLInt); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_UINT64, uint64_t, kDLUInt); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_BOOL, bool, kDLUInt); - - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, float, kDLFloat); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_DOUBLE, double, kDLFloat); - RETURN_DLDATATYPE_IF_MATCH(ONNX_NAMESPACE::TensorProto_DataType_FLOAT16, MLFloat16, kDLFloat); - default: - ORT_NOT_IMPLEMENTED("converting MLDataType ", ml_type, " to tvm DLDataType is not implemented"); - } -} - -tvm::Type ToTvmType(ONNX_NAMESPACE::TensorProto_DataType proto_type) { - switch (proto_type) { - // Note that bool is uint1 in tvm, but uint8 in ONNX, so it always require special handling - // case ONNX_NAMESPACE::TensorProto_DataType_BOOL: - // return tvm::UInt(1); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_INT16: - return tvm::Int(16); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_INT32: - return tvm::Int(32); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_INT64: - return tvm::Int(64); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_UINT8: - return tvm::UInt(8); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_UINT16: - return tvm::UInt(16); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_UINT32: - return tvm::UInt(32); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_UINT64: - return tvm::UInt(64); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_FLOAT: - return tvm::Float(32); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_DOUBLE: - return tvm::Float(64); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_INT8: - return tvm::Int(8); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_FLOAT16: - return tvm::Float(16); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_STRING: - ORT_THROW("Casting to and from strings is not supported yet."); /*break;*/ - case ONNX_NAMESPACE::TensorProto_DataType_UNDEFINED: - ORT_THROW("Cast op must have 'to' argument of type DataType"); /*break;*/ - default: - ORT_THROW("Unexpected 'to' argument value: ", proto_type); - } -} - -tvm::Array ShapeToTvmArray(const NodeArg* def, CodeGenContext& ctx) { - ORT_ENFORCE(nullptr != def); - const ONNX_NAMESPACE::TensorShapeProto* shape_proto = def->Shape(); - ORT_ENFORCE(nullptr != shape_proto); - - tvm::Array arr; - for (int i = 0; i < shape_proto->dim_size(); ++i) { - arr.push_back(ShapeDimToTvmDim(shape_proto->dim(i), ctx)); - } - return arr; -} - -tvm::Expr ShapeDimToTvmDim(const ONNX_NAMESPACE::TensorShapeProto_Dimension& dim, CodeGenContext& ctx) { - if (utils::HasDimParam(dim)) { - return ctx.GetOrCreateDynamicDim(dim.dim_param()); - } else if (utils::HasDimValue(dim)) { - return tvm::Expr(gsl::narrow_cast(dim.dim_value())); - } - return ctx.GetOrCreateDynamicDim(ctx.CreateUnnamedSymbol()); -} - -#ifdef CODEGEN_ENABLE_PROFILER -struct event_in_bracket_and_id { - bool in_bracket; - size_t id; -}; -std::unordered_map g_codegen_profiler_event_ids; -std::vector> g_codegen_profiler_events(1024); - -TVM_REGISTER_GLOBAL("tvm.contrib.onnxruntime.profile_event") - .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* ret) { - DLTensor* X = args[0]; - DLTensor* Y = args[1]; - size_t event_id = args[2]; - bool is_begin = args[3]; - if (!is_begin) { - DCHECK(event_id < g_codegen_profiler_event_ids.size()); - profiling::Profiler::Instance().EndTimeAndRecordEvent( - profiling::EventCategory::NODE_EVENT, - g_codegen_profiler_events[event_id].first, - g_codegen_profiler_events[event_id].second); - } - - { - CODEGEN_PROFILER_EVENT("profile_stub"); - int64_t elem_count = 1; - for (int i = 0; i < X->ndim; ++i) { - elem_count *= X->shape[i]; - } - // there's overhead in this copy, so put begin after copy and end before copy - memcpy(static_cast(Y->data) + Y->byte_offset, - static_cast(X->data) + X->byte_offset, - elem_count * X->dtype.bits / 8); - } - - if (is_begin) { - DCHECK(g_codegen_profiler_events.size() > event_id); - DCHECK(!g_codegen_profiler_events[event_id].first.empty()); - DCHECK(g_codegen_profiler_event_ids[g_codegen_profiler_events[event_id].first].id == event_id); - g_codegen_profiler_events[event_id].second = - profiling::Profiler::Instance().StartTime(); - } - }); - -tvm::Tensor ProfileBegin(tvm::Tensor X, const std::string& event_name) { - size_t event_id; - if (g_codegen_profiler_event_ids.count(event_name) == 0) { - event_id = g_codegen_profiler_event_ids.size(); - ORT_ENFORCE(event_id < g_codegen_profiler_events.size()); - } else { - ORT_ENFORCE(!g_codegen_profiler_event_ids[event_name].in_bracket); - event_id = g_codegen_profiler_event_ids[event_name].id; - } - g_codegen_profiler_event_ids[event_name] = {true, event_id}; - g_codegen_profiler_events[event_id].first = event_name; - return topi::detail::make_extern( - {X->shape}, {X->dtype}, {X}, - [&](tvm::Array ins, tvm::Array outs) { - return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.profile_event"), - topi::detail::pack_buffer(ins[0]), - topi::detail::pack_buffer(outs[0]), - gsl::narrow(event_id), - true}); - }, - event_name + "_begin", "", {})[0]; -} - -tvm::Tensor ProfileEnd(tvm::Tensor X, const std::string& event_name) { - ORT_ENFORCE(g_codegen_profiler_event_ids.at(event_name).in_bracket); - g_codegen_profiler_event_ids.at(event_name).in_bracket = false; - size_t event_id = g_codegen_profiler_event_ids.at(event_name).id; - ORT_ENFORCE(event_id < g_codegen_profiler_events.size()); - ORT_ENFORCE(g_codegen_profiler_events[event_id].first == event_name); - return topi::detail::make_extern( - {X->shape}, {X->dtype}, {X}, - [&](tvm::Array ins, tvm::Array outs) { - return topi::detail::call_packed({tvm::Expr("tvm.contrib.onnxruntime.profile_event"), - topi::detail::pack_buffer(ins[0]), - topi::detail::pack_buffer(outs[0]), - gsl::narrow(event_id), - false}); - }, - event_name + "_end", "", {})[0]; -} -#endif - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.h b/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.h deleted file mode 100644 index f13e91a2d5cea..0000000000000 --- a/onnxruntime/core/codegen/passes/utils/ort_tvm_utils.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/common/common.h" -#include "core/framework/data_types.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -class CodeGenContext; - -// Helper function that converts a onnxruntime MLDataType to TVM DLDataType -DLDataType ToTvmDLDataType(MLDataType ml_type); - -tvm::Type ToTvmType(ONNX_NAMESPACE::TensorProto_DataType proto_type); - -tvm::Array ShapeToTvmArray(const NodeArg* def, CodeGenContext& ctx); - -tvm::Expr ShapeDimToTvmDim(const ONNX_NAMESPACE::TensorShapeProto_Dimension& dim, CodeGenContext& ctx); - -#ifdef CODEGEN_ENABLE_PROFILER -// Helper functions to inspect into lowered function -tvm::Tensor ProfileBegin(tvm::Tensor X, const std::string& event_name); - -tvm::Tensor ProfileEnd(tvm::Tensor X, const std::string& event_name); -#endif - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/tiling_2d.cc b/onnxruntime/core/codegen/passes/weight_layout/tiling_2d.cc deleted file mode 100644 index c65132f6d4bca..0000000000000 --- a/onnxruntime/core/codegen/passes/weight_layout/tiling_2d.cc +++ /dev/null @@ -1,105 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/weight_layout/tiling_2d.h" - -#include "core/codegen/passes/utils/codegen_context.h" - -namespace onnxruntime { -namespace tvm_codegen { - -constexpr auto local_name_prefix = "tiling_2d_"; -constexpr int num_bits = 8; - -const std::string WeightLayoutTiling2D::GetKey( - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int vector_width) { - return WeightLayout::GetKey( - local_name_prefix + std::to_string(vector_width), - proto_type, 2, 0.0f); -} - -WeightLayoutTiling2D::WeightLayoutTiling2D( - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int vector_width) - : WeightLayout( - local_name_prefix + std::to_string(vector_width), - proto_type, 2, 0.0f), - vector_width_(vector_width) {} - -CoordTransFunc WeightLayoutTiling2D::ToActual(const tvm::Tensor& /*X*/) const { - return [&](const tvm::Array& nominal_coord) { - ORT_ENFORCE(nominal_coord.size() == 2); - const auto& y = nominal_coord[0]; - const auto& x = nominal_coord[1]; - return tvm::Array{ - x, - y}; - }; -} - -CoordTransFunc WeightLayoutTiling2D::ToNominal(const tvm::Tensor& X) const { - return [&](const tvm::Array& actual_coord) { - ORT_ENFORCE(actual_coord.size() == 2); - ORT_ENFORCE(X->dtype == HalideIR::type_of() || - X->dtype == HalideIR::type_of()); - - int tile_row = (sizeof(int32_t) * num_bits) / X->dtype.bits(); - int tile_col = ((vector_width_ * num_bits) / X->dtype.bits()) / tile_row; - - const auto& x = actual_coord[0]; - const auto& y = actual_coord[1]; - - const int block_dimy = tile_row; - const int block_dimx = tile_col; - - const auto& y0 = y % block_dimy; - const auto& y1 = (y / block_dimy) % block_dimx; - const auto& y2 = y / block_dimy / block_dimx; - - const auto& x0 = x % block_dimx; - const auto& x1 = x / block_dimx; - - return tvm::Array{ - y0 + y2 * block_dimx * block_dimy + x0 * block_dimy, - y1 + x1 * block_dimx}; - }; -} - -tvm::Array WeightLayoutTiling2D::ToActualShape(const tvm::Tensor& X) const { - ORT_ENFORCE(X->dtype == HalideIR::type_of() || - X->dtype == HalideIR::type_of()); - - auto pad_row = tvm::make_const(tvm::Int(32), (vector_width_ * num_bits) / X->dtype.bits()); - auto pad_col = tvm::make_const(tvm::Int(32), vector_width_ / sizeof(int32_t)); - - auto new_shape0 = ((X->shape[1] + pad_col - 1) / pad_col) * pad_col; - auto new_shape1 = ((X->shape[0] + pad_row - 1) / pad_row) * pad_row; - - tvm::Array - new_shape = { - new_shape0, - new_shape1}; - return new_shape; -} - -std::vector WeightLayoutTiling2D::ToActualShape(const Tensor* X) const { - ORT_ENFORCE(X != nullptr); - ORT_ENFORCE(X->Shape().GetDims().size() == 2); - - int pad_row = vector_width_ / X->DataType()->Size(); - int pad_col = vector_width_ / sizeof(int32_t); - - auto old_shape = X->Shape().GetDims(); - auto new_shape0 = (old_shape[1] + pad_col - 1) / pad_col * pad_col; - auto new_shape1 = ((old_shape[0] + pad_row - 1) / pad_row) * pad_row; - - std::vector new_shape = { - new_shape0, - new_shape1}; - - return new_shape; -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/tiling_2d.h b/onnxruntime/core/codegen/passes/weight_layout/tiling_2d.h deleted file mode 100644 index 64334a069f94f..0000000000000 --- a/onnxruntime/core/codegen/passes/weight_layout/tiling_2d.h +++ /dev/null @@ -1,43 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/codegen/passes/weight_layout/weight_layout.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -/* - * \class! WeightLayoutTiling2D - * \breif! Transform 2D weight to 4D by tiling both dimension, - * this layout is used for tensorization. - * [M, N] => [M/Tx, N/Ty, Tx, Ty] - */ - -class WeightLayoutTiling2D : public WeightLayout { - public: - static const std::string GetKey(ONNX_NAMESPACE::TensorProto_DataType proto_type, - int vector_width); - - public: - WeightLayoutTiling2D(ONNX_NAMESPACE::TensorProto_DataType proto_type, - int vector_width); - - ~WeightLayoutTiling2D() = default; - - CoordTransFunc ToNominal(const tvm::Tensor& X) const override; - CoordTransFunc ToActual(const tvm::Tensor& X) const override; - tvm::Array ToActualShape(const tvm::Tensor& X) const override; - std::vector ToActualShape(const Tensor* X) const override; - - private: - int vector_width_; - - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightLayoutTiling2D); -}; - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.cc b/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.cc deleted file mode 100644 index ea8597f7dd89d..0000000000000 --- a/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.cc +++ /dev/null @@ -1,64 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/weight_layout/transpose_2d.h" - -#include "core/codegen/passes/utils/codegen_context.h" - -namespace onnxruntime { -namespace tvm_codegen { - -constexpr auto local_layout_name = "transpose_2d"; - -const std::string WeightLayoutTranspose2D::GetKey( - ONNX_NAMESPACE::TensorProto_DataType proto_type) { - return WeightLayout::GetKey(local_layout_name, proto_type, 2, 0.0f); -} - -WeightLayoutTranspose2D::WeightLayoutTranspose2D( - ONNX_NAMESPACE::TensorProto_DataType proto_type) - : WeightLayout(local_layout_name, proto_type, 2, 0.0f) {} - -CoordTransFunc WeightLayoutTranspose2D::ToActual(const tvm::Tensor& /*X*/) const { - return [&](const tvm::Array& nominal_coord) { - ORT_ENFORCE(nominal_coord.size() == 2); - const auto& y = nominal_coord[0]; - const auto& x = nominal_coord[1]; - return tvm::Array{ - x, - y}; - }; -} - -CoordTransFunc WeightLayoutTranspose2D::ToNominal(const tvm::Tensor& /*X*/) const { - return [&](const tvm::Array& actual_coord) { - ORT_ENFORCE(actual_coord.size() == 2); - const auto& y = actual_coord[0]; - const auto& x = actual_coord[1]; - return tvm::Array{ - x, - y}; - }; -} - -tvm::Array WeightLayoutTranspose2D::ToActualShape(const tvm::Tensor& X) const { - tvm::Array new_shape = { - X->shape[1], - X->shape[0]}; - return new_shape; -} - -std::vector WeightLayoutTranspose2D::ToActualShape(const Tensor* X) const { - ORT_ENFORCE(X != nullptr); - ORT_ENFORCE(X->Shape().GetDims().size() == 2); - auto old_shape = X->Shape().GetDims(); - - std::vector new_shape = { - old_shape[1], - old_shape[0]}; - - return new_shape; -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.h b/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.h deleted file mode 100644 index 65babaaec8dac..0000000000000 --- a/onnxruntime/core/codegen/passes/weight_layout/transpose_2d.h +++ /dev/null @@ -1,33 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include "core/codegen/passes/weight_layout/weight_layout.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// WeightLayoutTranspose2D for transposing a 2D weight -// [W, H] => [H, W] -class WeightLayoutTranspose2D : public WeightLayout { - public: - static const std::string GetKey(ONNX_NAMESPACE::TensorProto_DataType proto_type); - - public: - WeightLayoutTranspose2D(ONNX_NAMESPACE::TensorProto_DataType proto_type); - - ~WeightLayoutTranspose2D() = default; - - CoordTransFunc ToNominal(const tvm::Tensor& X) const override; - CoordTransFunc ToActual(const tvm::Tensor& X) const override; - tvm::Array ToActualShape(const tvm::Tensor& X) const override; - std::vector ToActualShape(const Tensor* X) const override; - - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightLayoutTranspose2D); -}; - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.cc b/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.cc deleted file mode 100644 index b1ddb791a3b3d..0000000000000 --- a/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.cc +++ /dev/null @@ -1,77 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/weight_layout/vertical_stripes_2d.h" - -#include "core/codegen/passes/utils/codegen_context.h" - -namespace onnxruntime { -namespace tvm_codegen { - -constexpr auto local_name_prefix = "vertical_stripe_2d_"; - -const std::string WeightLayoutVerticalStripe2D::GetKey( - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int stripe_width) { - return WeightLayout::GetKey( - local_name_prefix + std::to_string(stripe_width), - proto_type, 2, 0.0f); -} - -WeightLayoutVerticalStripe2D::WeightLayoutVerticalStripe2D( - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int stripe_width) - : WeightLayout( - local_name_prefix + std::to_string(stripe_width), - proto_type, 2, 0.0f), - stripe_width_(stripe_width) { -} - -CoordTransFunc WeightLayoutVerticalStripe2D::ToActual(const tvm::Tensor& /*X*/) const { - return [&](const tvm::Array& nominal_coord) { - ORT_ENFORCE(nominal_coord.size() == 2); - const auto& y = nominal_coord[0]; - const auto& x = nominal_coord[1]; - return tvm::Array{ - x / stripe_width_, - y, - x % stripe_width_}; - }; -} - -CoordTransFunc WeightLayoutVerticalStripe2D::ToNominal(const tvm::Tensor& /*X*/) const { - return [&](const tvm::Array& actual_coord) { - ORT_ENFORCE(actual_coord.size() == 3); - const auto& z = actual_coord[0]; - const auto& y = actual_coord[1]; - const auto& x = actual_coord[2]; - return tvm::Array{ - y, - x + stripe_width_ * z}; - }; -} - -tvm::Array WeightLayoutVerticalStripe2D::ToActualShape(const tvm::Tensor& X) const { - tvm::Array new_shape = { - (X->shape[1] + stripe_width_ - 1) / stripe_width_, - X->shape[0], - stripe_width_}; - return new_shape; -} - -std::vector WeightLayoutVerticalStripe2D::ToActualShape(const Tensor* X) const { - ORT_ENFORCE(X != nullptr); - auto old_shape = X->Shape().GetDims(); - - ORT_ENFORCE(old_shape.size() == 2); - - std::vector new_shape = { - (old_shape[1] + stripe_width_ - 1) / stripe_width_, - old_shape[0], - stripe_width_}; - - return new_shape; -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.h b/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.h deleted file mode 100644 index b9b65025dc014..0000000000000 --- a/onnxruntime/core/codegen/passes/weight_layout/vertical_stripes_2d.h +++ /dev/null @@ -1,40 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/common/common.h" -#include "core/codegen/passes/weight_layout/weight_layout.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -// WeightLayoutVerticalStripe2D for making a 2D weight to 3D, by tiling the lowest (verteical) dimension -// [W, H] => [H/stripe, W, stripe] -class WeightLayoutVerticalStripe2D : public WeightLayout { - public: - static const std::string GetKey( - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int stripe_width); - - public: - WeightLayoutVerticalStripe2D( - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int stripe_width); - - ~WeightLayoutVerticalStripe2D() = default; - - virtual CoordTransFunc ToNominal(const tvm::Tensor& X) const override; - virtual CoordTransFunc ToActual(const tvm::Tensor& X) const override; - tvm::Array ToActualShape(const tvm::Tensor& X) const override; - std::vector ToActualShape(const Tensor* X) const override; - - private: - int stripe_width_; - - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightLayoutVerticalStripe2D); -}; - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.cc b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.cc deleted file mode 100644 index ab3e647fd284a..0000000000000 --- a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.cc +++ /dev/null @@ -1,91 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/codegen/passes/weight_layout/weight_layout.h" - -#include "core/codegen/common/common.h" -#include "core/codegen/common/utils.h" -#include "core/codegen/mti/mti_tvm_utils.h" -#include "core/codegen/passes/utils/ort_tvm_utils.h" - -namespace onnxruntime { -namespace tvm_codegen { - -static tvm::Tensor CreateTVMPlaceholder( - const std::string& name, - HalideIR::Type type, - int dim) { - tvm::Array shape; - if (dim > 0) { - for (int i = 0; i < dim; ++i) { - shape.push_back(tvm::Var(name + "_v" + std::to_string(i))); - } - } else { - shape.push_back(1); - } - return tvm::placeholder(shape, type, name + "_placeholder"); -} - -const std::string WeightLayout::GetKey( - const std::string& name, - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int input_dim, - float pad_zero) { - std::ostringstream key; - key << name << "_type_" << static_cast(proto_type); - key << "_dim_" << input_dim; - key << "_pad_zero_" << pad_zero; - return NormalizeCppName(key.str()); -} - -WeightLayout::WeightLayout( - const std::string& name, - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int input_dim, - float pad_zero) - : name_(GetKey(name, proto_type, input_dim, pad_zero)), - proto_type_(proto_type), - input_dim_(input_dim), - pad_zero_(pad_zero) {} - -const std::string& WeightLayout::Name() const { - return name_; -} - -void WeightLayout::CreateLayoutMarshallingTVMOp(tvm::Array& inputs, - tvm::Array& outputs) const { - HalideIR::Type halide_type = ToTvmType(proto_type_); - - tvm::Tensor placeholder = CreateTVMPlaceholder(name_, halide_type, input_dim_); - inputs.push_back(placeholder); - - tvm::Array new_shape = ToActualShape(placeholder); - CoordTransFunc new_coord_to_old_coord_func = ToNominal(placeholder); - tvm::Expr pad_zero_expr = tvm::make_const(halide_type, pad_zero_); - - tvm::Tensor output = tvm::compute( - new_shape, - [&](const tvm::Array& output_coord) { - tvm::Array output_coord1; - for (const auto& coord : output_coord) - output_coord1.push_back(coord); - auto input_coord = new_coord_to_old_coord_func(output_coord1); - ORT_ENFORCE(input_coord.size() == placeholder->shape.size()); - - if (input_coord.size() > 0) { - auto in_range = (input_coord[0] >= 0) && (input_coord[0] < placeholder->shape[0]); - for (size_t dim = 1; dim < input_coord.size(); ++dim) - in_range = in_range && (input_coord[dim] >= 0) && (input_coord[dim] < placeholder->shape[dim]); - - return tvm::if_then_else(in_range, placeholder(input_coord), pad_zero_expr); - } else { - // scalar - return placeholder(input_coord); - } - }); - - outputs.push_back(output); -} - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h b/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h deleted file mode 100644 index 1b45a38e7e24e..0000000000000 --- a/onnxruntime/core/codegen/passes/weight_layout/weight_layout.h +++ /dev/null @@ -1,68 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once -#include "core/codegen/common/common.h" -#include "core/codegen/common/registry.h" -#include "core/common/common.h" -#include "core/framework/tensor.h" -#include - -namespace onnxruntime { -namespace tvm_codegen { - -using CoordTransFunc = std::function(const tvm::Array&)>; - -// WeightLayout is data layout transformer for weight/initializer -class WeightLayout { - public: - // Static function to return unique string as a key - static const std::string GetKey( - const std::string& name, - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int input_dim, - float pad_zero); - - public: - WeightLayout( - const std::string& name, - ONNX_NAMESPACE::TensorProto_DataType proto_type, - int input_dim, - float pad_zero); - - virtual ~WeightLayout() = default; - - // Return a CoordTransFunc from actual (transformed) coordinate to normial (original) coordinate - virtual CoordTransFunc ToNominal(const tvm::Tensor& X) const = 0; - - // Return a CoordTransFunc from normial (original) coordinate to actual (transformed) coordinate - virtual CoordTransFunc ToActual(const tvm::Tensor& X) const = 0; - - // Return actual (transformed) shape in tvm::Array (tvm_codegen) - virtual tvm::Array ToActualShape(const tvm::Tensor& X) const = 0; - - // Return actual (transformed) shape in vector (ort) - virtual std::vector ToActualShape(const Tensor* X) const = 0; - - // Create Layout Marshalling op in outputs - void CreateLayoutMarshallingTVMOp(tvm::Array& inputs, - tvm::Array& outputs) const; - - // Layout name - const std::string& Name() const; - - protected: - std::string name_; - ONNX_NAMESPACE::TensorProto_DataType proto_type_; - int input_dim_; - float pad_zero_; - - private: - ORT_DISALLOW_COPY_ASSIGNMENT_AND_MOVE(WeightLayout); -}; - -// Weight Layout Registry is a registry holds all WeightLayout -using WeightLayoutRegistry = codegen::RegistryBase; - -} // namespace tvm_codegen -} // namespace onnxruntime diff --git a/onnxruntime/core/framework/utils.cc b/onnxruntime/core/framework/utils.cc index 9eed0249711f9..ff664c2c76703 100644 --- a/onnxruntime/core/framework/utils.cc +++ b/onnxruntime/core/framework/utils.cc @@ -57,7 +57,6 @@ void DestroyStrings(void* p_data, int64_t elements) { bool ProviderIsCpuBased(const std::string& provider_type) { return provider_type == onnxruntime::kCpuExecutionProvider || provider_type == onnxruntime::kDnnlExecutionProvider || - provider_type == onnxruntime::kTvmExecutionProvider || provider_type == onnxruntime::kVitisAIExecutionProvider || provider_type == onnxruntime::kOpenVINOExecutionProvider || provider_type == onnxruntime::kNnapiExecutionProvider || diff --git a/onnxruntime/core/platform/windows/stacktrace.cc b/onnxruntime/core/platform/windows/stacktrace.cc index 3401507ae911f..cc23d70c0f11f 100644 --- a/onnxruntime/core/platform/windows/stacktrace.cc +++ b/onnxruntime/core/platform/windows/stacktrace.cc @@ -30,7 +30,6 @@ class CaptureStackTrace { // Get the stack trace. Currently only enabled for a DEBUG build as we require the DbgHelp library. std::vector GetStackTrace() { #ifndef NDEBUG -// TVM need to run with shared CRT, so won't work with debug helper now #if (defined __cpp_lib_stacktrace) && !(defined _OPSCHEMA_LIB_) && !(defined _GAMING_XBOX) && !(defined ONNXRUNTIME_ENABLE_MEMLEAK_CHECK) return detail::CaptureStackTrace().Trace(); #else diff --git a/onnxruntime/core/providers/get_execution_providers.cc b/onnxruntime/core/providers/get_execution_providers.cc index d2a72c3a38b03..7d8c5525667b9 100644 --- a/onnxruntime/core/providers/get_execution_providers.cc +++ b/onnxruntime/core/providers/get_execution_providers.cc @@ -66,14 +66,6 @@ constexpr ProviderInfo kProvidersInPriorityOrder[] = true, #else false, -#endif - }, - { - kTvmExecutionProvider, -#ifdef USE_TVM - true, -#else - false, #endif }, { diff --git a/onnxruntime/core/providers/provider_factory_creators.h b/onnxruntime/core/providers/provider_factory_creators.h index 41e418d9eb97f..1c62c1a7a8d0b 100644 --- a/onnxruntime/core/providers/provider_factory_creators.h +++ b/onnxruntime/core/providers/provider_factory_creators.h @@ -78,10 +78,6 @@ #include "core/providers/tensorrt/tensorrt_provider_factory_creator.h" #endif -#if defined(USE_TVM) -#include "core/providers/tvm/tvm_provider_factory_creator.h" -#endif - #if defined(USE_VITISAI) #include "core/providers/vitisai/vitisai_provider_factory_creator.h" #endif diff --git a/onnxruntime/core/providers/tvm/custom_logging.cc b/onnxruntime/core/providers/tvm/custom_logging.cc deleted file mode 100644 index 1cabe81f8e87e..0000000000000 --- a/onnxruntime/core/providers/tvm/custom_logging.cc +++ /dev/null @@ -1,52 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. -// -// Enable custom logging - this will cause TVM to use a custom implementation -// of tvm::runtime::detail::LogMessage. We use this to change the absolute -// file path to relative file path. - -#include -#include -#include -#include -#include -#include - -// TODO(agladyshev): Make conditional choice of sep for Windows and UNIX -std::string GetFileName(const std::string& file_path, char sep = '/') { - return {std::next(file_path.begin(), file_path.find_last_of(sep) + 1), - file_path.end()}; -} - -std::string GetTimedLogMessage(const std::string& file, int lineno, const std::string& message) { - std::stringstream sstream; - std::string file_name = GetFileName(file); - std::time_t t = std::time(nullptr); - sstream << "[" -#ifdef _WIN32 -// TODO(vvchernov): use #include instead of and localtime_s() approach for WIN32 -#pragma warning(disable : 4996) // _CRT_SECURE_NO_WARNINGS -#endif - << std::put_time(std::localtime(&t), "%H:%M:%S") -#ifdef _WIN32 -#pragma warning(default : 4996) -#endif - << "][TVM] " - << file_name << ":" << lineno << ": " + message; - return sstream.str(); -} - -namespace tvm { -namespace runtime { -namespace detail { -void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { - throw std::runtime_error(GetTimedLogMessage(file, lineno, message)); -} - -void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { - std::cerr << GetTimedLogMessage(file, lineno, message) << std::endl; -} - -} // namespace detail -} // namespace runtime -} // namespace tvm diff --git a/onnxruntime/core/providers/tvm/hash_alg/hasher.cc b/onnxruntime/core/providers/tvm/hash_alg/hasher.cc deleted file mode 100644 index bb62b41c7aa85..0000000000000 --- a/onnxruntime/core/providers/tvm/hash_alg/hasher.cc +++ /dev/null @@ -1,30 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/common/common.h" - -#include "hasher.h" // NOLINT(build/include_subdir) -#include "hasher_impl.h" // NOLINT(build/include_subdir) - -namespace onnxruntime { -namespace tvm { - -Hasher::Hasher(const std::string& hash_type) { - hasher_ = getHasherImpl(hash_type); -} - -std::string Hasher::hash(const char* src, size_t size) const { - return hasher_->hash(src, size); -} - -std::shared_ptr Hasher::getHasherImpl(const std::string& hash_type) { - if (hash_type == "sha256") { - return std::make_shared(); - } else { - ORT_NOT_IMPLEMENTED("Hasher was not implemented for hash type: ", hash_type); - } - return nullptr; -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/hash_alg/hasher.h b/onnxruntime/core/providers/tvm/hash_alg/hasher.h deleted file mode 100644 index 7b2f50def2e36..0000000000000 --- a/onnxruntime/core/providers/tvm/hash_alg/hasher.h +++ /dev/null @@ -1,32 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef ONNXRUNTIME_CORE_PROVIDERS_TVM_HASH_ALG_HASHER_H_ -#define ONNXRUNTIME_CORE_PROVIDERS_TVM_HASH_ALG_HASHER_H_ - -#include -#include - -namespace onnxruntime { -namespace tvm { -class HasherImpl; - -class Hasher { - public: - Hasher() = delete; - explicit Hasher(const std::string& hash_type); - virtual ~Hasher() = default; - - std::string hash(const char* src, size_t size) const; - - private: - std::shared_ptr getHasherImpl(const std::string& hash_type); - - private: - std::shared_ptr hasher_; -}; - -} // namespace tvm -} // namespace onnxruntime - -#endif // ONNXRUNTIME_CORE_PROVIDERS_TVM_HASH_ALG_HASHER_H_ diff --git a/onnxruntime/core/providers/tvm/hash_alg/hasher_impl.cc b/onnxruntime/core/providers/tvm/hash_alg/hasher_impl.cc deleted file mode 100644 index 20aef66f3046a..0000000000000 --- a/onnxruntime/core/providers/tvm/hash_alg/hasher_impl.cc +++ /dev/null @@ -1,39 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "hasher_impl.h" // NOLINT(build/include_subdir) - -namespace onnxruntime { -namespace tvm { - -std::string HasherSHA256Impl::hash(const char* src, size_t size) const { - return hexdigest(src, size); -} - -void HasherSHA256Impl::digest(const Ipp8u* src, int size, Ipp8u* dst) { - IppStatus status = ippStsNoErr; - const IppsHashMethod* hashMethod = ippsHashMethod_SHA256(); - status = ippsHashMessage_rmf(src, size, dst, hashMethod); - if (ippStsNoErr != status) { - ORT_THROW("Can't get SHA-256..."); - } -} - -std::string HasherSHA256Impl::digest(const char* src, size_t size) { - const int digest_size_byte = IPP_SHA256_DIGEST_BITSIZE / 8; - auto dst = std::unique_ptr(new char[digest_size_byte]); - digest(reinterpret_cast(src), static_cast(size), reinterpret_cast(dst.get())); - return std::string(dst.get(), digest_size_byte); -} - -std::string HasherSHA256Impl::hexdigest(const char* src, size_t size) { - std::string byte_digest = digest(src, size); - std::stringstream ss; - for (char c : byte_digest) { - ss << std::hex << std::setw(2) << std::setfill('0') << (0xff & c); - } - return ss.str(); -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/hash_alg/hasher_impl.h b/onnxruntime/core/providers/tvm/hash_alg/hasher_impl.h deleted file mode 100644 index 6c285dd0c78f3..0000000000000 --- a/onnxruntime/core/providers/tvm/hash_alg/hasher_impl.h +++ /dev/null @@ -1,42 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef ONNXRUNTIME_CORE_PROVIDERS_TVM_HASH_ALG_HASHER_IMPL_H_ -#define ONNXRUNTIME_CORE_PROVIDERS_TVM_HASH_ALG_HASHER_IMPL_H_ - -#include -#include -#include -#include -#include - -#include "core/common/common.h" - -namespace onnxruntime { -namespace tvm { - -class HasherImpl { - public: - HasherImpl() = default; - virtual ~HasherImpl() = default; - - virtual std::string hash(const char* src, size_t size) const = 0; -}; - -class HasherSHA256Impl : public HasherImpl { - public: - HasherSHA256Impl() = default; - virtual ~HasherSHA256Impl() = default; - - std::string hash(const char* src, size_t size) const final; - - private: - static void digest(const Ipp8u* src, int size, Ipp8u* dst); - static std::string digest(const char* src, size_t size); - static std::string hexdigest(const char* src, size_t size); -}; - -} // namespace tvm -} // namespace onnxruntime - -#endif // ONNXRUNTIME_CORE_PROVIDERS_TVM_HASH_ALG_HASHER_IMPL_H_ diff --git a/onnxruntime/core/providers/tvm/symbols.txt b/onnxruntime/core/providers/tvm/symbols.txt deleted file mode 100644 index 8d903acd9ea76..0000000000000 --- a/onnxruntime/core/providers/tvm/symbols.txt +++ /dev/null @@ -1 +0,0 @@ -OrtSessionOptionsAppendExecutionProvider_Tvm diff --git a/onnxruntime/core/providers/tvm/tvm_allocator.cc b/onnxruntime/core/providers/tvm/tvm_allocator.cc deleted file mode 100644 index 4b68f6432e8cc..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_allocator.cc +++ /dev/null @@ -1,29 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include - -#include "tvm_allocator.h" -#include "core/framework/session_state.h" -#include "xpu_data_transfer.h" - -namespace onnxruntime { -namespace tvm { - -void* TVMAllocator::Alloc(size_t size) { - void* p = nullptr; - if (size > 0) { - DLDataType dl_type{kDLInt, 8, 1}; - int err = TVMDeviceAllocDataSpace(ctx, size, ::tvm::runtime::kAllocAlignment, dl_type, reinterpret_cast(&p)); - CHECK_EQ(err, 0); - return p; - } - return p; -} - -void TVMAllocator::Free(void* p) { - TVMDeviceFreeDataSpace(ctx, p); -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_allocator.h b/onnxruntime/core/providers/tvm/tvm_allocator.h deleted file mode 100644 index f3ba544b8ac46..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_allocator.h +++ /dev/null @@ -1,45 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_ALLOCATOR -#define TVM_ALLOCATOR - -#include "core/framework/allocator.h" -#include "tvm_common.h" - -namespace onnxruntime { -namespace tvm { - -#define TVM_ALLOC_ALIGN 128 - -class TVMAllocator : public IAllocator { - public: - TVMAllocator() : TVMAllocator(OrtMemoryInfo("TVM", - OrtAllocatorType::OrtDeviceAllocator, - OrtDevice(OrtDevice::CPU, OrtDevice::MemType::DEFAULT, 0), - 0, - OrtMemTypeDefault)) {} - explicit TVMAllocator(const OrtMemoryInfo& info) - : IAllocator(info) { - switch (info.device.Type()) { - case OrtDevice::CPU: - ctx = {kDLCPU, info.device.Id()}; - break; - case OrtDevice::GPU: - ctx = {kDLVulkan, info.device.Id()}; - break; - default: - ORT_NOT_IMPLEMENTED("Unsupported device"); - break; - } - } - - virtual void* Alloc(size_t size) override; - virtual void Free(void* p) override; - DLDevice ctx; -}; - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_ALLOCATOR diff --git a/onnxruntime/core/providers/tvm/tvm_api.cc b/onnxruntime/core/providers/tvm/tvm_api.cc deleted file mode 100644 index e9a7d002e77c8..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_api.cc +++ /dev/null @@ -1,303 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifdef _WIN32 -#include -#else -#include // glob(), globfree() -#endif -#include // memset() -#include -#include -#include - -#include -#include -#include - -#include "core/common/common.h" -#include - -#include "tvm_api.h" - -namespace onnxruntime { -namespace tvm { - -using TvmIntArray = ::tvm::Array<::tvm::Integer>; -using TvmPackedFunc = ::tvm::PackedFunc; -namespace tvm_rt = ::tvm::runtime; -namespace tvm_rt_vm = tvm_rt::vm; - -TvmModule TVMCompile(const TvmEPOptions& options, - const std::string& onnx_txt, - const std::string& model_path, - int opset, - const TVMTensorShapes& input_shapes) { - ::tvm::Array shapes; - for (size_t i = 0; i < input_shapes.size(); ++i) { - TvmIntArray shape; - for (auto& dim : input_shapes[i]) { - shape.push_back(::tvm::Integer(dim)); - } - shapes.push_back(shape); - } - - const TvmPackedFunc* compile = tvm_rt::Registry::Get("tvm_onnx_import_and_compile"); - ORT_ENFORCE(compile != nullptr, "Unable to retrieve 'tvm_onnx_import_and_compile'."); - TvmModule mod = (*compile)(TVMByteArray{onnx_txt.data(), onnx_txt.size()}, - model_path, - options.executor, - options.target, - options.target_host, - options.opt_level, - opset, - options.freeze_weights, - shapes, - options.to_nhwc, - options.tuning_file_path, - options.tuning_type); - ORT_ENFORCE(mod.get() != nullptr, "Compiled TVM Module is nullptr!"); - return mod; -} - -std::vector glob(const std::string& dir, const std::string& extension) { - std::vector filenames; -#ifdef _WIN32 - std::string pattern = dir + "/*." + extension; - WIN32_FIND_DATA fd; - HANDLE hFind = ::FindFirstFile(pattern.c_str(), &fd); - if (hFind != INVALID_HANDLE_VALUE) { - do { - if (!(fd.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY)) { - filenames.push_back( - dir + - ToUTF8String(PathString{k_preferred_path_separator}) + - fd.cFileName); - } - } while (::FindNextFile(hFind, &fd)); - ::FindClose(hFind); - } -#else - glob_t glob_result; - memset(&glob_result, 0, sizeof(glob_result)); - - std::string pattern = dir + "/*." + extension; - int return_value = glob(pattern.c_str(), GLOB_TILDE, NULL, &glob_result); - ORT_ENFORCE(return_value == 0, "No results of glob for pattern: " + pattern); - - for (size_t i = 0; i < glob_result.gl_pathc; ++i) { - filenames.push_back(std::string(glob_result.gl_pathv[i])); - } - globfree(&glob_result); -#endif - return filenames; -} - -std::string filter_lib_paths(const std::vector& lib_paths, const std::string& lib_ext) { - std::string lib_path; - size_t counter = 0; - for (const auto& path : lib_paths) { - if (path.find("libtvm_runtime." + lib_ext) != std::string::npos || - path.find("liboctomized_model." + lib_ext) != std::string::npos) { - ++counter; - } else { - lib_path = path; - } - } - ORT_ENFORCE((lib_paths.size() - counter) == 1, "It should be only one shared library for model after filtering"); - - return lib_path; -} - -static std::unordered_map str2dev_type = { - {"llvm", 1}, - {"stackvm", 1}, - {"cpu", 1}, - {"c", 1}, - {"hybrid", 1}, - {"composite", 1}, - {"cuda", 2}, - {"nvptx", 2}, - {"cl", 4}, - {"opencl", 4}, - {"sdaccel", 4}, - {"aocl", 5}, - {"aocl_sw_emu", 5}, - {"vulkan", 7}, - {"metal", 8}, - {"vpi", 9}, - {"rocm", 10}, - {"ext_dev", 12}, - {"hexagon", 14}, - {"webgpu", 15}}; - -TvmModule TVMSoCompile(const TvmEPOptions& options) { - const std::string& dir = options.so_folder; -#ifdef _WIN32 - std::string lib_ext = "dll"; -#else - std::string lib_ext = "so"; -#endif - const std::string lib_path = filter_lib_paths(glob(dir, lib_ext), lib_ext); - const std::string consts_path = dir + - ToUTF8String(PathString{k_preferred_path_separator}) + - "consts"; - const auto& ro_paths = glob(dir, "ro"); - ORT_ENFORCE(ro_paths.size() == 1, "It should be only one ro file in folder: " + dir); - const std::string vm_exec_code_path = ro_paths[0]; - - TvmModule lib = TvmModule::LoadFromFile(lib_path); - - std::ifstream code(vm_exec_code_path, std::ios::binary); - std::stringstream ss; - ss << code.rdbuf(); - - auto exec_mod = tvm_rt_vm::Executable::Load(ss.str(), lib); - const tvm_rt_vm::Executable* tmp = exec_mod.as(); - auto exec = tvm_rt::GetObjectPtr(const_cast(tmp)); - exec->LoadLateBoundConstantsFromFile(consts_path); - - auto vm = tvm_rt::make_object(); - vm->LoadExecutable(exec); - - size_t pos = options.target.find(" "); - const std::string dev_type_str = options.target.substr(0, pos); - ORT_ENFORCE(!dev_type_str.empty(), "Device was not found in target string"); - uint64_t dev_type = str2dev_type[dev_type_str]; - const uint64_t cpu_type = str2dev_type["cpu"]; - // Initialize the VM for the specified device. If the device is not a CPU, - // We'll need to add a CPU context to drive it. - int arity; - if (dev_type == cpu_type) { - arity = 3; - } else { - arity = 6; - } - uint64_t alloc_type = uint64_t(tvm_rt_vm::AllocatorType::kPooled); - // TODO(vchernov): multiple devices using and using device with specified id are not supported - // Always use the first device of the specified type. - uint64_t device_id = 0; - std::vector init_vals(arity); - std::vector codes(arity); - tvm_rt::TVMArgsSetter setter(init_vals.data(), codes.data()); - setter(0, dev_type); - setter(1, device_id); - setter(2, alloc_type); - // Also initialize a CPU device context. - if (dev_type != cpu_type) { - setter(3, cpu_type); - setter(4, device_id); - setter(5, alloc_type); - } - tvm_rt::TVMRetValue rv; - // Call the packed func with the init arguments. - vm->GetFunction("init", nullptr).CallPacked(tvm_rt::TVMArgs(init_vals.data(), codes.data(), arity), &rv); - - return TvmModule(vm); -} - -void TVMSetInputs(TvmModule& mod, - std::vector& inds, - std::vector& inputs) { - TvmPackedFunc set_input = mod.GetFunction("set_input", false); - TvmPackedFunc set_input_zero_copy = mod.GetFunction("set_input_zero_copy", false); - for (size_t i = 0; i < inds.size(); ++i) { - if (reinterpret_cast(inputs[i].data) % tvm_rt::kAllocAlignment == 0) { - set_input_zero_copy(inds[i], &inputs[i]); - } else { - set_input(inds[i], &inputs[i]); - } - } -} - -void TVM_VM_SetInputs(TvmModule& mod, - std::vector& inds, - std::vector& inputs) { - size_t num_total_args = inputs.size() + 1; - std::vector tvm_values(num_total_args); - std::vector tvm_type_codes(num_total_args); - ::tvm::runtime::TVMArgsSetter setter(tvm_values.data(), tvm_type_codes.data()); - const std::string func_name = "main"; - setter(0, func_name.c_str()); - for (size_t k = 0; k < num_total_args - 1; ++k) { - setter(inds[k] + 1, &inputs[k]); - } - - TvmPackedFunc set_input = mod.GetFunction("set_input", false); - ::tvm::runtime::TVMRetValue rv; - set_input.CallPacked(::tvm::runtime::TVMArgs(tvm_values.data(), tvm_type_codes.data(), gsl::narrow_cast(num_total_args)), &rv); -} - -void TVMSetOutputsZeroCopy(TvmModule& mod, - std::vector& outputs) { - TvmPackedFunc set_output = mod.GetFunction("set_output_zero_copy", false); - for (size_t i = 0; i < outputs.size(); ++i) { - set_output(i, &outputs[i]); - } -} - -void TVM_VM_SetOutputsZeroCopy(TvmModule& mod, - std::vector& outputs) { - size_t num_total_args = outputs.size() + 1; - std::vector tvm_values(num_total_args); - std::vector tvm_type_codes(num_total_args); - tvm_rt::TVMArgsSetter setter(tvm_values.data(), tvm_type_codes.data()); - const std::string func_name = "main"; - setter(0, func_name.c_str()); - for (size_t k = 0; k < num_total_args - 1; ++k) { - setter(k + 1, &outputs[k]); - } - - TvmPackedFunc set_output = mod.GetFunction("set_outputs", false); - tvm_rt::TVMRetValue rv; - set_output.CallPacked(tvm_rt::TVMArgs(tvm_values.data(), tvm_type_codes.data(), gsl::narrow_cast(num_total_args)), &rv); -} - -void TVMGetOutputs(TvmModule& mod, - std::vector& outputs) { - TvmPackedFunc get_output = mod.GetFunction("get_output", false); - for (size_t i = 0; i < outputs.size(); ++i) { - get_output(i, &outputs[i]); - } -} - -void TVM_VM_GetOutputs(TvmModule& mod, - std::vector& outputs) { - TvmPackedFunc get_output = mod.GetFunction("get_output", false); - for (size_t i = 0; i < outputs.size(); ++i) { - // TODO(vvchernov): think about improvement of memory management - tvm_rt::NDArray output_array = get_output(i); - output_array.CopyTo(&outputs[i]); - } -} - -void TVMGetOutputShapes(TvmModule& mod, - TVMTensorShapes& output_shapes) { - size_t size = output_shapes.size(); - TvmPackedFunc get_output = mod.GetFunction("get_output", false); - for (size_t i = 0; i < size; ++i) { - tvm_rt::NDArray output_array = get_output(i); - tvm_rt::ShapeTuple shape_tuple = output_array.Shape(); - size_t dims_num = shape_tuple.size(); - TensorShapeVector dims; - for (size_t j = 0; j < dims_num; ++j) { - dims.push_back(int64_t(shape_tuple[j])); - } - output_shapes[i] = dims; - } -} - -void TVMRun(TvmModule& mod) { - TvmPackedFunc run = mod.GetFunction("run", false); - ORT_ENFORCE(run != nullptr, "Unable to retrieve graph executor run."); - run(); -} - -void TVM_VM_Run(TvmModule& mod) { - TvmPackedFunc run = mod.GetFunction("invoke", false); - ORT_ENFORCE(run != nullptr, "Unable to retrieve virtual machine invoke."); - run("main"); -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_api.h b/onnxruntime/core/providers/tvm/tvm_api.h deleted file mode 100644 index bbf05f4fc06d9..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_api.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_API_H -#define TVM_API_H - -#include -#include - -#include "tvm_common.h" -#include "tvm_defaults.h" -#include "tvm_ep_options.h" - -namespace onnxruntime { -namespace tvm { - -TvmModule TVMCompile(const TvmEPOptions& options, - const std::string& onnx_txt, - const std::string& model_path, - int opset, - const TVMTensorShapes& input_shapes); -TvmModule TVMSoCompile(const TvmEPOptions& options); - -void TVMSetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); -void TVM_VM_SetInputs(TvmModule& mod, std::vector& inds, std::vector& inputs); -void TVMSetOutputsZeroCopy(TvmModule& mod, std::vector& outputs); -void TVM_VM_SetOutputsZeroCopy(TvmModule& mod, std::vector& outputs); -void TVMGetOutputs(TvmModule& mod, std::vector& outputs); -void TVM_VM_GetOutputs(TvmModule& mod, std::vector& outputs); -void TVMGetOutputShapes(TvmModule& mod, - TVMTensorShapes& output_shapes); -void TVMRun(TvmModule& mod); -void TVM_VM_Run(TvmModule& mod); - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_API_H diff --git a/onnxruntime/core/providers/tvm/tvm_common.h b/onnxruntime/core/providers/tvm/tvm_common.h deleted file mode 100644 index 68e3b6496328a..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_common.h +++ /dev/null @@ -1,22 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_COMMON_H -#define TVM_COMMON_H - -#include -#include - -#include -#include -#include - -namespace onnxruntime { -namespace tvm { - -using TvmModule = ::tvm::runtime::Module; - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_COMMON_H diff --git a/onnxruntime/core/providers/tvm/tvm_compiler.cc b/onnxruntime/core/providers/tvm/tvm_compiler.cc deleted file mode 100644 index 8f4e7e7de9a36..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_compiler.cc +++ /dev/null @@ -1,48 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include - -#include "tvm_compiler.h" -#include "tvm_api.h" - -namespace onnxruntime { -namespace tvm { - -auto TVMCompilerBase::operator()(const TvmEPOptions& options, - const TVMTensorShapes& input_shapes) -> ModulePtr { - if (mod_) { - return mod_; - } - - mod_ = std::make_shared(); - this->compileTVMModule(options, input_shapes); - - return mod_; -} - -TVMCompiler::TVMCompiler(std::string&& onnx_model_str, - const std::string& model_path, - int opset) : onnx_model_str_(std::move(onnx_model_str)), - model_path_(model_path), - opset_(opset) { -} - -void TVMCompiler::compileTVMModule(const TvmEPOptions& options, - const TVMTensorShapes& input_shapes) { - *mod_ = tvm::TVMCompile(options, - onnx_model_str_, - model_path_, - opset_, - input_shapes); - - onnx_model_str_.clear(); -} - -void TVMSoCompiler::compileTVMModule(const TvmEPOptions& options, - [[maybe_unused]] const TVMTensorShapes& input_shapes) { - *mod_ = tvm::TVMSoCompile(options); -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_compiler.h b/onnxruntime/core/providers/tvm/tvm_compiler.h deleted file mode 100644 index bfc73d67aa07f..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_compiler.h +++ /dev/null @@ -1,63 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_COMPILER_H -#define TVM_COMPILER_H - -#include -#include - -#include "tvm_common.h" -#include "tvm_ep_options.h" - -namespace onnxruntime { -namespace tvm { - -class TVMCompilerBase { - public: - using ModulePtr = std::shared_ptr; - - TVMCompilerBase() = default; - virtual ~TVMCompilerBase() = default; - - ModulePtr operator()(const TvmEPOptions& options, - const TVMTensorShapes& input_shapes); - - virtual void compileTVMModule(const TvmEPOptions& options, - const TVMTensorShapes& input_shapes) = 0; - - protected: - ModulePtr mod_; -}; - -class TVMCompiler : public TVMCompilerBase { - public: - TVMCompiler() = delete; - ~TVMCompiler() = default; - - TVMCompiler(std::string&& onnx_model_str, - const std::string& model_path, - int opset); - - void compileTVMModule(const TvmEPOptions& options, - const TVMTensorShapes& input_shapes) final; - - private: - std::string onnx_model_str_; - std::string model_path_; - int opset_; -}; - -class TVMSoCompiler : public TVMCompilerBase { - public: - TVMSoCompiler() = default; - ~TVMSoCompiler() = default; - - void compileTVMModule(const TvmEPOptions& options, - const TVMTensorShapes& input_shapes) final; -}; - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_COMPILER_H diff --git a/onnxruntime/core/providers/tvm/tvm_defaults.h b/onnxruntime/core/providers/tvm/tvm_defaults.h deleted file mode 100644 index 197d1f363c50d..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_defaults.h +++ /dev/null @@ -1,35 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef ONNXRUNTIME_CORE_PROVIDERS_TVM_TVM_DEFAULTS_H_ -#define ONNXRUNTIME_CORE_PROVIDERS_TVM_TVM_DEFAULTS_H_ - -#include - -namespace onnxruntime { -namespace tvm { - -namespace env_vars { -static const std::string kDumpSubgraphs = "ORT_TVM_DUMP_SUBGRAPHS"; -} // namespace env_vars - -constexpr const char* default_executor_type = "vm"; -constexpr const char* vm_executor_type = "vm"; -constexpr const char* graph_executor_type = "graph"; - -constexpr const char* default_target_str = "llvm"; -constexpr const char* llvm_target_str = "llvm"; - -constexpr const char* cpu_target_str = "cpu"; -constexpr const char* gpu_target_str = "gpu"; - -constexpr const char* default_tuning_type = "AutoTVM"; -constexpr const char* autotvm_tuning_type = "AutoTVM"; -constexpr const char* ansor_tuning_type = "Ansor"; - -constexpr const unsigned int default_opt_level = 3; - -} // namespace tvm -} // namespace onnxruntime - -#endif // ONNXRUNTIME_CORE_PROVIDERS_TVM_TVM_DEFAULTS_H_ diff --git a/onnxruntime/core/providers/tvm/tvm_ep_options.cc b/onnxruntime/core/providers/tvm/tvm_ep_options.cc deleted file mode 100644 index 70e99833cd78b..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_ep_options.cc +++ /dev/null @@ -1,273 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include -#include - -#include "core/common/common.h" -#include "core/common/cpuid_info.h" -#include "core/framework/provider_options_utils.h" - -#include "tvm_ep_options.h" - -namespace onnxruntime { -namespace tvm { - -namespace provider_option_names { -constexpr const char* kExecutor = "executor"; -constexpr const char* kSoFolder = "so_folder"; -constexpr const char* kCheckHash = "check_hash"; -constexpr const char* kHashFilePath = "hash_file_path"; -constexpr const char* kTarget = "target"; -constexpr const char* kTargetHost = "target_host"; -constexpr const char* kOptLevel = "opt_level"; -constexpr const char* kFreezeWeights = "freeze_weights"; -constexpr const char* kSetOutputZeroCopy = "set_output_zero_copy"; -constexpr const char* kToNHWC = "to_nhwc"; -constexpr const char* kTuningFilePath = "tuning_file_path"; -constexpr const char* kTuningType = "tuning_type"; -constexpr const char* kInputNames = "input_names"; -constexpr const char* kInputShapes = "input_shapes"; - -static const std::unordered_set valid_keys{ - std::string{kExecutor}, - std::string{kSoFolder}, - std::string{kCheckHash}, - std::string{kHashFilePath}, - std::string{kTarget}, - std::string{kTargetHost}, - std::string{kOptLevel}, - std::string{kFreezeWeights}, - std::string{kSetOutputZeroCopy}, - std::string{kToNHWC}, - std::string{kTuningFilePath}, - std::string{kTuningType}, - std::string{kInputNames}, - std::string{kInputShapes}}; - -} // namespace provider_option_names - -size_t split(const std::string& src, std::vector& dst, char ch) { - dst.clear(); - - size_t pos = src.find(ch); - size_t initialPos = 0; - while (pos != std::string::npos) { - dst.push_back(src.substr(initialPos, pos - initialPos)); - initialPos = pos + 1; - - pos = src.find(ch, initialPos); - } - dst.push_back(src.substr(initialPos, std::min(pos, src.size()) - initialPos + 1)); - - return dst.size(); -} - -TvmEPOptions TvmEPOptionsHelper::FromOptionsString(const char* opt_str) { - std::string settings{opt_str}; - ProviderOptions options; - if (!settings.empty()) { - const std::string& str = settings; - - // tokenize settings - std::regex reg("\\s*,\\s*"); - std::sregex_token_iterator iter(str.begin(), str.end(), reg, -1); - std::sregex_token_iterator iter_end; - std::vector pairs(iter, iter_end); - - ORT_ENFORCE(pairs.size() > 0); - - for (const auto& pair : pairs) { - auto pos_colon = pair.find(':'); - ORT_ENFORCE(pos_colon != std::string::npos, "Invalid key value pair."); - std::string key = pair.substr(0, pos_colon); - std::string value = pair.substr(pos_colon + 1); - - // trim leading and trailing spaces from key/value - key = whitespace_trimming(key); - value = whitespace_trimming(value); - - // Check keys of obtained options - if (tvm::provider_option_names::valid_keys.count(key) == 0) { - ORT_NOT_IMPLEMENTED("TvmOptions: unknown option (", key, ")"); - } - - options[key] = value; - } - } - - return TvmEPOptionsHelper::FromProviderOptions(options); -} - -std::string TvmEPOptionsHelper::whitespace_trimming(const std::string& str) { - const std::string WHITESPACE = " \n\r\t\f\v"; - size_t start = str.find_first_not_of(WHITESPACE); - if (start == std::string::npos) { - return ""; - } else { - size_t end = str.find_last_not_of(WHITESPACE); - ORT_ENFORCE(end != std::string::npos); - return str.substr(start, end + 1); - } -} - -TvmEPOptions TvmEPOptionsHelper::FromProviderOptions(const ProviderOptions& pr_options) { - TvmEPOptions options{}; - - ORT_THROW_IF_ERROR( - ProviderOptionsParser{} - .AddAssignmentToReference(tvm::provider_option_names::kExecutor, options.executor) - .AddAssignmentToReference(tvm::provider_option_names::kSoFolder, options.so_folder) - .AddAssignmentToReference(tvm::provider_option_names::kCheckHash, options.check_hash) - .AddAssignmentToReference(tvm::provider_option_names::kHashFilePath, options.hash_file_path) - .AddAssignmentToReference(tvm::provider_option_names::kTarget, options.target) - .AddAssignmentToReference(tvm::provider_option_names::kTargetHost, options.target_host) - .AddAssignmentToReference(tvm::provider_option_names::kOptLevel, options.opt_level) - .AddAssignmentToReference(tvm::provider_option_names::kFreezeWeights, options.freeze_weights) - .AddAssignmentToReference(tvm::provider_option_names::kSetOutputZeroCopy, options.set_output_zero_copy) - .AddAssignmentToReference(tvm::provider_option_names::kToNHWC, options.to_nhwc) - .AddAssignmentToReference(tvm::provider_option_names::kTuningFilePath, options.tuning_file_path) - .AddAssignmentToReference(tvm::provider_option_names::kTuningType, options.tuning_type) - .AddAssignmentToReference(tvm::provider_option_names::kInputNames, options.input_names_str) - .AddAssignmentToReference(tvm::provider_option_names::kInputShapes, options.input_shapes_str) - .Parse(pr_options)); - - optionsPostprocess(options); - - return options; -} - -void TvmEPOptionsHelper::optionsPostprocess(TvmEPOptions& options) { - setInputShapes(options); - targetPostprocess(options.target); - targetHostPostprocess(options.target, options.target_host); - optLevelPostprocess(options.opt_level); -} - -bool TvmEPOptionsHelper::checkCPUTarget(const std::string& target) { - bool check = target.find("llvm") != std::string::npos; - return check; -} - -bool TvmEPOptionsHelper::checkGPUTarget(const std::string& target) { - bool check = (target.find("cuda") != std::string::npos || - target.find("opencl") != std::string::npos || - target.find("metal") != std::string::npos || - target.find("vulkan") != std::string::npos); - return check; -} - -void TvmEPOptionsHelper::setInputShapes(TvmEPOptions& options) { - if (options.input_names_str.empty() && options.input_shapes_str.empty()) - return; - ORT_ENFORCE(!options.input_names_str.empty() && !options.input_shapes_str.empty(), - "Both provider options \"input_names\" and \"input_shapes\" should be empty or full"); - - std::vector name_set; - std::string trimmed_names = whitespace_trimming(options.input_names_str); - size_t inp_tensors_num = split(trimmed_names, name_set, ' '); - ORT_ENFORCE(inp_tensors_num, "There is no any input tensor names!"); - - std::string trimmed_shapes = whitespace_trimming(options.input_shapes_str); - size_t end_pos = trimmed_shapes.find_last_of(']'); - ORT_ENFORCE(end_pos != std::string::npos, "Invalid string for input shapes. Symbol ] is not found"); - ORT_ENFORCE(end_pos == (trimmed_shapes.size() - 1), - "Invalid string for input shapes. Symbol ] should be last after whitespace trimming"); - - std::vector shape_set; - split(trimmed_shapes, shape_set, ']'); - shape_set.pop_back(); - ORT_ENFORCE(shape_set.size() == inp_tensors_num, - "Number of shapes is not the same as number of input tensor names"); - - for (size_t i = 0; i < inp_tensors_num; ++i) { - size_t pos = shape_set[i].find('['); - ORT_ENFORCE(pos != std::string::npos, "There is no symbol [ as pair for ]"); - std::string numbers = shape_set[i].substr(pos + 1); - std::vector number_set; - ORT_ENFORCE(split(numbers, number_set, ' '), "There is no any number between [ and ] symbols"); - - TensorShapeVector dims; - for (const auto& number : number_set) { - dims.push_back(std::stoi(number)); - } - - options.input_shapes[name_set[i]] = dims; - } -} - -void TvmEPOptionsHelper::targetPostprocess(std::string& target) { - if (target == tvm::cpu_target_str || - target == tvm::llvm_target_str) { - ProcessCPUTarget(target); - } else if (target == tvm::gpu_target_str) { - ProcessGPUTarget(); - } else if (target.empty()) { - ORT_NOT_IMPLEMENTED("target option is empty!"); - } else { - // TODO(vvchernov): extend mechanism of auto-definition of target - // target is gotten from option set up by client - } -} - -void TvmEPOptionsHelper::ProcessCPUTarget(std::string& target) { - const auto& cpu_id_info = CPUIDInfo::GetCPUIDInfo(); - // auto detect from CPU ID - if (cpu_id_info.HasAVX512Skylake()) { - target = tvm::cpu_targets::LLVM_TARGET_SKYLAKE_AVX512; - } else if (cpu_id_info.HasAVX512f()) { - target = tvm::cpu_targets::LLVM_TARGET_AVX512; - } else if (cpu_id_info.HasAVX2()) { - target = tvm::cpu_targets::LLVM_TARGET_AVX2; - } else if (cpu_id_info.HasAVX()) { - target = tvm::cpu_targets::LLVM_TARGET_AVX; - } else { - // TODO(vvchernov): extend mechanism of auto-definition of cpu target - target = tvm::llvm_target_str; - } -} - -void TvmEPOptionsHelper::ProcessGPUTarget() { - ORT_NOT_IMPLEMENTED("GPU target auto-defenition is not implemented now!"); -} - -void TvmEPOptionsHelper::targetHostPostprocess(const std::string& target, std::string& target_host) { - if ((target_host == tvm::cpu_target_str || - target_host == tvm::llvm_target_str) && - target_host != target) { - target_host = target; - } else if (target_host.empty()) { - target_host = target; - } else { - // TODO(vvchernov): extend mechanism of auto-definition of target host - // target host is gotten from option set up by client - } -} - -void TvmEPOptionsHelper::optLevelPostprocess(unsigned int& opt_level) { - if (opt_level < 1) { - opt_level = tvm::default_opt_level; - } -} - -std::ostream& operator<<(std::ostream& out, const TvmEPOptions& options) { - out << "TVM EP options:\n" - << "executor type: " << options.executor << "\n" - << "so_folder: " << options.so_folder << "\n" - << "check_hash: " << options.check_hash << "\n" - << "hash_file_path: " << options.hash_file_path << "\n" - << "target: " << options.target << "\n" - << "target_host: " << options.target_host << "\n" - << "opt level: " << options.opt_level << "\n" - << "freeze weights: " << options.freeze_weights << "\n" - << "set_output_zero_copy: " << options.set_output_zero_copy << "\n" - << "tuning file path: " << options.tuning_file_path << "\n" - << "tuning type: " << options.tuning_type << "\n" - << "convert layout to NHWC: " << options.to_nhwc << "\n" - << "input tensor names: " << options.input_names_str << "\n" - << "input tensor shapes: " << options.input_shapes_str; - return out; -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_ep_options.h b/onnxruntime/core/providers/tvm/tvm_ep_options.h deleted file mode 100644 index 0f2db30a3b304..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_ep_options.h +++ /dev/null @@ -1,76 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_EXECUTION_PROVIDER_OPTIONS_H -#define TVM_EXECUTION_PROVIDER_OPTIONS_H - -#include -#include -#include -#include - -#include "core/framework/provider_options.h" -#include "core/framework/tensor_shape.h" - -#include "tvm_defaults.h" - -namespace onnxruntime { - -namespace tvm { -namespace cpu_targets { -// TODO(vvchernov): avx and avx512 need more careful differentiation for target -const std::string LLVM_TARGET_AVX = "llvm -mcpu=corei7-avx"; -const std::string LLVM_TARGET_AVX2 = "llvm -mcpu=core-avx2"; -const std::string LLVM_TARGET_SKYLAKE_AVX512 = "llvm -mcpu=skylake-avx512"; -const std::string LLVM_TARGET_AVX512 = "llvm -mcpu=skylake-avx512"; -} // namespace cpu_targets - -using TVMTensorShapes = std::vector; -using TVMInputShapes = std::unordered_map; -using InputsInfoMap = std::unordered_map; - -// Information needed to construct an TVM execution provider. -struct TvmEPOptions { - std::string executor{tvm::default_executor_type}; - std::string so_folder{""}; - bool check_hash = false; - std::string hash_file_path{""}; - std::string target{tvm::default_target_str}; - std::string target_host{tvm::default_target_str}; - unsigned int opt_level{tvm::default_opt_level}; - bool freeze_weights = true; - bool to_nhwc = false; - bool set_output_zero_copy = true; - std::string tuning_file_path{""}; - std::string tuning_type{tvm::default_tuning_type}; - std::string input_names_str{""}; - std::string input_shapes_str{""}; - TVMInputShapes input_shapes{}; - TVMTensorShapes output_shapes{}; -}; - -std::ostream& operator<<(std::ostream& out, const TvmEPOptions& options); - -class TvmEPOptionsHelper { - public: - static TvmEPOptions FromOptionsString(const char* options); - static TvmEPOptions FromProviderOptions(const ProviderOptions& options); - static std::string whitespace_trimming(const std::string& str); - - static bool checkCPUTarget(const std::string& target); - static bool checkGPUTarget(const std::string& target); - - private: - static void optionsPostprocess(TvmEPOptions& options); - static void setInputShapes(TvmEPOptions& options); - static void targetPostprocess(std::string& target); - static void ProcessCPUTarget(std::string& target); - static void ProcessGPUTarget(); - static void targetHostPostprocess(const std::string& target, std::string& target_host); - static void optLevelPostprocess(unsigned int& opt_level); -}; - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_EXECUTION_PROVIDER_OPTIONS_H diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider.cc b/onnxruntime/core/providers/tvm/tvm_execution_provider.cc deleted file mode 100644 index 61ee8f899dbf1..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider.cc +++ /dev/null @@ -1,304 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include -#include -#include - -#include "core/common/common.h" -#include "core/framework/execution_provider.h" -#include "core/framework/tensorprotoutils.h" -#include "core/framework/kernel_registry.h" -#include "core/framework/compute_capability.h" -#include "core/graph/graph_proto_serializer.h" -#include "core/platform/env.h" -#include "core/graph/model.h" - -#include "tvm_execution_provider.h" -#include "xpu_data_transfer.h" -#include "tvm_allocator.h" -#include "tvm_utils.h" -#include "tvm_api.h" - -using namespace ONNX_NAMESPACE; - -namespace onnxruntime { -namespace tvm { - -// Information to construct kernel function state. -struct TVMFuncState { - AllocateFunc allocate_func = nullptr; - DestroyFunc release_func = nullptr; - AllocatorHandle allocator = nullptr; - std::shared_ptr compiler = nullptr; -}; - -TvmExecutionProvider::TvmExecutionProvider(const TvmEPOptions& options) - : IExecutionProvider{kTvmExecutionProvider}, - options_{options} { - AllocatorCreationInfo default_memory_info = {[](int) { - return std::make_unique(); - }, - 0, false}; - // Get environment variables - const Env& env_instance = Env::Default(); - - const std::string dump_subgraphs_env = env_instance.GetEnvironmentVar(env_vars::kDumpSubgraphs); - if (!dump_subgraphs_env.empty()) { - dump_subgraphs_ = std::stoi(dump_subgraphs_env) != 0; - } -} - -std::vector TvmExecutionProvider::CreatePreferredAllocators() { - AllocatorCreationInfo default_memory_info = {[](int) { - return std::make_unique(); - }, - 0, false}; - return std::vector{CreateAllocator(default_memory_info)}; // TODO(leca): REVIEW: will CPU EP also use this? -} - -TvmExecutionProvider::~TvmExecutionProvider() {} - -std::vector> -TvmExecutionProvider::GetCapability(const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { - std::vector> result; - if (graph_viewer.IsSubgraph()) { - return result; - } - - const auto& init_tensors = graph_viewer.GetAllInitializedTensors(); - - std::unordered_set required_initializers; - const std::vector& sorted_nodes = graph_viewer.GetNodesInTopologicalOrder(); - std::unique_ptr sub_graph = std::make_unique(); - for (auto& node_idx : sorted_nodes) { - graph_viewer.GetNode(node_idx)->ForEachDef([&required_initializers, &init_tensors](const NodeArg& node_arg, bool is_input) { - if(is_input && init_tensors.count(node_arg.Name())) { - required_initializers.insert(node_arg.Name()); - } }, true); - } - - auto meta_def = std::make_unique<::onnxruntime::IndexedSubGraph::MetaDef>(); - meta_def->name = "TVMStandalone"; - meta_def->domain = "StandaloneTest"; - std::vector inputs; - std::vector outputs; - - for (auto& nodeArgPtr : graph_viewer.GetInputs()) { - inputs.push_back(nodeArgPtr->Name()); - } - - for (auto& name : required_initializers) { - inputs.push_back(name); - } - - for (auto& nodeArgPtr : graph_viewer.GetOutputs()) { - outputs.push_back(nodeArgPtr->Name()); - } - meta_def->inputs = inputs; - meta_def->outputs = outputs; - meta_def->since_version = 1; - meta_def->status = ONNX_NAMESPACE::EXPERIMENTAL; - sub_graph->SetMetaDef(std::move(meta_def)); - sub_graph->nodes = sorted_nodes; - result.push_back( - std::make_unique(std::move(sub_graph))); - return result; -} - -common::Status TvmExecutionProvider::Compile(const std::vector& fused_nodes_and_graphs, - std::vector& node_compute_funcs) { - printOptions(); - for (auto& fused_node_graph : fused_nodes_and_graphs) { - const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; - const Node& fused_node = fused_node_graph.fused_node; - const std::string func_name = fused_node.Name(); - Model model(graph_body_viewer.Name(), true, ModelMetaData(), PathString(), - IOnnxRuntimeOpSchemaRegistryList(), graph_body_viewer.DomainToVersionMap(), - std::vector(), *GetLogger()); - ONNX_NAMESPACE::ModelProto model_proto = model.ToProto(); - // TVM EP is using static lib approach, so invoke serializer directly. - GraphViewerToProto(graph_body_viewer, *model_proto.mutable_graph(), true, true); - auto opset = model_proto.add_opset_import(); - opset->set_domain(kOnnxDomain); - opset->set_version(graph_body_viewer.DomainToVersionMap().at(kOnnxDomain)); - - std::string onnx_model_str; - model_proto.SerializeToString(&onnx_model_str); - compilers_[func_name] = std::make_shared(std::move(onnx_model_str), - ToUTF8String(fused_node.ModelPath().ToPathString()), - int(opset->version())); - InputsInfoMap all_input_shapes; - auto mod = compileModel(func_name, graph_body_viewer, all_input_shapes); - - std::vector output_tensors; - prepareOutputTensors(mod, output_tensors, graph_body_viewer.GetOutputs().size()); - - runners_[func_name] = std::make_shared(options_, mod, all_input_shapes, output_tensors); - - if (dump_subgraphs_) { - std::fstream dump("/tmp/" + func_name + ".onnx", - std::ios::out | std::ios::trunc | std::ios::binary); - model_proto.SerializeToOstream(&dump); - } - - // TODO(vvchernov): implement ops checking and mechanism of gracefully passing the responsibility to other EPs - // if the checking fails due to unsupported op(s) - NodeComputeInfo compute_info = prepareComputeInfo(func_name); - - node_compute_funcs.push_back(compute_info); - } - return Status::OK(); -} - -std::unique_ptr TvmExecutionProvider::GetDataTransfer() const { - // TODO(vvchernov): target or target host? - if (TvmEPOptionsHelper::checkGPUTarget(options_.target)) { - return std::make_unique(); - } else if (TvmEPOptionsHelper::checkCPUTarget(options_.target)) { - return std::make_unique(); - } else { - ORT_NOT_IMPLEMENTED("TVM GetDataTransfer is not implemented for target ", options_.target); - } -} - -void TvmExecutionProvider::printOptions() { - LOGS(*GetLogger(), INFO) << options_; -} - -std::shared_ptr TvmExecutionProvider::compileModel(const std::string& func_name, - const GraphViewer& graph_viewer, - InputsInfoMap& all_input_shapes) { - all_input_shapes.clear(); - - TVMTensorShapes input_shapes; - if (options_.freeze_weights) { - setInputShapesForFreezedNN(graph_viewer, input_shapes, all_input_shapes); - } else { - setInputShapesForUnfreezedNN(graph_viewer, input_shapes, all_input_shapes); - } - - std::shared_ptr mod = compilers_[func_name]->operator()(options_, input_shapes); - - return mod; -} - -void TvmExecutionProvider::setInputShapesForFreezedNN(const GraphViewer& graph_viewer, - TVMTensorShapes& input_shapes, - InputsInfoMap& all_input_shapes) { - const std::vector& all_nodes = graph_viewer.GetInputsIncludingInitializers(); - - size_t indx = 0; - for (const auto* node : all_nodes) { - if (!graph_viewer.IsInitializedTensor(node->Name())) { - TensorShapeVector shape = getInputShape(node); - all_input_shapes[indx++] = shape; - input_shapes.emplace_back(shape); - } - } -} - -void TvmExecutionProvider::setInputShapesForUnfreezedNN(const GraphViewer& graph_viewer, - TVMTensorShapes& input_shapes, - InputsInfoMap& all_input_shapes) { - const std::vector& all_nodes = graph_viewer.GetInputsIncludingInitializers(); - - size_t indx = 0; - for (const auto* node : all_nodes) { - TensorShapeVector shape = getInputShape(node); - all_input_shapes[indx++] = shape; - if (!graph_viewer.IsInitializedTensor(node->Name())) { - input_shapes.emplace_back(shape); - } - } -} - -TensorShapeVector TvmExecutionProvider::getInputShape(const NodeArg* node) { - TensorShapeVector shape; - const auto& node_name = node->Name(); - if (!options_.input_shapes.empty() && - options_.input_shapes.count(node_name)) { - shape = options_.input_shapes[node_name]; - } else { - shape = convertTensorShape(*node->Shape()); - } - - return shape; -} - -TensorShapeVector TvmExecutionProvider::convertTensorShape(const TensorShapeProto& shape_proto) { - TensorShape ort_shape = utils::GetTensorShapeFromTensorShapeProto(shape_proto); - size_t dims = ort_shape.NumDimensions(); - - TensorShapeVector shape(dims); - for (size_t j = 0; j < dims; ++j) { - int64_t dim = int64_t(ort_shape[j]); - ORT_ENFORCE(dim > 0, "Input dimension is not positive value (dim = " + std::to_string(dim) + "). " + - "Please use provider options to setup input_names and input_shapes"); - shape[j] = dim; - } - - return shape; -} - -void TvmExecutionProvider::prepareOutputTensors(const std::shared_ptr& mod, - std::vector& output_tensors, - size_t num) { - ORT_ENFORCE(mod != nullptr, "TVM module is not compiled"); - output_tensors.clear(); - options_.output_shapes.clear(); - options_.output_shapes.resize(num); - - if (options_.executor != "vm") { - TVMGetOutputShapes(*mod, options_.output_shapes); - } - - for (auto& output_shape : options_.output_shapes) { - DLTensor t; - // Draft for tensor, correct data is defined during inference - t.strides = nullptr; - t.byte_offset = 0; - t.data = nullptr; - if (options_.executor == "vm") { - t.ndim = 0; - t.shape = nullptr; - } else { - t.ndim = output_shape.size(); - t.shape = output_shape.data(); - } - - output_tensors.push_back(t); - } -} - -NodeComputeInfo TvmExecutionProvider::prepareComputeInfo(const std::string& func_name) { - NodeComputeInfo compute_info; - compute_info.create_state_func = std::bind(&TvmExecutionProvider::createStateFunc, - this, - std::placeholders::_1, - std::placeholders::_2); - - compute_info.release_state_func = [](FunctionState state) { - if (state) - delete static_cast(state); - }; - - compute_info.compute_func = *runners_[func_name].get(); - - return compute_info; -} - -int TvmExecutionProvider::createStateFunc(ComputeContext* context, FunctionState* state) { - auto* state_ptr = new TVMFuncState(); - *state_ptr = {context->allocate_func, - context->release_func, - context->allocator_handle, - compilers_[context->node_name]}; - // TODO(vvchernov): Who and when release state? - *state = state_ptr; - return 0; -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_execution_provider.h b/onnxruntime/core/providers/tvm/tvm_execution_provider.h deleted file mode 100644 index baa46c593fa07..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_execution_provider.h +++ /dev/null @@ -1,71 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_EXECUTION_PROVIDER_H -#define TVM_EXECUTION_PROVIDER_H - -#include -#include -#include -#include - -#include "core/common/logging/logging.h" -#include "core/framework/execution_provider.h" -#include - -#include "tvm_compiler.h" -#include "tvm_runner.h" - -namespace onnxruntime { -class Graph; -class NodeArg; -namespace tvm { - -class TvmExecutionProvider : public IExecutionProvider { - using Compiler = TVMCompilerBase; - using Compilers = std::unordered_map>; - using Runner = TVMRunner; - using Runners = std::unordered_map>; - - public: - explicit TvmExecutionProvider(const TvmEPOptions& options); - virtual ~TvmExecutionProvider(); - - std::vector> - GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override; - - common::Status Compile(const std::vector& fused_nodes_and_graphs, - std::vector& node_compute_funcs) override; - std::unique_ptr GetDataTransfer() const override; - std::vector CreatePreferredAllocators() override; - - private: - void printOptions(); - std::shared_ptr compileModel(const std::string& func_name, - const GraphViewer& graph_viewer, - InputsInfoMap& inputs_info); // NOLINT - void setInputShapesForFreezedNN(const GraphViewer& graph_viewer, - TVMTensorShapes& input_shapes, // NOLINT - InputsInfoMap& all_input_shapes); // NOLINT - void setInputShapesForUnfreezedNN(const GraphViewer& graph_viewer, - TVMTensorShapes& input_shapes, // NOLINT - InputsInfoMap& all_input_shapes); // NOLINT - TensorShapeVector getInputShape(const NodeArg* node); - TensorShapeVector convertTensorShape(const ONNX_NAMESPACE::TensorShapeProto& shape_proto); - void prepareOutputTensors(const std::shared_ptr& mod, - std::vector& output_tensors, size_t num); // NOLINT - NodeComputeInfo prepareComputeInfo(const std::string& func_name); - int createStateFunc(ComputeContext*, FunctionState*); - - private: - TvmEPOptions options_; - Compilers compilers_; - Runners runners_; - bool dump_subgraphs_ = false; -}; - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_EXECUTION_PROVIDER_H diff --git a/onnxruntime/core/providers/tvm/tvm_provider_factory.cc b/onnxruntime/core/providers/tvm/tvm_provider_factory.cc deleted file mode 100644 index d83fd8ee4d1cb..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_provider_factory.cc +++ /dev/null @@ -1,53 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include -#include - -#include "core/providers/tvm/tvm_provider_factory.h" -#include "core/session/abi_session_options_impl.h" - -#include "tvm_execution_provider.h" -#include "tvm_provider_factory_creator.h" -#include "tvm_so_execution_provider.h" // NOLINT(build/include_subdir) - -namespace onnxruntime { - -struct TvmProviderFactory : IExecutionProviderFactory { - TvmProviderFactory(const tvm::TvmEPOptions& options) : options_{options} {} - ~TvmProviderFactory() = default; - - std::unique_ptr CreateProvider() override { - std::unique_ptr provider = nullptr; - if (options_.so_folder != "") { - ORT_ENFORCE(options_.executor == "vm", - "Only virtual machine module is compiled from shared lib and dependences!"); - provider = std::move(std::make_unique(options_)); - } else { - provider = std::move(std::make_unique(options_)); - } - - return provider; - } - - private: - tvm::TvmEPOptions options_; -}; - -std::shared_ptr TVMProviderFactoryCreator::Create(const char* opt_str) { - tvm::TvmEPOptions options = tvm::TvmEPOptionsHelper::FromOptionsString(opt_str); - return std::make_shared(options); -} - -std::shared_ptr TVMProviderFactoryCreator::Create(const tvm::TvmEPOptions& options) { - return std::make_shared(options); -} -} // namespace onnxruntime - -ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tvm, - _In_ OrtSessionOptions* options, - _In_ const char* opt_str) { - onnxruntime::tvm::TvmEPOptions tvm_options = onnxruntime::tvm::TvmEPOptionsHelper::FromOptionsString(opt_str); - options->provider_factories.push_back(onnxruntime::TVMProviderFactoryCreator::Create(tvm_options)); - return nullptr; -} diff --git a/onnxruntime/core/providers/tvm/tvm_provider_factory_creator.h b/onnxruntime/core/providers/tvm/tvm_provider_factory_creator.h deleted file mode 100644 index 2d7e06b5b7c59..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_provider_factory_creator.h +++ /dev/null @@ -1,19 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#pragma once - -#include - -#include "core/providers/providers.h" - -namespace onnxruntime { -namespace tvm { -struct TvmEPOptions; -} - -struct TVMProviderFactoryCreator { - static std::shared_ptr Create(const tvm::TvmEPOptions& options); - static std::shared_ptr Create(const char* params); -}; -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_runner.cc b/onnxruntime/core/providers/tvm/tvm_runner.cc deleted file mode 100644 index 5dda8f5bf9c3e..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_runner.cc +++ /dev/null @@ -1,26 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/graph/model.h" -#include "core/framework/tensorprotoutils.h" - -#include "tvm_runner.h" - -using namespace ONNX_NAMESPACE; -namespace onnxruntime { -namespace tvm { - -TVMRunner::TVMRunner(const TvmEPOptions& options, - const std::shared_ptr& mod, - const InputsInfoMap& inputs_info, - const std::vector& output_tensors) { - runner_ = getTVMRunnerImpl(mod, options, inputs_info, output_tensors); -} - -common::Status TVMRunner::operator()(FunctionState state, const OrtApi* /*api*/, OrtKernelContext* context) { - Ort::KernelContext ctx(context); - return runner_->run(ctx); -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_runner.h b/onnxruntime/core/providers/tvm/tvm_runner.h deleted file mode 100644 index 4b7349ee3405e..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_runner.h +++ /dev/null @@ -1,34 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_RUNNER_H -#define TVM_RUNNER_H - -#include -#include - -#include "tvm_runner_impl.h" - -namespace onnxruntime { -namespace tvm { - -class TVMRunner { - public: - TVMRunner() = delete; - virtual ~TVMRunner() = default; - - TVMRunner(const TvmEPOptions& options, - const std::shared_ptr& mod, - const InputsInfoMap& inputs_info, - const std::vector& output_tensor); - - common::Status operator()(FunctionState state, const OrtApi* api, OrtKernelContext* context); - - private: - std::shared_ptr runner_; -}; - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_TVM_RUNNER_H diff --git a/onnxruntime/core/providers/tvm/tvm_runner_impl.cc b/onnxruntime/core/providers/tvm/tvm_runner_impl.cc deleted file mode 100644 index c88de2652f14b..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_runner_impl.cc +++ /dev/null @@ -1,175 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/framework/tensorprotoutils.h" - -#include "tvm_runner_impl.h" -#include "tvm_utils.h" -#include "tvm_api.h" - -namespace onnxruntime { -namespace tvm { - -/* ------------------------------------ RunnerImplFactory ----------------------------- */ - -std::shared_ptr getTVMRunnerImpl(const std::shared_ptr& mod, - const TvmEPOptions& options, - const InputsInfoMap& inputs_info, - const std::vector output_tensors) { - const std::string& name = options.executor; - if (name == "graph") { - return std::make_shared(mod, inputs_info, options.output_shapes, - output_tensors, options.set_output_zero_copy); - } else if (name == "vm") { - return std::make_shared(mod, inputs_info, options.output_shapes, - output_tensors, options.set_output_zero_copy); - } - return nullptr; -} - -/* ------------------------------------ RunnerImpl ------------------------------------ */ - -RunnerImpl::RunnerImpl(const std::shared_ptr& mod, - const InputsInfoMap& inputs_info, - const TVMTensorShapes output_shapes, - const std::vector output_tensors, - bool set_output_zero_copy) : mod_(mod), - inputs_info_(inputs_info), - output_shapes_(output_shapes), - output_tensors_(output_tensors), - set_output_zero_copy_(set_output_zero_copy) { -} - -void RunnerImpl::convert_input_tensors2dl_tensors(Ort::KernelContext& context, - std::vector& dst, - std::vector& dst_inds) { - size_t num = inputs_info_.size(); - dst.reserve(num); - dst_inds.reserve(num); - for (auto& info : inputs_info_) { - // TODO(vvchernov): decomposition declaration only available with -std=c++1z or -std=gnu++1z - auto& i = info.first; - auto& shape = info.second; - - auto input_tensor = context.GetInput(i); - ORT_ENFORCE(input_tensor.IsTensor()); - - auto ort_device_type = input_tensor.GetTensorMemoryInfo().GetDeviceType(); - const auto tensor_type = input_tensor.GetTensorTypeAndShapeInfo().GetElementType(); - - DLTensor t; - t.device = GetDLDevice(ort_device_type); - t.dtype = GetDataType(tensor_type); - t.strides = nullptr; - t.byte_offset = 0; - t.data = const_cast(input_tensor.GetTensorRawData()); - t.ndim = shape.size(); - t.shape = shape.data(); - dst.emplace_back(t); - dst_inds.push_back(i); - } -} - -void RunnerImpl::add_device_type_data2output_tensors(Ort::KernelContext& context) { - size_t num_outputs = output_tensors_.size(); - for (auto i = 0u; i < num_outputs; i++) { - // setup output tensor property - auto output_tensor = context.GetOutput(i, - output_shapes_[i].data(), - output_shapes_[i].size()); - ORT_ENFORCE(output_tensor.IsTensor()); - - output_tensors_[i].device = - GetDLDevice(output_tensor.GetTensorMemoryInfo().GetDeviceType()); - output_tensors_[i].dtype = - GetDataType(output_tensor.GetTensorTypeAndShapeInfo().GetElementType()); - output_tensors_[i].data = output_tensor.GetTensorMutableRawData(); - } -} - -/* ------------------------------------ GERunnerImpl ------------------------------------ */ - -GERunnerImpl::GERunnerImpl(const std::shared_ptr& mod, - const InputsInfoMap& inputs_info, - const TVMTensorShapes output_shapes, - const std::vector output_tensors, - bool set_output_zero_copy) : RunnerImpl(mod, inputs_info, output_shapes, output_tensors, set_output_zero_copy) { -} - -void GERunnerImpl::set_input(Ort::KernelContext& context) { - std::vector inds; - std::vector dl_tensors_inputs; - convert_input_tensors2dl_tensors(context, dl_tensors_inputs, inds); - - tvm::TVMSetInputs(*mod_, inds, dl_tensors_inputs); -} - -void GERunnerImpl::connect_output_tensors2ort(Ort::KernelContext& context) { - add_device_type_data2output_tensors(context); -} - -void GERunnerImpl::set_output_zero_copy() { - tvm::TVMSetOutputsZeroCopy(*mod_, output_tensors_); -} - -void GERunnerImpl::run() { - tvm::TVMRun(*mod_); -} - -void GERunnerImpl::get_outputs() { - tvm::TVMGetOutputs(*mod_, output_tensors_); -} - -/* ------------------------------------ VMRunnerImpl ------------------------------------ */ - -VMRunnerImpl::VMRunnerImpl(const std::shared_ptr& mod, - const InputsInfoMap& inputs_info, - const TVMTensorShapes output_shapes, - const std::vector output_tensors, - bool set_output_zero_copy) : RunnerImpl(mod, inputs_info, output_shapes, output_tensors, set_output_zero_copy) { -} - -void VMRunnerImpl::set_input(Ort::KernelContext& context) { - std::vector inds; - std::vector dl_tensors_inputs; - convert_input_tensors2dl_tensors(context, dl_tensors_inputs, inds); - - tvm::TVM_VM_SetInputs(*mod_, inds, dl_tensors_inputs); -} - -void VMRunnerImpl::connect_output_tensors2ort(Ort::KernelContext& context) { - // TODO(vvchernov): try to find more flexible solution - if (!probe_infer_) { - infer_once_to_get_output_shapes(); - } - - add_device_type_data2output_tensors(context); -} - -void VMRunnerImpl::set_output_zero_copy() { - tvm::TVM_VM_SetOutputsZeroCopy(*mod_, output_tensors_); -} - -void VMRunnerImpl::run() { - tvm::TVM_VM_Run(*mod_); -} - -void VMRunnerImpl::get_outputs() { - tvm::TVM_VM_GetOutputs(*mod_, output_tensors_); -} - -void VMRunnerImpl::infer_once_to_get_output_shapes() { - run(); - size_t num_outputs = output_tensors_.size(); - // TODO(vvchernov): check it - output_shapes_.resize(num_outputs); - tvm::TVMGetOutputShapes(*mod_, output_shapes_); - for (size_t i = 0; i < num_outputs; ++i) { - output_tensors_[i].ndim = output_shapes_[i].size(); - output_tensors_[i].shape = output_shapes_[i].data(); - } - probe_infer_ = true; -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_runner_impl.h b/onnxruntime/core/providers/tvm/tvm_runner_impl.h deleted file mode 100644 index 8c325303673b6..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_runner_impl.h +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_RUNNER_IMPL_H -#define TVM_RUNNER_IMPL_H - -#include -#include -#include - -#include "core/framework/func_api.h" -#include "core/session/onnxruntime_cxx_api.h" - -#include "tvm_common.h" -#include "tvm_ep_options.h" - -namespace onnxruntime { -namespace tvm { - -class RunnerImpl { - public: - RunnerImpl() = delete; - RunnerImpl(const std::shared_ptr& mod, - const InputsInfoMap& inputs_info, - const TVMTensorShapes output_shapes, - const std::vector tensors_outputs, - bool set_output_zero_copy); - virtual ~RunnerImpl() = default; - - virtual common::Status run(Ort::KernelContext& context) { - common::Status res; - if (set_output_zero_copy_) { - res = run_without_output_copying(context); - } else { - res = run_with_output_copying(context); - } - return res; - } - - virtual common::Status run_without_output_copying(Ort::KernelContext& context) { - set_input(context); - connect_output_tensors2ort(context); - set_output_zero_copy(); - run(); - - return Status::OK(); - } - - virtual common::Status run_with_output_copying(Ort::KernelContext& context) { - set_input(context); - connect_output_tensors2ort(context); - run(); - get_outputs(); - - return Status::OK(); - } - - virtual void set_input(Ort::KernelContext& context) = 0; - virtual void connect_output_tensors2ort(Ort::KernelContext& context) = 0; - virtual void set_output_zero_copy() = 0; - virtual void run() = 0; - virtual void get_outputs() = 0; - - protected: - void convert_input_tensors2dl_tensors(Ort::KernelContext& context, - std::vector& dst, - std::vector& dst_inds); - void add_device_type_data2output_tensors(Ort::KernelContext& context); - - protected: - std::shared_ptr mod_; - InputsInfoMap inputs_info_; - TVMTensorShapes output_shapes_; - std::vector output_tensors_; - bool set_output_zero_copy_; -}; - -class GERunnerImpl : public RunnerImpl { - public: - GERunnerImpl() = delete; - GERunnerImpl(const std::shared_ptr& mod, - const InputsInfoMap& inputs_info, - const TVMTensorShapes output_shapes, - const std::vector tensors_outputs, - bool set_output_zero_copy); - virtual ~GERunnerImpl() = default; - - void set_input(Ort::KernelContext& context) final; - void connect_output_tensors2ort(Ort::KernelContext& context) final; - void set_output_zero_copy() final; - void run() final; - void get_outputs() final; -}; - -class VMRunnerImpl : public RunnerImpl { - public: - VMRunnerImpl() = delete; - VMRunnerImpl(const std::shared_ptr& mod, - const InputsInfoMap& inputs_info, - const TVMTensorShapes output_shapes, - const std::vector tensors_outputs, - bool set_output_zero_copy); - virtual ~VMRunnerImpl() = default; - - void set_input(Ort::KernelContext& context) final; - void connect_output_tensors2ort(Ort::KernelContext& context) final; - void set_output_zero_copy() final; - void run() final; - void get_outputs() final; - - private: - void infer_once_to_get_output_shapes(); - - private: - bool probe_infer_ = false; -}; - -std::shared_ptr getTVMRunnerImpl(const std::shared_ptr& mod, - const TvmEPOptions& options, - const InputsInfoMap& inputs_info, - const std::vector output_tensors); - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_TVM_RUNNER_IMPL_H diff --git a/onnxruntime/core/providers/tvm/tvm_so_execution_provider.cc b/onnxruntime/core/providers/tvm/tvm_so_execution_provider.cc deleted file mode 100644 index 029f25d6f292a..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_so_execution_provider.cc +++ /dev/null @@ -1,284 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include -#include -#include - -#include "core/framework/execution_provider.h" -#include "core/framework/tensorprotoutils.h" -#include "core/framework/kernel_registry.h" -#include "core/framework/compute_capability.h" -#include "core/platform/env.h" -#include "core/graph/model.h" - -#include "tvm_so_execution_provider.h" // NOLINT(build/include_subdir) -#include "xpu_data_transfer.h" // NOLINT(build/include_subdir) -#include "tvm_allocator.h" // NOLINT(build/include_subdir) -#include "tvm_utils.h" // NOLINT(build/include_subdir) -#include "tvm_api.h" // NOLINT(build/include_subdir) -#ifdef USE_TVM_HASH -#include "hash_alg/hasher.h" // NOLINT(build/include_subdir) -#endif - -using ONNX_NAMESPACE::TensorShapeProto; - -namespace onnxruntime { -namespace tvm { - -// Information to construct kernel function state. -struct TVMFuncState { - AllocateFunc allocate_func = nullptr; - DestroyFunc release_func = nullptr; - AllocatorHandle allocator = nullptr; - std::shared_ptr compiler = nullptr; -}; - -TvmSoExecutionProvider::TvmSoExecutionProvider(const TvmEPOptions& options) - : IExecutionProvider{kTvmExecutionProvider}, - options_{options} { - // Get environment variables - const Env& env_instance = Env::Default(); - - const std::string dump_subgraphs_env = env_instance.GetEnvironmentVar(env_vars::kDumpSubgraphs); - ORT_ENFORCE(dump_subgraphs_env.empty(), "TVM EP processing shared lib does not support subgraphs"); -} - -std::vector TvmSoExecutionProvider::CreatePreferredAllocators() { - AllocatorCreationInfo default_memory_info = {[](int) { - return std::make_unique(); - }, - 0, false}; - return std::vector{CreateAllocator(default_memory_info)}; -} - -TvmSoExecutionProvider::~TvmSoExecutionProvider() {} - -std::vector> -TvmSoExecutionProvider::GetCapability(const GraphViewer& graph_viewer, - const IKernelLookup& /*kernel_lookup*/) const { - std::vector> result; - if (graph_viewer.IsSubgraph()) { - return result; - } - - const auto& init_tensors = graph_viewer.GetAllInitializedTensors(); - - std::unordered_set required_initializers; - const std::vector& sorted_nodes = graph_viewer.GetNodesInTopologicalOrder(); - std::unique_ptr sub_graph = std::make_unique(); - for (auto& node_idx : sorted_nodes) { - graph_viewer.GetNode(node_idx)->ForEachDef([&required_initializers, &init_tensors](const NodeArg& node_arg, bool is_input) { - if (is_input && init_tensors.count(node_arg.Name())) { - required_initializers.insert(node_arg.Name()); - } }, true); - } - - auto meta_def = std::make_unique<::onnxruntime::IndexedSubGraph::MetaDef>(); - meta_def->name = "TVMStandalone"; - meta_def->domain = "StandaloneTest"; - std::vector inputs; - std::vector outputs; - - for (auto& nodeArgPtr : graph_viewer.GetInputs()) { - inputs.push_back(nodeArgPtr->Name()); - } - - for (auto& name : required_initializers) { - inputs.push_back(name); - } - - for (auto& nodeArgPtr : graph_viewer.GetOutputs()) { - outputs.push_back(nodeArgPtr->Name()); - } - meta_def->inputs = inputs; - meta_def->outputs = outputs; - meta_def->since_version = 1; - meta_def->status = ONNX_NAMESPACE::EXPERIMENTAL; - sub_graph->SetMetaDef(std::move(meta_def)); - sub_graph->nodes = sorted_nodes; - result.push_back( - std::make_unique(std::move(sub_graph))); - return result; -} - -common::Status TvmSoExecutionProvider::Compile(const std::vector& fused_nodes_and_graphs, - std::vector& node_compute_funcs) { - printOptions(); - for (auto& fused_node_graph : fused_nodes_and_graphs) { - const GraphViewer& graph_body_viewer = fused_node_graph.filtered_graph; - const Node& fused_node = fused_node_graph.fused_node; -#ifdef USE_TVM_HASH - if (options_.check_hash) { - ORT_ENFORCE(checkHash(ToUTF8String(fused_node.ModelPath().ToPathString())), - "Hash check shows that used tuning files were not obtained for the given onnx-model"); - } -#endif - const std::string func_name = fused_node.Name(); - - compilers_[func_name] = std::make_shared(); - InputsInfoMap all_input_shapes; - auto mod = compileModel(func_name, graph_body_viewer, all_input_shapes); - - std::vector output_tensors(graph_body_viewer.GetOutputs().size()); - prepareOutputTensors(output_tensors); - - runners_[func_name] = std::make_shared(options_, mod, all_input_shapes, output_tensors); - - // TODO(vvchernov): implement ops checking and mechanism of gracefully passing the responsibility to other EPs - // if the checking fails due to unsupported op(s) - NodeComputeInfo compute_info = prepareComputeInfo(func_name); - - node_compute_funcs.push_back(compute_info); - } - return Status::OK(); -} - -std::unique_ptr TvmSoExecutionProvider::GetDataTransfer() const { - // TODO(vvchernov): target or target host? - if (TvmEPOptionsHelper::checkGPUTarget(options_.target)) { - return std::make_unique(); - } else if (TvmEPOptionsHelper::checkCPUTarget(options_.target)) { - return std::make_unique(); - } else { - ORT_NOT_IMPLEMENTED("TVM GetDataTransfer is not implemented for target ", options_.target); - } -} - -void TvmSoExecutionProvider::printOptions() { - LOGS(*GetLogger(), INFO) << options_; -} - -#ifdef USE_TVM_HASH -bool TvmSoExecutionProvider::checkHash(const std::string& onnx_path) const { - auto hasher = Hasher("sha256"); - std::string onnx_str = readFromFile(onnx_path); - std::string onnx_hash = hasher.hash(onnx_str.c_str(), onnx_str.size()); - onnx_str.clear(); - std::string hash; - if (options_.hash_file_path.empty()) { - // TODO(vvchernov): align hash file name with OctoML team - hash = readFromFile(options_.so_folder + "/hash.txt"); - } else { - hash = readFromFile(options_.hash_file_path); - } - return onnx_hash == hash; -} -#endif - -std::shared_ptr TvmSoExecutionProvider::compileModel(const std::string& func_name, - const GraphViewer& graph_viewer, - InputsInfoMap& all_input_shapes) { - all_input_shapes.clear(); - - TVMTensorShapes input_shapes; - if (options_.freeze_weights) { - setInputShapesForFreezedNN(graph_viewer, input_shapes, all_input_shapes); - } else { - setInputShapesForUnfreezedNN(graph_viewer, input_shapes, all_input_shapes); - } - - std::shared_ptr mod = compilers_[func_name]->operator()(options_, input_shapes); - - return mod; -} - -void TvmSoExecutionProvider::setInputShapesForFreezedNN(const GraphViewer& graph_viewer, - TVMTensorShapes& input_shapes, - InputsInfoMap& all_input_shapes) { - const std::vector& all_nodes = graph_viewer.GetInputsIncludingInitializers(); - - size_t indx = 0; - for (const auto* node : all_nodes) { - if (!graph_viewer.IsInitializedTensor(node->Name())) { - TensorShapeVector shape = getInputShape(node); - all_input_shapes[indx++] = shape; - input_shapes.emplace_back(shape); - } - } -} - -void TvmSoExecutionProvider::setInputShapesForUnfreezedNN(const GraphViewer& graph_viewer, - TVMTensorShapes& input_shapes, - InputsInfoMap& all_input_shapes) { - const std::vector& all_nodes = graph_viewer.GetInputsIncludingInitializers(); - - size_t indx = 0; - for (const auto* node : all_nodes) { - TensorShapeVector shape = getInputShape(node); - all_input_shapes[indx++] = shape; - if (!graph_viewer.IsInitializedTensor(node->Name())) { - input_shapes.emplace_back(shape); - } - } -} - -TensorShapeVector TvmSoExecutionProvider::getInputShape(const NodeArg* node) { - TensorShapeVector shape; - const auto& node_name = node->Name(); - if (!options_.input_shapes.empty() && - options_.input_shapes.count(node_name)) { - shape = options_.input_shapes[node_name]; - } else { - shape = convertTensorShape(*node->Shape()); - } - - return shape; -} - -TensorShapeVector TvmSoExecutionProvider::convertTensorShape(const TensorShapeProto& shape_proto) { - TensorShape ort_shape = utils::GetTensorShapeFromTensorShapeProto(shape_proto); - size_t dims = ort_shape.NumDimensions(); - - TensorShapeVector shape(dims); - for (size_t j = 0; j < dims; ++j) { - int64_t dim = int64_t(ort_shape[j]); - ORT_ENFORCE(dim > 0, "Input dimension is not positive value (dim = " + std::to_string(dim) + "). " + - "Please use provider options to setup input_names and input_shapes"); - shape[j] = dim; - } - - return shape; -} - -void TvmSoExecutionProvider::prepareOutputTensors(std::vector& output_tensors) { - for (DLTensor& t : output_tensors) { - // Draft for tensor, correct data is defined during inference - t.strides = nullptr; - t.byte_offset = 0; - t.data = nullptr; - t.ndim = 0; - t.shape = nullptr; - } -} - -NodeComputeInfo TvmSoExecutionProvider::prepareComputeInfo(const std::string& func_name) { - NodeComputeInfo compute_info; - compute_info.create_state_func = std::bind(&TvmSoExecutionProvider::createStateFunc, - this, - std::placeholders::_1, - std::placeholders::_2); - - compute_info.release_state_func = [](FunctionState state) { - if (state) - delete static_cast(state); - }; - - compute_info.compute_func = *runners_[func_name].get(); - - return compute_info; -} - -int TvmSoExecutionProvider::createStateFunc(ComputeContext* context, FunctionState* state) { - auto* state_ptr = new TVMFuncState(); - *state_ptr = {context->allocate_func, - context->release_func, - context->allocator_handle, - compilers_[context->node_name]}; - // TODO(vvchernov): Who and when release state? - *state = state_ptr; - return 0; -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/tvm_so_execution_provider.h b/onnxruntime/core/providers/tvm/tvm_so_execution_provider.h deleted file mode 100644 index d3840f46b5b55..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_so_execution_provider.h +++ /dev/null @@ -1,72 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef ONNXRUNTIME_CORE_PROVIDERS_TVM_TVM_SO_EXECUTION_PROVIDER_H_ -#define ONNXRUNTIME_CORE_PROVIDERS_TVM_TVM_SO_EXECUTION_PROVIDER_H_ - -#include -#include -#include -#include - -#include "core/common/logging/logging.h" -#include "core/framework/execution_provider.h" -#include - -#include "tvm_compiler.h" // NOLINT(build/include_subdir) -#include "tvm_runner.h" // NOLINT(build/include_subdir) - -namespace onnxruntime { -class Graph; -class NodeArg; -namespace tvm { - -class TvmSoExecutionProvider : public IExecutionProvider { - using Compiler = TVMCompilerBase; - using Compilers = std::unordered_map>; - using Runner = TVMRunner; - using Runners = std::unordered_map>; - - public: - explicit TvmSoExecutionProvider(const TvmEPOptions& options); - virtual ~TvmSoExecutionProvider(); - - std::vector> - GetCapability(const onnxruntime::GraphViewer& graph, - const IKernelLookup& /*kernel_lookup*/) const override; - - common::Status Compile(const std::vector& fused_nodes_and_graphs, - std::vector& node_compute_funcs) override; - std::unique_ptr GetDataTransfer() const override; - std::vector CreatePreferredAllocators() override; - - private: - void printOptions(); -#ifdef USE_TVM_HASH - bool checkHash(const std::string& onnx_path) const; -#endif - std::shared_ptr compileModel(const std::string& func_name, - const GraphViewer& graph_viewer, - InputsInfoMap& inputs_info); // NOLINT - void setInputShapesForFreezedNN(const GraphViewer& graph_viewer, - TVMTensorShapes& input_shapes, // NOLINT - InputsInfoMap& all_input_shapes); // NOLINT - void setInputShapesForUnfreezedNN(const GraphViewer& graph_viewer, - TVMTensorShapes& input_shapes, // NOLINT - InputsInfoMap& all_input_shapes); // NOLINT - TensorShapeVector getInputShape(const NodeArg* node); - TensorShapeVector convertTensorShape(const ONNX_NAMESPACE::TensorShapeProto& shape_proto); - void prepareOutputTensors(std::vector& output_tensors); // NOLINT - NodeComputeInfo prepareComputeInfo(const std::string& func_name); - int createStateFunc(ComputeContext*, FunctionState*); - - private: - TvmEPOptions options_; - Compilers compilers_; - Runners runners_; -}; - -} // namespace tvm -} // namespace onnxruntime - -#endif // ONNXRUNTIME_CORE_PROVIDERS_TVM_TVM_SO_EXECUTION_PROVIDER_H_ diff --git a/onnxruntime/core/providers/tvm/tvm_utils.cc b/onnxruntime/core/providers/tvm/tvm_utils.cc deleted file mode 100644 index e0a5b566835c8..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_utils.cc +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_UTILS_H -#define TVM_UTILS_H - -#include -#include - -#include "tvm_utils.h" // NOLINT(build/include_subdir) - -namespace onnxruntime { -namespace tvm { - -std::string readFromFile(const std::string& file_path) { - std::string str; - - std::ifstream t(file_path); - t.seekg(0, std::ios::end); - str.reserve(t.tellg()); - t.seekg(0, std::ios::beg); - - str.assign((std::istreambuf_iterator(t)), - std::istreambuf_iterator()); - return str; -} - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_UTILS_H diff --git a/onnxruntime/core/providers/tvm/tvm_utils.h b/onnxruntime/core/providers/tvm/tvm_utils.h deleted file mode 100644 index de77368c715b9..0000000000000 --- a/onnxruntime/core/providers/tvm/tvm_utils.h +++ /dev/null @@ -1,70 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef TVM_UTILS_H -#define TVM_UTILS_H - -#include - -#include "tvm_common.h" - -#include "core/session/onnxruntime_cxx_api.h" -#include "core/framework/ortdevice.h" -#include "core/common/common.h" - -namespace onnxruntime { -namespace tvm { - -inline DLDataType GetDataType(ONNXTensorElementDataType type) { - switch (type) { - case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT8: - return {kDLUInt, 8, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT8: - return {kDLInt, 8, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT16: - return {kDLUInt, 16, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT16: - return {kDLInt, 16, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT32: - return {kDLUInt, 32, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT32: - return {kDLInt, 32, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_UINT64: - return {kDLUInt, 64, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_INT64: - return {kDLInt, 64, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT16: - return {kDLFloat, 16, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT: - return {kDLFloat, 32, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_DOUBLE: - return {kDLFloat, 64, 1}; - case ONNX_TENSOR_ELEMENT_DATA_TYPE_BOOL: - return {kDLUInt, 1, 1}; - default: - ORT_NOT_IMPLEMENTED("Unsupported data type"); - } -} - -inline DLDevice GetDLDevice(OrtMemoryInfoDeviceType device_type) { - DLDevice context; - switch (device_type) { - case OrtDevice::CPU: - context = {kDLCPU, 0}; - break; - case OrtDevice::GPU: - context = {kDLVulkan, 0}; - break; - default: - ORT_NOT_IMPLEMENTED("Unsupported device"); - break; - } - return context; -} - -std::string readFromFile(const std::string& file_path); - -} // namespace tvm -} // namespace onnxruntime - -#endif // TVM_UTILS_H diff --git a/onnxruntime/core/providers/tvm/xpu_data_transfer.cc b/onnxruntime/core/providers/tvm/xpu_data_transfer.cc deleted file mode 100644 index 4011dee7b7b7f..0000000000000 --- a/onnxruntime/core/providers/tvm/xpu_data_transfer.cc +++ /dev/null @@ -1,84 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#include "core/framework/tensor.h" - -#include "xpu_data_transfer.h" -#include "tvm_utils.h" - -namespace onnxruntime { -namespace tvm { - -XPUDataTransfer::XPUDataTransfer() { -} - -XPUDataTransfer::~XPUDataTransfer() { -} - -bool XPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { - return (src_device.Type() == OrtDevice::CPU && dst_device.Type() == OrtDevice::CPU) || - (src_device.Type() == OrtDevice::GPU || dst_device.Type() == OrtDevice::GPU); -} - -common::Status XPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { - size_t bytes = src.SizeInBytes(); - const void* src_data = src.DataRaw(); - void* dst_data = dst.MutableDataRaw(); - const auto src_device_type = src.Location().device.Type(); - const auto dst_device_type = dst.Location().device.Type(); - - if ((src_device_type == OrtDevice::CPU) && (dst_device_type == OrtDevice::CPU)) { - if (src_data == dst_data) { - // no need copying as both pointers are referring to same piece of memory. - return Status::OK(); - } - memcpy(dst_data, src_data, bytes); - } else { - DLTensor tvm_src, tvm_dst; - DLDataType dl_type{kDLInt, 8, 1}; - std::vector shape{int64_t(bytes)}; - // Construct source DLTensor - tvm_src.device = GetDLDevice(static_cast(src_device_type)); - tvm_src.dtype = dl_type; - tvm_src.strides = nullptr; - tvm_src.byte_offset = 0; - tvm_src.data = const_cast(src_data); - tvm_src.ndim = 1; - tvm_src.shape = shape.data(); - // Construct destination DLTensor - tvm_dst.device = GetDLDevice(static_cast(dst_device_type)); - tvm_dst.dtype = dl_type; - tvm_dst.strides = nullptr; - tvm_dst.byte_offset = 0; - tvm_dst.data = dst_data; - tvm_dst.ndim = 1; - tvm_dst.shape = shape.data(); - // Copying from src to dst - TVMDeviceCopyDataFromTo(&tvm_src, &tvm_dst, nullptr); - } - return Status::OK(); -} - -DLDevice XPUDataTransfer::get_context(const OrtDevice& device) const { - return GetDLDevice(static_cast(device.Type())); -} - -bool TvmCPUDataTransfer::CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const { - return src_device.Type() == OrtDevice::CPU && dst_device.Type() == OrtDevice::CPU; -} - -common::Status TvmCPUDataTransfer::CopyTensor(const Tensor& src, Tensor& dst) const { - const void* src_data = src.DataRaw(); - void* dst_data = dst.MutableDataRaw(); - if (src_data == dst_data) { - // no need copying as both pointers are referring to same piece of memory. - return Status::OK(); - } - // Copying only happens between two same size tensors. - ORT_ENFORCE(src.SizeInBytes() == dst.SizeInBytes()); - memcpy(dst_data, src_data, src.SizeInBytes()); - return Status::OK(); -} - -} // namespace tvm -} // namespace onnxruntime diff --git a/onnxruntime/core/providers/tvm/xpu_data_transfer.h b/onnxruntime/core/providers/tvm/xpu_data_transfer.h deleted file mode 100644 index a2cf55b241bb1..0000000000000 --- a/onnxruntime/core/providers/tvm/xpu_data_transfer.h +++ /dev/null @@ -1,38 +0,0 @@ -// Copyright (c) Microsoft Corporation. All rights reserved. -// Licensed under the MIT License. - -#ifndef XPU_DATA_TRANSFER -#define XPU_DATA_TRANSFER - -#include "core/framework/data_transfer.h" -#include "tvm_common.h" - -namespace onnxruntime { -namespace tvm { - -class XPUDataTransfer : public IDataTransfer { - public: - XPUDataTransfer(); - ~XPUDataTransfer(); - - bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; - - // Dumpen MSVC warning about not fully overriding - using IDataTransfer::CopyTensor; - common::Status CopyTensor(const Tensor& src, Tensor& dst) const override; - DLDevice get_context(const OrtDevice& device) const; -}; - -class TvmCPUDataTransfer : public IDataTransfer { - public: - TvmCPUDataTransfer() = default; - // Dampen MSVC warning about not fully overriding CopyTensor - using IDataTransfer::CopyTensor; - bool CanCopy(const OrtDevice& src_device, const OrtDevice& dst_device) const override; - common::Status CopyTensor(const Tensor& src, Tensor& dst) const override; -}; - -} // namespace tvm -} // namespace onnxruntime - -#endif // XPU_DATA_TRANSFER diff --git a/onnxruntime/core/session/provider_registration.cc b/onnxruntime/core/session/provider_registration.cc index 8c512c561ea8c..8bea347c85280 100644 --- a/onnxruntime/core/session/provider_registration.cc +++ b/onnxruntime/core/session/provider_registration.cc @@ -205,15 +205,6 @@ ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Nnapi, } #endif -#ifndef USE_TVM -ORT_API_STATUS_IMPL(OrtSessionOptionsAppendExecutionProvider_Tvm, - _In_ OrtSessionOptions* options, _In_ const char* settings) { - ORT_UNUSED_PARAMETER(options); - ORT_UNUSED_PARAMETER(settings); - return CreateNotEnabledStatus("Tvm"); -} -#endif - #ifdef __cplusplus } #endif diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index 4d9583be0ef0f..54accf7ed88f3 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -1125,16 +1125,6 @@ std::unique_ptr CreateExecutionProviderInstance( LOGS_DEFAULT(WARNING) << "Failed to create " << type << ". Please refer https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements to ensure all dependencies are met."; } } -#endif - } else if (type == kTvmExecutionProvider) { -#if USE_TVM - onnxruntime::tvm::TvmEPOptions info{}; - const auto it = provider_options_map.find(type); - if (it != provider_options_map.end()) { - info = onnxruntime::tvm::TvmEPOptionsHelper::FromProviderOptions(it->second); - } - - return onnxruntime::TVMProviderFactoryCreator::Create(info)->CreateProvider(); #endif } else if (type == kVitisAIExecutionProvider) { #ifdef USE_VITISAI diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h index fa4916f8922f2..b71081bf20efc 100644 --- a/onnxruntime/python/onnxruntime_pybind_state_common.h +++ b/onnxruntime/python/onnxruntime_pybind_state_common.h @@ -24,7 +24,7 @@ struct OrtStatus { char msg[1]; // a null-terminated string }; -#define BACKEND_DEVICE BACKEND_PROC BACKEND_DNNL BACKEND_OPENVINO BACKEND_TVM BACKEND_OPENBLAS BACKEND_MIGRAPHX BACKEND_ACL BACKEND_ARMNN BACKEND_DML BACKEND_CANN BACKEND_WEBGPU +#define BACKEND_DEVICE BACKEND_PROC BACKEND_DNNL BACKEND_OPENVINO BACKEND_OPENBLAS BACKEND_MIGRAPHX BACKEND_ACL BACKEND_ARMNN BACKEND_DML BACKEND_CANN BACKEND_WEBGPU #include "core/session/onnxruntime_cxx_api.h" #include "core/providers/providers.h" #include "core/providers/provider_factory_creators.h" @@ -75,12 +75,6 @@ struct OrtStatus { #define BACKEND_OPENVINO "" #endif -#ifdef USE_TVM -#define BACKEND_TVM "-TVM" -#else -#define BACKEND_TVM "" -#endif - #if USE_OPENBLAS #define BACKEND_OPENBLAS "-OPENBLAS" #else @@ -141,9 +135,6 @@ extern std::string openvino_device_type; } } // namespace onnxruntime #endif -#ifdef USE_TVM -#include "core/providers/tvm/tvm_ep_options.h" -#endif #ifdef USE_ACL #include "core/providers/acl/acl_provider_factory.h" #endif @@ -444,10 +435,6 @@ std::shared_ptr CreateExecutionProviderFactory_MIGrap std::shared_ptr CreateExecutionProviderFactory_MIGraphX(int device_id); std::shared_ptr CreateExecutionProviderFactory_Cuda(const OrtCUDAProviderOptions* params); std::shared_ptr CreateExecutionProviderFactory_Dnnl(const OrtDnnlProviderOptions* params); -#ifdef USE_TVM -std::shared_ptr CreateExecutionProviderFactory_Tvm(const tvm::TvmEPOptions& info); -std::shared_ptr CreateExecutionProviderFactory_Tvm(const char* params); -#endif std::shared_ptr CreateExecutionProviderFactory_ACL(bool enable_fast_math); std::shared_ptr CreateExecutionProviderFactory_ArmNN(int use_arena); std::shared_ptr CreateExecutionProviderFactory_DML(int device_id); diff --git a/onnxruntime/python/providers/tvm/__init__.py b/onnxruntime/python/providers/tvm/__init__.py deleted file mode 100644 index 4bcbc0bfef586..0000000000000 --- a/onnxruntime/python/providers/tvm/__init__.py +++ /dev/null @@ -1,10 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See License.txt in the project root for -# license information. -# -------------------------------------------------------------------------- -""" -JIT interface implementing packed functions that -import and compile frontend models -""" -from .ort import ANSOR_TYPE, AUTO_TVM_TYPE, onnx_compile # noqa: F401 diff --git a/onnxruntime/python/providers/tvm/extend_python_file.py b/onnxruntime/python/providers/tvm/extend_python_file.py deleted file mode 100644 index 65902619f8150..0000000000000 --- a/onnxruntime/python/providers/tvm/extend_python_file.py +++ /dev/null @@ -1,54 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See License.txt in the project root for -# license information. -# -------------------------------------------------------------------------- - -import argparse -import textwrap - - -def rewrite_target_file(target): - with open(target, "a") as f: - f.write( - textwrap.dedent( - """ - import warnings - - try: - # This import is necessary in order to delegate the loading of libtvm.so to TVM. - import tvm - except ImportError as e: - warnings.warn( - f"WARNING: Failed to import TVM, libtvm.so was not loaded. More details: {e}" - ) - try: - # Working between the C++ and Python parts in TVM EP is done using the PackedFunc and - # Registry classes. In order to use a Python function in C++ code, it must be registered in - # the global table of functions. Registration is carried out through the JIT interface, - # so it is necessary to call special functions for registration. - # To do this, we need to make the following import. - import onnxruntime.providers.tvm - except ImportError as e: - warnings.warn( - f"WARNING: Failed to register python functions to work with TVM EP. More details: {e}" - ) - """ - ) - ) - - -def main(): - parser = argparse.ArgumentParser() - parser.add_argument( - "--target_file", - type=str, - required=True, - help="Path to the file to be expanded.", - ) - args = parser.parse_args() - rewrite_target_file(args.target_file) - - -if __name__ == "__main__": - main() diff --git a/onnxruntime/python/providers/tvm/ort.py b/onnxruntime/python/providers/tvm/ort.py deleted file mode 100644 index be6d23f39c532..0000000000000 --- a/onnxruntime/python/providers/tvm/ort.py +++ /dev/null @@ -1,140 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See License.txt in the project root for -# license information. -# -------------------------------------------------------------------------- - -import collections -import copy -import logging -import os - -import onnx -import tvm -from tvm import auto_scheduler, autotvm, relay -from tvm.contrib import graph_executor -from tvm.relay import vm - -log = logging.getLogger("tvm_ep") - -ANSOR_TYPE = "Ansor" -AUTO_TVM_TYPE = "AutoTVM" - - -@tvm.register_func("tvm_onnx_import_and_compile") -def onnx_compile( - model_string, - model_path, - executor, - target, - target_host, - opt_level, - opset, - freeze_params, - input_shapes, - nhwc=False, - tuning_logfile="", - tuning_type=AUTO_TVM_TYPE, -): - def get_tvm_executor(irmod, executor, target, params): - if executor == "vm": - log.info("Build TVM virtual machine") - lib = vm.compile( - copy.deepcopy(irmod), - target, - params=params, - ) - elif executor == "graph": - log.info("Build TVM graph executor") - lib = relay.build(irmod, target=target, params=params) - else: - log.error(f'Executor type {executor} is unsupported. Only "vm" and "graph" types are supported') - return None - return lib - - model = onnx.load_model_from_string(bytes(model_string)) - if model_path: - base_dir = os.path.dirname(os.path.abspath(model_path)) - onnx.load_external_data_for_model(model, base_dir) - - # Collect only feed input names from all input names - all_input_names = [node.name for node in model.graph.input] - all_initializer = [node.name for node in model.graph.initializer] - net_feed_input_names = list(set(all_input_names) - set(all_initializer)) - - # Match names and input shapes - all_input_mapping = [(name, shape) for (name, shape) in zip(all_input_names, input_shapes)] - # Using an ordereddict maintains input ordering. - shape_dict = collections.OrderedDict(all_input_mapping) - # Get only feed input pairs - feed_shape_dict = {} - for name in net_feed_input_names: - feed_shape_dict[name] = shape_dict[name] - - irmod, params = relay.frontend.from_onnx(model, feed_shape_dict, opset=opset, freeze_params=freeze_params) - irmod = relay.transform.DynamicToStatic()(irmod) - - # Tuning file can be set by client through ep options - if not tuning_logfile: - tuning_logfile = os.getenv("AUTOTVM_TUNING_LOG") - lib = None - tvm_target = tvm.target.Target(target, host=target_host) - if tuning_logfile: - if tuning_type == ANSOR_TYPE: - desired_layouts = { - "nn.conv2d": ["NHWC", "default"], - "nn.conv2d_transpose": ["NHWC", "default"], - "nn.upsampling": ["NHWC", "default"], - "vision.roi_align": ["NHWC", "default"], - } - log.info("Use tuning file from %s: %s", ANSOR_TYPE, tuning_logfile) - with auto_scheduler.ApplyHistoryBest(tuning_logfile): # noqa: SIM117 - with tvm.transform.PassContext( - opt_level=opt_level, - config={ - "relay.backend.use_auto_scheduler": True, - "relay.FuseOps.max_depth": 30, - }, - ): - if nhwc: - seq = tvm.transform.Sequential( - [ - relay.transform.InferType(), - relay.transform.ConvertLayout(desired_layouts), - relay.transform.EliminateCommonSubexpr(), - relay.transform.FoldConstant(), - ] - ) - irmod = seq(irmod) - lib = get_tvm_executor(irmod, executor, tvm_target, params) - elif tuning_type == AUTO_TVM_TYPE: - with relay.build_config(opt_level=opt_level): - log.info("Use tuning file from %s: %s", AUTO_TVM_TYPE, tuning_logfile) - with autotvm.apply_history_best(tuning_logfile): - lib = get_tvm_executor(irmod, executor, tvm_target, params) - else: - log.error( - f"Tuning log type {tuning_type} is unsupported. " - f"Only {ANSOR_TYPE} and {AUTO_TVM_TYPE} types are supported" - ) - return None - else: - with tvm.transform.PassContext(opt_level=opt_level): - lib = get_tvm_executor(irmod, executor, tvm_target, params) - - if lib is None: - return None - - ctx = tvm.device(target, 0) - if executor == "vm": - m = tvm.runtime.vm.VirtualMachine(lib, ctx) - elif executor == "graph": - m = graph_executor.GraphModule(lib["default"](ctx)) - else: - print( - f"ERROR: Executor type {executor} is unsupported. ", - 'Only "vm" and "graph" types are supported', - ) - return None - - return m.module diff --git a/onnxruntime/test/framework/function_test.cc b/onnxruntime/test/framework/function_test.cc index fa3545ef27d72..180a75a64c10e 100644 --- a/onnxruntime/test/framework/function_test.cc +++ b/onnxruntime/test/framework/function_test.cc @@ -580,13 +580,7 @@ TEST(FunctionTest, TestInlinedLocalFunctionNotRemoved) { // myfun is not removed because it was claimed by InternalTestingEP model_proto = session_object.GetModel().ToProto(); -#ifdef USE_TVM - // TVM EP takes the whole graph and optimizes it within its own framework. - // It does not retain the original graph. - ASSERT_EQ(0, model_proto.functions_size()); -#else ASSERT_EQ(1, model_proto.functions_size()); -#endif } TEST(FunctionTest, TestInlinedFunctionDoesNotReserrectNonExistingArgs) { diff --git a/onnxruntime/test/platform/windows/stacktrace_test.cc b/onnxruntime/test/platform/windows/stacktrace_test.cc index de09dbcf270a9..9b1840f4b5d65 100644 --- a/onnxruntime/test/platform/windows/stacktrace_test.cc +++ b/onnxruntime/test/platform/windows/stacktrace_test.cc @@ -14,7 +14,6 @@ namespace onnxruntime { namespace test { using namespace ::testing; -// TVM is not working with StackTrace now. #if !defined(ORT_NO_EXCEPTIONS) TEST(StacktraceTests, BasicTests) { auto result = ::onnxruntime::GetStackTrace(); diff --git a/onnxruntime/test/python/onnxruntime_test_python_tvm.py b/onnxruntime/test/python/onnxruntime_test_python_tvm.py deleted file mode 100644 index 0080bf53520f2..0000000000000 --- a/onnxruntime/test/python/onnxruntime_test_python_tvm.py +++ /dev/null @@ -1,242 +0,0 @@ -# ------------------------------------------------------------------------- -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License. See License.txt in the project root for -# license information. -# -------------------------------------------------------------------------- -""" -Module for unit testing of TVM EP -""" - -import os -import sys -import tempfile -import unittest -from typing import Any, AnyStr, Dict, List, Tuple - -import numpy -import tvm -from numpy.testing import assert_almost_equal -from onnx import ModelProto, TensorProto, mapping -from onnx.helper import make_graph, make_model, make_node, make_tensor_value_info - -import onnxruntime - -numpy.random.seed(32) - - -def is_windows(): - """ - Function to determine the Windows system - """ - return sys.platform.startswith("win") - - -def get_model_with_dynamic_shapes() -> ModelProto: - """ - Create model with Dynamic Shapes - """ - x = make_tensor_value_info("X", TensorProto.FLOAT, [None, None]) # pylint: disable=invalid-name, no-member - a = make_tensor_value_info("A", TensorProto.FLOAT, [None, None]) # pylint: disable=invalid-name, no-member - b = make_tensor_value_info("B", TensorProto.FLOAT, [None, None]) # pylint: disable=invalid-name, no-member - y = make_tensor_value_info("Y", TensorProto.FLOAT, [None, None]) # pylint: disable=invalid-name, no-member - node1 = make_node("MatMul", ["X", "A"], ["XA"]) - node2 = make_node("Add", ["XA", "B"], ["Y"]) - graph = make_graph([node1, node2], "lr", [x, a, b], [y]) - onnx_model = make_model(graph) - return onnx_model - - -def get_model_with_fixed_shapes() -> ModelProto: - """ - Create model with Static Shapes - """ - - def change_input_shape(model: ModelProto, ind: int, shape: Tuple) -> None: - """ - Function to change the input form - """ - dims = model.graph.input[ind].type.tensor_type.shape.dim - assert len(dims) == len(shape), "Input rank and new shape rank do not match." - for i, new_dim in enumerate(shape): - model.graph.input[ind].type.tensor_type.shape.dim[i].dim_value = new_dim - - dynamic_model = get_model_with_dynamic_shapes() - change_input_shape(dynamic_model, 0, (1, 2)) # X - change_input_shape(dynamic_model, 1, (2, 2)) # A - change_input_shape(dynamic_model, 2, (1, 2)) # B - return dynamic_model - - -def get_input_data_for_model_with_dynamic_shapes() -> Dict[AnyStr, numpy.ndarray]: - """ - Create input data for model with dynamic shapes - """ - a = numpy.random.randn(2, 2).astype(numpy.float32) # pylint: disable=invalid-name - b = numpy.random.randn(1, 2).astype(numpy.float32) # pylint: disable=invalid-name - x = numpy.random.randn(1, 2).astype(numpy.float32) # pylint: disable=invalid-name - data = {"A": a, "B": b, "X": x} - return data - - -def get_input_data_for_model_with_fixed_shapes(onnx_model: ModelProto) -> Dict[AnyStr, numpy.ndarray]: - """ - Create input data for model with static shapes - """ - - def get_onnx_input_names(model: ModelProto) -> List[AnyStr]: - inputs = [node.name for node in model.graph.input] - initializer = [node.name for node in model.graph.initializer] - inputs = list(set(inputs) - set(initializer)) - return sorted(inputs) - - def get_onnx_input_types(model: ModelProto) -> List[numpy.dtype]: - input_names = get_onnx_input_names(model) - return [ - mapping.TENSOR_TYPE_TO_NP_TYPE[node.type.tensor_type.elem_type] - for node in sorted(model.graph.input, key=lambda node: node.name) - if node.name in input_names - ] - - def get_onnx_input_shapes(model: ModelProto) -> List[List[int]]: - input_names = get_onnx_input_names(model) - return [ - [dv.dim_value for dv in node.type.tensor_type.shape.dim] - for node in sorted(model.graph.input, key=lambda node: node.name) - if node.name in input_names - ] - - input_names = get_onnx_input_names(onnx_model) - input_shapes = get_onnx_input_shapes(onnx_model) - input_types = get_onnx_input_types(onnx_model) - assert len(input_names) == len(input_types) == len(input_shapes) - random_inputs = [numpy.random.uniform(size=shape).astype(dtype) for shape, dtype in zip(input_shapes, input_types)] - return dict(zip(input_names, random_inputs)) - - -def get_input_names_and_shapes(data: Dict[AnyStr, numpy.ndarray]) -> Tuple[List[AnyStr], List[AnyStr]]: - """ - Create text representations for model input names and shapes - """ - keys = list(data.keys()) - values = [data[key] for key in keys] - return ( - list(data.keys()), - [str(value.shape).replace(",", "").replace("(", "[").replace(")", "]") for value in values], - ) - - -def get_cpu_output(onnx_model: ModelProto, data: Dict[AnyStr, numpy.ndarray]) -> List[numpy.ndarray]: - """ - Run inference with CPUExecutionProvider - """ - # pylint: disable=no-member - sess = onnxruntime.InferenceSession( - onnx_model.SerializeToString(), - providers=["CPUExecutionProvider"], - ) - output = sess.run(None, data) - return output - - -def get_tvm_output( - onnx_model: ModelProto, data: Dict[AnyStr, numpy.ndarray], provider_options: Dict[AnyStr, Any] -) -> List[numpy.ndarray]: - """ - Run inference with TVMExecutionProvider - """ - session_options = onnxruntime.SessionOptions() # pylint: disable=no-member - session_options.log_severity_level = 0 - session_options.log_verbosity_level = 0 - # pylint: disable=no-member - session_options.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_DISABLE_ALL - - sess = onnxruntime.InferenceSession( - onnx_model.SerializeToString(), - session_options, - providers=["TvmExecutionProvider"], - provider_options=[provider_options], - ) - - output = sess.run(None, data) - return output - - -# pylint: disable=no-member -def compile_virtual_machine(model: ModelProto, target_str: AnyStr) -> tvm.runtime.vm.Executable: - """ - Compile ONNX model using VirtualMachine - """ - ir_mod, _ = tvm.relay.frontend.from_onnx( - model, - opset=model.opset_import[0].version, - freeze_params=True, - ) - target = tvm.target.Target(target=target_str, host=target_str) - return tvm.relay.backend.vm.compile(ir_mod, target) - - -def serialize_virtual_machine(vm_exec: tvm.runtime.vm.Executable) -> AnyStr: - """ - Serialize VirtualMachine - """ - temp_directory = tempfile.mkdtemp() - path_consts = os.path.join(temp_directory, "consts") - vm_exec.move_late_bound_consts(path_consts, byte_limit=256) - lib_path = os.path.join(temp_directory, f"model.{'dll' if is_windows() else 'so'}") - code_path = os.path.join(temp_directory, "model.ro") - code, lib = vm_exec.save() - lib.export_library(lib_path) - with open(code_path, "wb") as code_file: - code_file.write(code) - return temp_directory - - -class TestTVM(unittest.TestCase): - """ - Unit tests for TVM EP - """ - - @staticmethod - def test_accuracy_for_model_with_dynamic_shapes(): - """ - Accuracy test for model with dynamic shapes - """ - onnx_model = get_model_with_dynamic_shapes() - data = get_input_data_for_model_with_dynamic_shapes() - - cpu_output = get_cpu_output(onnx_model, data) - names, shapes = get_input_names_and_shapes(data) - provider_options = dict( - target="llvm", - input_names=" ".join(names), - input_shapes=" ".join(shapes), - ) - tvm_output = get_tvm_output(onnx_model, data, provider_options) - - assert_almost_equal(cpu_output, tvm_output, decimal=5) - - @staticmethod - def test_accuracy_for_tvm_so(): - """ - Accuracy test for TVMso Ep - """ - onnx_model = get_model_with_fixed_shapes() - data = get_input_data_for_model_with_fixed_shapes(onnx_model) - - cpu_output = get_cpu_output(onnx_model, data) - - compiled_vm_exec = compile_virtual_machine(onnx_model, target_str="llvm") - so_folder = serialize_virtual_machine(compiled_vm_exec) - provider_options = dict( - target="llvm", - so_folder=so_folder, - ) - tvm_output = get_tvm_output(onnx_model, data, provider_options) - - assert_almost_equal(cpu_output, tvm_output, decimal=5) - - -if __name__ == "__main__": - if "TvmExecutionProvider" not in onnxruntime.get_available_providers(): - raise AssertionError(f"Unable to find 'TvmExecutionProvider' in {onnxruntime.get_available_providers()}") - unittest.main() diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index cb53db42304be..3519c5d72c060 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -188,14 +188,6 @@ std::unique_ptr DnnlExecutionProviderWithOptions(const OrtDn return nullptr; } -// std::unique_ptr DefaultTvmExecutionProvider() { -// #ifdef USE_TVM -// return TVMProviderFactoryCreator::Create("")->CreateProvider(); -// #else -// return nullptr; -// #endif -// } - std::unique_ptr DefaultNnapiExecutionProvider() { // The NNAPI EP uses a stub implementation on non-Android platforms so cannot be used to execute a model. // Manually append an NNAPI EP instance to the session to unit test the GetCapability and Compile implementation. diff --git a/onnxruntime/test/util/include/default_providers.h b/onnxruntime/test/util/include/default_providers.h index ed95bf67f1ffb..9b44150d972db 100644 --- a/onnxruntime/test/util/include/default_providers.h +++ b/onnxruntime/test/util/include/default_providers.h @@ -20,7 +20,6 @@ std::shared_ptr CreateExecutionProviderFactory_Dnnl(c std::shared_ptr CreateExecutionProviderFactory_MIGraphX(const OrtMIGraphXProviderOptions* params); std::shared_ptr CreateExecutionProviderFactory_Nnapi( uint32_t flags, const optional& partitioning_stop_ops_list); -// std::shared_ptr CreateExecutionProviderFactory_Tvm(const char*); std::shared_ptr CreateExecutionProviderFactory_VSINPU(); std::shared_ptr CreateExecutionProviderFactory_Rknpu(); std::shared_ptr CreateExecutionProviderFactory_Rocm(const OrtROCMProviderOptions* provider_options); diff --git a/onnxruntime/test/util/include/providers.h b/onnxruntime/test/util/include/providers.h index a73b237ae10df..01be1a444646b 100644 --- a/onnxruntime/test/util/include/providers.h +++ b/onnxruntime/test/util/include/providers.h @@ -7,9 +7,6 @@ #ifdef USE_DNNL #include "core/providers/dnnl/dnnl_provider_factory.h" #endif -#ifdef USE_TVM -#include "core/providers/tvm/tvm_provider_factory.h" -#endif #ifdef USE_OPENVINO #include "core/providers/openvino/openvino_provider_factory.h" #endif diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index 24dc6124d4a89..aa1198102f978 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -584,13 +584,7 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--use_preinstalled_eigen", action="store_true", help="Use pre-installed Eigen.") parser.add_argument("--eigen_path", help="Path to pre-installed Eigen.") parser.add_argument("--enable_msinternal", action="store_true", help="Enable for Microsoft internal builds only.") - parser.add_argument("--llvm_path", help="Path to llvm dir") parser.add_argument("--use_vitisai", action="store_true", help="Build with Vitis-AI") - parser.add_argument("--use_tvm", action="store_true", help="Build with TVM") - parser.add_argument("--tvm_cuda_runtime", action="store_true", default=False, help="Build TVM with CUDA support") - parser.add_argument( - "--use_tvm_hash", action="store_true", help="Build ipp-crypto for hash generation. It is used by TVM EP only" - ) parser.add_argument("--use_tensorrt", action="store_true", help="Build with TensorRT") parser.add_argument( "--use_tensorrt_builtin_parser", action="store_true", default=True, help="Use TensorRT builtin parser" @@ -602,12 +596,6 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--migraphx_home", help="Path to MIGraphX installation dir") parser.add_argument("--use_full_protobuf", action="store_true", help="Use the full protobuf library") - parser.add_argument( - "--llvm_config", - type=str, - default="", - help="Path to llvm-config.exe for LLVM built from sources. It is strongly needed for build on Windows", - ) parser.add_argument( "--skip_onnx_tests", action="store_true", @@ -1031,16 +1019,11 @@ def generate_build_tree( "-Donnxruntime_USE_NNAPI_BUILTIN=" + ("ON" if args.use_nnapi else "OFF"), "-Donnxruntime_USE_VSINPU=" + ("ON" if args.use_vsinpu else "OFF"), "-Donnxruntime_USE_RKNPU=" + ("ON" if args.use_rknpu else "OFF"), - "-Donnxruntime_USE_LLVM=" + ("ON" if args.use_tvm else "OFF"), "-Donnxruntime_ENABLE_MICROSOFT_INTERNAL=" + ("ON" if args.enable_msinternal else "OFF"), "-Donnxruntime_USE_VITISAI=" + ("ON" if args.use_vitisai else "OFF"), "-Donnxruntime_USE_TENSORRT=" + ("ON" if args.use_tensorrt else "OFF"), "-Donnxruntime_USE_TENSORRT_BUILTIN_PARSER=" + ("ON" if args.use_tensorrt_builtin_parser and not args.use_tensorrt_oss_parser else "OFF"), - # set vars for TVM - "-Donnxruntime_USE_TVM=" + ("ON" if args.use_tvm else "OFF"), - "-Donnxruntime_TVM_CUDA_RUNTIME=" + ("ON" if args.use_tvm and args.tvm_cuda_runtime else "OFF"), - "-Donnxruntime_TVM_USE_HASH=" + ("ON" if args.use_tvm_hash else "OFF"), # set vars for migraphx "-Donnxruntime_USE_MIGRAPHX=" + ("ON" if args.use_migraphx else "OFF"), "-Donnxruntime_DISABLE_CONTRIB_OPS=" + ("ON" if args.disable_contrib_ops else "OFF"), @@ -1172,8 +1155,6 @@ def generate_build_tree( cmake_args.append("-Donnxruntime_ROCM_VERSION=" + args.rocm_version) if args.use_tensorrt: cmake_args.append("-Donnxruntime_TENSORRT_HOME=" + tensorrt_home) - if args.llvm_config: - cmake_args.append("-Donnxruntime_TVM_USE_LLVM=" + args.llvm_config) if args.use_cuda: add_default_definition(cmake_extra_defines, "onnxruntime_USE_CUDA", "ON") @@ -1256,9 +1237,6 @@ def generate_build_tree( if args.use_full_protobuf or args.use_openvino or args.use_vitisai or args.gen_doc: cmake_args += ["-Donnxruntime_USE_FULL_PROTOBUF=ON", "-DProtobuf_USE_STATIC_LIBS=ON"] - if args.use_tvm and args.llvm_path is not None: - cmake_args += [f"-DLLVM_DIR={args.llvm_path}"] - if args.use_cuda and not is_windows(): nvml_stub_path = cuda_home + "/lib64/stubs" cmake_args += ["-DCUDA_CUDA_LIBRARY=" + nvml_stub_path] @@ -1659,16 +1637,6 @@ def generate_build_tree( cxxflags = cflags.copy() config_build_dir = get_config_build_dir(build_dir, config) os.makedirs(config_build_dir, exist_ok=True) - if args.use_tvm: - os.environ["PATH"] = ( - os.path.join(config_build_dir, "_deps", "tvm-build") - + os.pathsep - + os.path.join(config_build_dir, "_deps", "tvm-src") - + os.pathsep - + os.path.dirname(sys.executable) - + os.pathsep - + os.environ["PATH"] - ) preinstalled_dir = Path(build_dir) / config temp_cmake_args = cmake_args.copy() if cflags is not None and cxxflags is not None and len(cflags) != 0 and len(cxxflags) != 0: @@ -2097,8 +2065,6 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): if args.enable_pybind: python_path = None - if args.use_tvm: - python_path = str((Path(build_dir) / config / "_deps" / "tvm-src" / "python").resolve()) # Disable python tests in a reduced build as we don't know which ops have been included and which # models can run. @@ -2221,17 +2187,6 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): run_subprocess([sys.executable, "onnxruntime_test_python_keras.py"], cwd=cwd, dll_path=dll_path) -def tvm_run_python_tests(build_dir, configs): - for config in configs: - cwd = get_config_build_dir(build_dir, config) - if is_windows(): - cwd = os.path.join(cwd, config) - python_path = os.path.join(build_dir, config, "_deps", "tvm-src", "python") - run_subprocess( - [sys.executable, "onnxruntime_test_python_tvm.py"], cwd=cwd, python_path=os.path.abspath(python_path) - ) - - def run_nodejs_tests(nodejs_binding_dir): args = ["npm", "test", "--", "--timeout=90000"] if is_windows(): @@ -2251,7 +2206,6 @@ def build_python_wheel( use_dnnl, use_tensorrt, use_openvino, - use_tvm, use_vitisai, use_acl, use_armnn, @@ -2304,8 +2258,6 @@ def build_python_wheel( args.append("--use_openvino") elif use_dnnl: args.append("--use_dnnl") - elif use_tvm: - args.append("--use_tvm") elif use_vitisai: args.append("--use_vitisai") elif use_acl: @@ -2334,7 +2286,6 @@ def build_nuget_package( use_openvino, use_tensorrt, use_dnnl, - use_tvm, use_winml, use_qnn, enable_training_apis, @@ -2381,9 +2332,6 @@ def build_nuget_package( package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Gpu" elif use_rocm: package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.ROCm" - elif use_tvm: - execution_provider = "/p:ExecutionProvider=tvm" - package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.Tvm" elif use_qnn: execution_provider = "/p:ExecutionProvider=qnn" package_name = "/p:OrtPackageId=Microsoft.ML.OnnxRuntime.QNN" @@ -2625,7 +2573,7 @@ def main(): if args.use_tensorrt: args.use_cuda = True - if args.build_wheel or args.gen_doc or args.use_tvm or args.enable_training: + if args.build_wheel or args.gen_doc or args.enable_training: args.enable_pybind = True if ( @@ -2907,12 +2855,6 @@ def main(): run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs) - # TODO(agladyshev): - # to support Windows, we need to update .github/workflows/windows.yml - # and add to the PATH variable the following value: C:Program Files\LLVM\bin - if args.enable_pybind and args.use_tvm and not is_windows(): - tvm_run_python_tests(build_dir, configs) - # run node.js binding tests if args.build_nodejs and not args.skip_nodejs_tests: nodejs_binding_dir = os.path.normpath(os.path.join(source_dir, "js", "node")) @@ -2940,7 +2882,6 @@ def main(): args.use_dnnl, args.use_tensorrt, args.use_openvino, - args.use_tvm, args.use_vitisai, args.use_acl, args.use_armnn, @@ -2968,7 +2909,6 @@ def main(): args.use_openvino, args.use_tensorrt, args.use_dnnl, - args.use_tvm, args.use_winml, args.use_qnn, args.enable_training_apis, diff --git a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml deleted file mode 100644 index 5f073433265fa..0000000000000 --- a/tools/ci_build/github/azure-pipelines/templates/orttraining-linux-gpu-test-ci-pipeline.yml +++ /dev/null @@ -1,41 +0,0 @@ -parameters: -- name: DockerImageTag - type: string -- name: BuildConfig - type: string - -steps: - -- template: jobs/download_training_test_data.yml - - # Entry point for all ORTModule tests - # The onnxruntime folder is deleted in the build directory - # to enforce use of the onnxruntime wheel - # Uninstall orttraining requirements.txt and install ortmodule requirements.txt before running tests. -- script: | - docker run \ - --gpus all \ - --shm-size=1024m \ - --rm \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \ - --volume $(Agent.TempDirectory)/mnist:/mnist \ - ${{ parameters.DockerImageTag }} \ - bash -c "rm -rf /build/onnxruntime/ && python3 -m pip show torch && python3 -m pip install torch==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118 && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install && /build/launch_test.py --cmd_line_with_args 'python orttraining_ortmodule_tests.py --mnist /mnist --bert_data /bert_data/hf_data/glue_data/CoLA/original/raw' --cwd /build" \ - displayName: 'Run orttraining_ortmodule_tests.py' - condition: succeededOrFailed() - timeoutInMinutes: 60 - -# Entry point for all ort training api tests -- script: | - docker run \ - --gpus all \ - --shm-size=1024m \ - --rm \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory)/${{ parameters.BuildConfig }}:/build \ - ${{ parameters.DockerImageTag }} \ - bash -c "rm -rf /build/onnxruntime/ && python3 -m pip install /build/dist/onnxruntime*.whl && python3 -m pip install torch==2.3.1+cu118 --index-url https://download.pytorch.org/whl/cu118 && /build/launch_test.py --cmd_line_with_args 'python orttraining_test_ort_apis.py --cwd /build' --cwd /build" \ - displayName: 'Run ORT Training APIs Tests' - condition: succeededOrFailed() - timeoutInMinutes: 120 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml deleted file mode 100644 index fc163d17e44a9..0000000000000 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml +++ /dev/null @@ -1,209 +0,0 @@ -parameters: - build_py_parameters: '' - torch_version: '' - opset_version: '' - cuda_version: '' - cmake_cuda_architectures: '' - docker_file: '' - upload_wheel: '' - debug_build: '' - python_version: '' - stage_name: '' - SpecificArtifact: false - BuildId: '0' - build_pool_name: '' - -stages: - - stage: Build_${{ parameters.stage_name }} - variables: - - name: isMain - value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }} - - name: finalStorage - ${{ if eq(variables['isMain'], 'true') }}: - value: '--final_storage' - ${{ else }}: - value: '' - - name: buildConfig - ${{ if eq(parameters['debug_build'], 'true') }}: - value: 'Debug' - ${{ else }}: - value: 'Release' - - name: PythonVersion - value: ${{ parameters.python_version }} - - name: Repository - value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }} - dependsOn: [] - - jobs: - - job: Build - pool: ${{ parameters.build_pool_name }} - timeoutInMinutes: 180 - steps: - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - - task: CmdLine@2 - displayName: 'check variables' - inputs: - script: | - echo "Branch is "${{ variables['Build.SourceBranch'] }} && \ - echo "isMain is "${{ variables['isMain'] }} && \ - echo "final_storage is "${{ variables['finalStorage'] }} - - - checkout: self - clean: true - submodules: recursive - - - template: set-python-manylinux-variables-step.yml - - - template: get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }} - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: >- - --build-arg TORCH_VERSION=${{ parameters.torch_version }} - --build-arg OPSET_VERSION=${{ parameters.opset_version }} - --build-arg PYTHON_VERSION=${{ parameters.python_version }} - --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu - --build-arg BUILD_UID=$(id -u) - Repository: $(Repository) - - - task: CmdLine@2 - displayName: 'build onnxruntime' - inputs: - script: | - set -e -x - mkdir -p $HOME/.onnx - docker run --rm \ - --volume /data/onnx:/data/onnx:ro \ - --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory):/build \ - --volume /data/models:/build/models:ro \ - --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ - -e NIGHTLY_BUILD \ - -e DEFAULT_TRAINING_PACKAGE_DEVICE \ - -e BUILD_BUILDNUMBER \ - -e ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION \ - $(Repository) \ - $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ - --build_dir /build \ - --config ${{ variables['buildConfig'] }} \ - --skip_submodule_sync \ - --parallel --use_binskim_compliant_compile_flags \ - --build_wheel \ - --enable_onnx_tests \ - ${{ parameters.build_py_parameters }} \ - --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' onnxruntime_BUILD_UNIT_TESTS=OFF \ - --use_cuda --cuda_version=${{ parameters.cuda_version }} --cuda_home=/usr/local/cuda-${{ parameters.cuda_version }} --cudnn_home=/usr/local/cuda-${{ parameters.cuda_version }}; - workingDirectory: $(Build.SourcesDirectory) - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)' - Contents: "${{ variables['buildConfig'] }}/dist/*.whl" - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation' - inputs: - ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}" - - - template: component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' - - - template: clean-agent-build-directory-step.yml - - - stage: Test_${{ parameters.stage_name }} - variables: - - name: isMain - value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }} - - name: finalStorage - ${{ if eq(variables['isMain'], 'true') }}: - value: '--final_storage' - ${{ else }}: - value: '' - - name: buildConfig - ${{ if eq(parameters['debug_build'], 'true') }}: - value: 'Debug' - ${{ else }}: - value: 'Release' - - name: PythonVersion - value: ${{ parameters.python_version }} - - name: Repository - value: onnxruntimetraininggpubuild_cu${{ replace(parameters.cuda_version, '.', '') }}_py${{ replace(parameters.python_version, '.', '') }} - - name: UploadWheel - value: ${{ parameters.upload_wheel }} - dependsOn: Build_${{ parameters.stage_name }} - jobs: - - job: Test_GPU - pool: Onnxruntime-Linux-GPU - steps: - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() - - - checkout: self - clean: true - submodules: none - - - template: jobs/download_training_test_data.yml - - - template: set-python-manylinux-variables-step.yml - - - template: flex-downloadPipelineArtifact.yml - parameters: - ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}" - StepName: 'Download Pipeline Artifact - Linux Training Build' - TargetPath: '$(Build.ArtifactStagingDirectory)' - SpecificArtifact: ${{ parameters.SpecificArtifact }} - BuildId: ${{ parameters.BuildId }} - - - script: | - set -e -x - whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \ - echo $whlfilename ; du -sh $whlfilename ; \ - (( $(wc -c < "$whlfilename") - 400*1024*1024 < 0 )) || ( echo 'Wheel size bigger than 400M'; exit 1) - displayName: 'Check wheel size' - continueOnError: true - - - template: get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }} - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: >- - --build-arg TORCH_VERSION=${{ parameters.torch_version }} - --build-arg OPSET_VERSION=${{ parameters.opset_version }} - --build-arg PYTHON_VERSION=${{ parameters.python_version }} - --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu - --build-arg BUILD_UID=$(id -u) - Repository: $(Repository) - - - task: CmdLine@2 - displayName: 'test ortmodule' - inputs: - script: | - set -ex ; \ - whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \ - echo $whlfilename ; \ - basefilename=$(basename $whlfilename) ; \ - docker run --rm \ - --gpus all \ - -e NVIDIA_VISIBLE_DEVICES=all \ - --volume $(Build.ArtifactStagingDirectory):/build \ - --volume $(Agent.TempDirectory)/MNIST:/mnist \ - $(Repository) \ - bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/$basefilename && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install " ; - workingDirectory: $(Build.SourcesDirectory) - - - task: CmdLine@2 - displayName: 'Upload wheel' - condition: and(succeeded(), and(eq(variables['UploadWheel'], 'yes'), ne(variables['ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION'], 'true'))) - inputs: - script: | - set -e -x - whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \ - python3 tools/ci_build/upload_python_package_to_azure_storage.py \ - --python_wheel_path $whlfilename ${{ variables['finalStorage'] }} diff --git a/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh index 7f3160371aa24..87b9b960b7ebc 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_os_deps.sh @@ -12,7 +12,6 @@ d) DEVICE_TYPE=${OPTARG};; v) echo "Cuda version is no longer accepted as an input to this script. Ignoring the input argument -v.";; t) echo "Installing python training dependencies argument is no longer accepted as an input to this script. Ignoring the input argument -t.";; m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;; -u) echo "Installing ortmodule python dependencies argument is no longer accepted as an input to this script. Ignoring the input argument -u.";; r) echo "Installing ROCM python dependencies argument is no longer accepted as an input to this script. Ignoring the input argument -r.";; esac done diff --git a/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh b/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh index 1ac1d226deec6..2d7acd1f701ff 100755 --- a/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh +++ b/tools/ci_build/github/linux/docker/scripts/install_python_deps.sh @@ -3,7 +3,6 @@ set -e -x INSTALL_DEPS_TRAINING=false INSTALL_DEPS_DISTRIBUTED_SETUP=false -ORTMODULE_BUILD=false TARGET_ROCM=false CU_VER="11.8" TORCH_VERSION='2.0.0' @@ -18,7 +17,6 @@ d) DEVICE_TYPE=${OPTARG};; v) CU_VER=${OPTARG};; t) INSTALL_DEPS_TRAINING=true;; m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;; -u) ORTMODULE_BUILD=true;; r) TARGET_ROCM=true;; c) USE_CONDA=true;; esac @@ -55,17 +53,3 @@ fi export ONNX_ML=1 export CMAKE_ARGS="-DONNX_GEN_PB_TYPE_STUBS=OFF -DONNX_WERROR=OFF" ${PYTHON_EXE} -m pip install -r ${0/%install_python_deps\.sh/requirements\.txt} -if [ $DEVICE_TYPE = "gpu" ]; then - if [[ $INSTALL_DEPS_TRAINING = true ]]; then - if [[ $ORTMODULE_BUILD = false ]]; then - ${PYTHON_EXE} -m pip install -r ${0/%install_python_deps.sh/training\/requirements.txt} - else - if [[ $TARGET_ROCM = false ]]; then - ${PYTHON_EXE} -m pip install -r ${0/%install_python_deps.sh/training\/ortmodule\/stage1\/requirements_torch${TORCH_VERSION}_cu${CU_VER}\/requirements.txt} - ${PYTHON_EXE} -m pip install -r ${0/%install_python_deps.sh/training\/ortmodule\/stage2\/requirements.txt} - else - ${PYTHON_EXE} -m pip install -r ${0/%install_python_deps.sh/training\/ortmodule\/stage1\/requirements_rocm\/requirements.txt} - fi - fi - fi -fi diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt deleted file mode 100644 index 051f42dac335d..0000000000000 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_rocm/requirements.txt +++ /dev/null @@ -1,2 +0,0 @@ -numpy==1.21.6 ; python_version < '3.9' -numpy==2.1.2 ; python_version >= '3.9' diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.0.0_cu11.8/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.0.0_cu11.8/requirements.txt deleted file mode 100644 index b3b2651c8d26d..0000000000000 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.0.0_cu11.8/requirements.txt +++ /dev/null @@ -1,8 +0,0 @@ ---pre --f https://download.pytorch.org/whl/torch_stable.html -torch==2.0.0+cu118 -torchvision==0.15.1+cu118 -torchtext==0.15.1 -# TODO(bmeswani): packaging 22.0 removes support for LegacyVersion leading to errors because transformers 4.4.2 uses LegacyVersion -packaging==21.3 -setuptools>=68.2.2 diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt deleted file mode 100644 index 152a17db90366..0000000000000 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch2.1.0_cu12.2/requirements.txt +++ /dev/null @@ -1,7 +0,0 @@ ---pre --f https://download.pytorch.org/whl/torch_stable.html -torch==2.1.0+cu121 -torchvision==0.16.0+cu121 -torchtext==0.16.0 -packaging==23.1 -setuptools>=68.2.2 diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt deleted file mode 100644 index 846f8c15b257d..0000000000000 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt +++ /dev/null @@ -1,3 +0,0 @@ --f https://download.pytorch.org/whl/torch_stable.html -torch==2.3.0+cpu -setuptools>=68.2.2 diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt deleted file mode 100644 index 01fa7b0ff956e..0000000000000 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_nightly/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -scikit-learn -packaging==21.3 -transformers==v4.36.0 -accelerate==0.25.0 -wget diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/torch_eager_cpu/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/torch_eager_cpu/requirements.txt deleted file mode 100644 index 6346c54decf9c..0000000000000 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/torch_eager_cpu/requirements.txt +++ /dev/null @@ -1,11 +0,0 @@ ---pre --f https://download.pytorch.org/whl/torch_stable.html -torch==2.2.0 -setuptools>=68.2.2 -cerberus -h5py -scikit-learn -numpy==1.21.6 ; python_version < '3.9' -numpy==2.1.2 ; python_version >= '3.9' -pandas -parameterized diff --git a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt b/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt deleted file mode 100644 index dd86b32f88c76..0000000000000 --- a/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage2/requirements.txt +++ /dev/null @@ -1,15 +0,0 @@ -pandas -scikit-learn -numpy==1.21.6 ; python_version < '3.9' -numpy==2.1.2 ; python_version >= '3.9' -transformers==v4.36.0 -accelerate==0.25.0 -rsa==4.9 -tensorboard==2.13.0 -h5py -wget -pytorch-lightning==2.3.3 -deepspeed==0.9.0 -fairscale==0.4.6 -parameterized>=0.8.1 -pydantic<2.0.0 diff --git a/tools/ci_build/github/linux/run_dockerbuild.sh b/tools/ci_build/github/linux/run_dockerbuild.sh index 9944861f519f4..6618810c77f6d 100755 --- a/tools/ci_build/github/linux/run_dockerbuild.sh +++ b/tools/ci_build/github/linux/run_dockerbuild.sh @@ -15,10 +15,6 @@ BUILD_DIR=$BUILD_BINARIESDIRECTORY YOCTO_VERSION="4.19" #Training only INSTALL_DEPS_DISTRIBUTED_SETUP=false -#Training only -ORTMODULE_BUILD=false -#Training only -USE_CONDA=false ALLOW_RELEASED_ONNX_OPSET_ONLY_ENV="ALLOW_RELEASED_ONNX_OPSET_ONLY="$ALLOW_RELEASED_ONNX_OPSET_ONLY echo "ALLOW_RELEASED_ONNX_OPSET_ONLY environment variable is set as $ALLOW_RELEASED_ONNX_OPSET_ONLY_ENV" @@ -44,10 +40,6 @@ t) EXTRA_IMAGE_TAG=${OPTARG};; i) IMAGE_CACHE_CONTAINER_REGISTRY_NAME=${OPTARG};; # install distributed setup dependencies m) INSTALL_DEPS_DISTRIBUTED_SETUP=true;; -# install ortmodule specific dependencies -u) ORTMODULE_BUILD=true;; -# install and use conda -e) USE_CONDA=true;; *) echo "Invalid option";; esac done @@ -82,24 +74,6 @@ if [ $BUILD_OS = "yocto" ]; then $GET_DOCKER_IMAGE_CMD --repository "onnxruntime-$IMAGE" \ --docker-build-args="--build-arg TOOL_CHAIN=$TOOL_CHAIN_SCRIPT --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER}" \ --dockerfile $DOCKER_FILE --context . -elif [ $BUILD_DEVICE = "gpu" ]; then - # This code path is only for training. Inferecing pipeline uses CentOS - IMAGE="$BUILD_OS-gpu_training" - # Current build script doesn't support building shared lib with Python dependency. To enable building with PythonOp, - # We need to avoid `--no-undefined` when building shared lib (Otherwise, CIs will report `undefined symbols`), but removing that would bring some other concerns. - # Plus the fact training did not need build shared library, we disable the --build_shared_lib for training CIs. - NEED_BUILD_SHARED_LIB=false - INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -t" - if [[ $INSTALL_DEPS_DISTRIBUTED_SETUP = true ]]; then - INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -m" - fi - if [[ $ORTMODULE_BUILD = true ]]; then - INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -u" - fi - INSTALL_DEPS_EXTRA_ARGS="${INSTALL_DEPS_EXTRA_ARGS} -v 11.8" - $GET_DOCKER_IMAGE_CMD --repository "onnxruntime-$IMAGE" \ - --docker-build-args="--build-arg BASEIMAGE=nvcr.io/nvidia/cuda:11.8.0-cudnn8-devel-${BUILD_OS} --build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg INSTALL_DEPS_EXTRA_ARGS=\"${INSTALL_DEPS_EXTRA_ARGS}\" --build-arg USE_CONDA=${USE_CONDA} --network=host" \ - --dockerfile Dockerfile.ubuntu_gpu_training --context . elif [[ $BUILD_DEVICE = "openvino"* ]]; then BUILD_ARGS="--build-arg BUILD_USER=onnxruntimedev --build-arg BUILD_UID=$(id -u) --build-arg PYTHON_VERSION=${PYTHON_VER} --build-arg OPENVINO_VERSION=${OPENVINO_VERSION} --build-arg UBUNTU_VERSION=${UBUNTU_VERSION}" IMAGE="$BUILD_OS-openvino" diff --git a/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh b/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh deleted file mode 100755 index fb4dbeb2e73d3..0000000000000 --- a/tools/ci_build/github/pai/pai_huggingface_bert_large_test.sh +++ /dev/null @@ -1,43 +0,0 @@ -#!/bin/bash - -set -ex - -usage() { echo "Usage: $0 [-v ]" 1>&2; exit 1; } - -while getopts "v:" parameter_Option -do case "${parameter_Option}" -in -v) ROCM_VERSION=${OPTARG};; -*) usage ;; -esac -done - -MI200_DEVICE_NUMBERS=$(rocm-smi --showproductname | grep -c "MI250" | xargs) - -if [ "$MI200_DEVICE_NUMBERS" -gt "0" ]; then - RESULT_FILE=ci-mi200.huggingface.bert-large-rocm${ROCM_VERSION}.json -else - RESULT_FILE=ci-mi100.huggingface.bert-large-rocm${ROCM_VERSION}.json -fi - -python \ - /stage/huggingface-transformers/examples/pytorch/language-modeling/run_mlm.py \ - --model_name_or_path bert-large-uncased \ - --dataset_name wikitext \ - --dataset_config_name wikitext-2-raw-v1 \ - --do_train \ - --max_steps 260 \ - --logging_steps 20 \ - --output_dir ./test-mlm-bbu \ - --overwrite_output_dir \ - --per_device_train_batch_size 8 \ - --fp16 \ - --dataloader_num_workers 1 \ - --ort \ - --skip_memory_metrics - -cat ci-pipeline-actual.json - -python /onnxruntime_src/orttraining/tools/ci_test/compare_huggingface.py \ - ci-pipeline-actual.json \ - /onnxruntime_src/orttraining/tools/ci_test/results/"$RESULT_FILE" diff --git a/tools/ci_build/set-trigger-rules.py b/tools/ci_build/set-trigger-rules.py index 0e9cd514d8aa5..b46d1e2559e46 100644 --- a/tools/ci_build/set-trigger-rules.py +++ b/tools/ci_build/set-trigger-rules.py @@ -30,14 +30,9 @@ "mac-ios-ci-pipeline.yml", "mac-ios-packaging-pipeline.yml", "mac-react-native-ci-pipeline.yml", - "orttraining-linux-ci-pipeline.yml", - "orttraining-linux-gpu-ci-pipeline.yml", - "orttraining-linux-gpu-ortmodule-distributed-test-ci-pipeline.yml", - "orttraining-mac-ci-pipeline.yml", "win-ci-pipeline.yml", "win-gpu-dml-ci-pipeline.yml", "win-gpu-cuda-ci-pipeline.yml", - "win-gpu-training-ci-pipeline.yml", "win-gpu-doc-gen-ci-pipeline.yml", "win-gpu-tensorrt-ci-pipeline.yml", "win-gpu-webgpu-ci-pipeline.yml", diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index ff6556b1fd31a..ba125f4e2d980 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -138,7 +138,7 @@ def parse_arguments(): required=False, default="None", type=str, - choices=["cuda", "dnnl", "openvino", "tensorrt", "snpe", "tvm", "qnn", "None"], + choices=["cuda", "dnnl", "openvino", "tensorrt", "snpe", "qnn", "None"], help="The selected execution provider for this build.", ) parser.add_argument("--sdk_info", required=False, default="", type=str, help="dependency SDK information.") @@ -375,13 +375,11 @@ def generate_files(line_list, args): "mklml": "mklml.dll", "openmp": "libiomp5md.dll", "dnnl": "dnnl.dll", - "tvm": "tvm.dll", "providers_shared_lib": "onnxruntime_providers_shared.dll", "dnnl_ep_shared_lib": "onnxruntime_providers_dnnl.dll", "tensorrt_ep_shared_lib": "onnxruntime_providers_tensorrt.dll", "openvino_ep_shared_lib": "onnxruntime_providers_openvino.dll", "cuda_ep_shared_lib": "onnxruntime_providers_cuda.dll", - "tvm_ep_shared_lib": "onnxruntime_providers_tvm.lib", "onnxruntime_perf_test": "onnxruntime_perf_test.exe", "onnx_test_runner": "onnx_test_runner.exe", } @@ -394,7 +392,6 @@ def generate_files(line_list, args): "mklml_1": "libmklml_gnu.so", "openmp": "libiomp5.so", "dnnl": "libdnnl.so.1", - "tvm": "libtvm.so.0.5.1", "providers_shared_lib": "libonnxruntime_providers_shared.so", "dnnl_ep_shared_lib": "libonnxruntime_providers_dnnl.so", "tensorrt_ep_shared_lib": "libonnxruntime_providers_tensorrt.so", @@ -456,14 +453,6 @@ def generate_files(line_list, args): + '" target="build\\native\\include" />' ) - if args.execution_provider == "tvm": - files_list.append( - "' - ) - if args.execution_provider == "openvino": files_list.append( "' ) - if args.execution_provider == "tvm": - files_list.append( - "' - ) - files_list.append( - "' - ) - - tvm_build_path = os.path.join(args.ort_build_path, args.build_config, "_deps", "tvm-build") - if is_windows(): - files_list.append( - "' - ) - else: - # TODO(agladyshev): Add support for Linux. - raise RuntimeError("Now only Windows is supported for TVM EP.") - if args.execution_provider == "rocm" or is_rocm_gpu_package and not is_ado_packaging_build: files_list.append( "" ) - # Process tvm dependency - if os.path.exists(os.path.join(args.native_build_path, nuget_dependencies["tvm"])): - files_list.append( - "" - ) - # Some tools to be packaged in nightly debug build only, should not be released # These are copied to the runtimes folder for convenience of loading with the dlls # NOTE: nuget gives a spurious error on linux if these aren't in a separate directory to the library so diff --git a/tools/scripts/python_test.sh b/tools/scripts/python_test.sh index d12f6e6d33772..53d350cf30611 100755 --- a/tools/scripts/python_test.sh +++ b/tools/scripts/python_test.sh @@ -13,9 +13,6 @@ echo Install Python Deps cp $src_dir/tools/ci_build/github/linux/docker/scripts/manylinux/requirements.txt $build_dir/requirements.txt python3 -m pip install -r $build_dir/requirements.txt -mkdir -p $build_dir/requirements_torch_cpu/ -cp $src_dir/tools/ci_build/github/linux/docker/scripts/training/ortmodule/stage1/requirements_torch_cpu/requirements.txt $build_dir/requirements_torch_cpu/requirements.txt -python3 -m pip install -r $build_dir/requirements_torch_cpu/requirements.txt python3 -m pip list | grep onnx echo Install $config python package @@ -23,6 +20,5 @@ rm -rf $build_dir/$config/onnxruntime $build_dir/$config/pybind11 python3 -m pip install $build_dir/$config/dist/*.whl echo Run $config unit tests -pushd $build_dir/$config/ -python3 $src_dir/tools/ci_build/build.py --build_dir $build_dir --cmake_generator Ninja --config $config --test --skip_submodule_sync --build_shared_lib --parallel --use_binskim_compliant_compile_flags --build_wheel --enable_onnx_tests --enable_transformers_tool_test --ctest_path "" -popd +cd $build_dir/$config/ +python3 $src_dir/tools/ci_build/build.py --build_dir $build_dir --cmake_generator Ninja --config $config --test --skip_submodule_sync --build_shared_lib --parallel --use_binskim_compliant_compile_flags --build_wheel --enable_onnx_tests --enable_transformers_tool_test From af0303f9b439f9e815c09226706cab66f8fa8a0c Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Tue, 19 Nov 2024 14:24:52 -0800 Subject: [PATCH 21/28] Simplify CPU allocator arena usage helper function, fix unit tests that check old ifdefs. (#22876) --- onnxruntime/core/framework/allocator_utils.cc | 4 ++-- onnxruntime/core/framework/allocator_utils.h | 4 ++-- .../core/providers/cpu/cpu_execution_provider.cc | 3 +-- onnxruntime/core/session/environment.cc | 5 +++-- onnxruntime/test/framework/allocator_test.cc | 11 +++++------ onnxruntime/test/framework/session_state_test.cc | 11 ++++++----- onnxruntime/test/framework/tensor_test.cc | 11 +++++------ 7 files changed, 24 insertions(+), 25 deletions(-) diff --git a/onnxruntime/core/framework/allocator_utils.cc b/onnxruntime/core/framework/allocator_utils.cc index 797b6e1606f97..edf965d3835b5 100644 --- a/onnxruntime/core/framework/allocator_utils.cc +++ b/onnxruntime/core/framework/allocator_utils.cc @@ -77,7 +77,7 @@ AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info) { } } -bool ShouldCpuAllocatorUseArena([[maybe_unused]] bool is_arena_requested) { +bool DoesCpuAllocatorSupportArenaUsage() { #if defined(USE_JEMALLOC) || defined(USE_MIMALLOC) // We use these allocators instead of the arena. return false; @@ -89,7 +89,7 @@ bool ShouldCpuAllocatorUseArena([[maybe_unused]] bool is_arena_requested) { if constexpr (sizeof(void*) == 4) { return false; } else { - return is_arena_requested; + return true; } #endif } diff --git a/onnxruntime/core/framework/allocator_utils.h b/onnxruntime/core/framework/allocator_utils.h index 4035a0cc349e4..bef0b7057a7f8 100644 --- a/onnxruntime/core/framework/allocator_utils.h +++ b/onnxruntime/core/framework/allocator_utils.h @@ -43,8 +43,8 @@ struct AllocatorCreationInfo { AllocatorPtr CreateAllocator(const AllocatorCreationInfo& info); /** - * Gets whether a CPU allocator should use an arena or not. + * Gets whether a CPU allocator supports arena usage. */ -bool ShouldCpuAllocatorUseArena(bool is_arena_requested); +bool DoesCpuAllocatorSupportArenaUsage(); } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc index d57c33ae965b1..65eeb4b84e193 100644 --- a/onnxruntime/core/providers/cpu/cpu_execution_provider.cc +++ b/onnxruntime/core/providers/cpu/cpu_execution_provider.cc @@ -31,8 +31,7 @@ CPUExecutionProvider::CPUExecutionProvider(const CPUExecutionProviderInfo& info) : IExecutionProvider{onnxruntime::kCpuExecutionProvider}, info_{info} {} std::vector CPUExecutionProvider::CreatePreferredAllocators() { - const bool is_arena_requested = info_.create_arena; - const bool create_arena = ShouldCpuAllocatorUseArena(is_arena_requested); + const bool create_arena = DoesCpuAllocatorSupportArenaUsage() ? info_.create_arena : false; AllocatorCreationInfo device_info{[](int) { return std::make_unique(); }, DEFAULT_CPU_ALLOCATOR_DEVICE_ID, create_arena}; diff --git a/onnxruntime/core/session/environment.cc b/onnxruntime/core/session/environment.cc index 5f929d3760a95..48213e3e3894a 100644 --- a/onnxruntime/core/session/environment.cc +++ b/onnxruntime/core/session/environment.cc @@ -117,8 +117,9 @@ Status Environment::CreateAndRegisterAllocator(const OrtMemoryInfo& mem_info, co } // determine if arena should be used - const bool is_arena_requested = mem_info.alloc_type == OrtArenaAllocator; - const bool create_arena = ShouldCpuAllocatorUseArena(is_arena_requested); + const bool create_arena = DoesCpuAllocatorSupportArenaUsage() + ? (mem_info.alloc_type == OrtArenaAllocator) + : false; AllocatorPtr allocator_ptr; // create appropriate DeviceAllocatorRegistrationInfo and allocator based on create_arena diff --git a/onnxruntime/test/framework/allocator_test.cc b/onnxruntime/test/framework/allocator_test.cc index 57aa57b88acf5..fa6c4966d6953 100644 --- a/onnxruntime/test/framework/allocator_test.cc +++ b/onnxruntime/test/framework/allocator_test.cc @@ -3,6 +3,7 @@ #include #include "core/framework/allocator.h" +#include "core/framework/allocator_utils.h" #include "test_utils.h" #include "gtest/gtest.h" @@ -15,12 +16,10 @@ TEST(AllocatorTest, CPUAllocatorTest) { ASSERT_STREQ(cpu_arena->Info().name, CPU); EXPECT_EQ(cpu_arena->Info().id, 0); - // arena is disabled for CPUExecutionProvider on x86 and JEMalloc -#if (defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(__loongarch__) || defined(_M_ARM64)) && !defined(USE_JEMALLOC) && !defined(USE_MIMALLOC) && !defined(ABSL_HAVE_ADDRESS_SANITIZER) - EXPECT_EQ(cpu_arena->Info().alloc_type, OrtAllocatorType::OrtArenaAllocator); -#else - EXPECT_EQ(cpu_arena->Info().alloc_type, OrtAllocatorType::OrtDeviceAllocator); -#endif + const auto expected_allocator_type = DoesCpuAllocatorSupportArenaUsage() + ? OrtAllocatorType::OrtArenaAllocator + : OrtAllocatorType::OrtDeviceAllocator; + EXPECT_EQ(cpu_arena->Info().alloc_type, expected_allocator_type); size_t size = 1024; auto bytes = cpu_arena->Alloc(size); diff --git a/onnxruntime/test/framework/session_state_test.cc b/onnxruntime/test/framework/session_state_test.cc index b94d24a1b180b..3e694020f796b 100644 --- a/onnxruntime/test/framework/session_state_test.cc +++ b/onnxruntime/test/framework/session_state_test.cc @@ -5,6 +5,7 @@ #include #include "asserts.h" +#include "core/framework/allocator_utils.h" #include "core/framework/execution_providers.h" #include "core/framework/graph_partitioner.h" #include "core/framework/kernel_registry.h" @@ -216,10 +217,12 @@ TEST_P(SessionStateTestP, TestInitializerProcessing) { // Test that we allocate memory for an initializer from non-arena memory even if we provide an arena-based allocator // if the relevant session option config flag is set -// For this test we need to enable the arena-based allocator which is not supported on x86 builds, so -// enable this test only on x64 builds -#if (defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64)) && !defined(USE_MIMALLOC) && !defined(ABSL_HAVE_ADDRESS_SANITIZER) TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { + // For this test we need to enable the arena-based allocator. + if (!DoesCpuAllocatorSupportArenaUsage()) { + GTEST_SKIP() << "CPU allocator does not support arena usage."; + } + AllocatorPtr cpu_allocator = std::make_shared(); // Part 1: Feature turned ON (i.e.) allocate from non-arena memory { @@ -348,8 +351,6 @@ TEST(SessionStateTest, TestInitializerMemoryAllocatedUsingNonArenaMemory) { } } -#endif - INSTANTIATE_TEST_SUITE_P(SessionStateTests, SessionStateTestP, testing::ValuesIn(param_list)); #ifndef ENABLE_TRAINING_CORE diff --git a/onnxruntime/test/framework/tensor_test.cc b/onnxruntime/test/framework/tensor_test.cc index 541dddabc3c96..fba099f9c55b3 100644 --- a/onnxruntime/test/framework/tensor_test.cc +++ b/onnxruntime/test/framework/tensor_test.cc @@ -2,6 +2,7 @@ // Licensed under the MIT License. #include "core/framework/tensor.h" +#include "core/framework/allocator_utils.h" #include "test_utils.h" #include "gmock/gmock.h" @@ -137,12 +138,10 @@ TEST(TensorTest, EmptyTensorTest) { ASSERT_STREQ(location.name, CPU); EXPECT_EQ(location.id, 0); - // arena is disabled for CPUExecutionProvider on x86 and JEMalloc -#if (defined(__amd64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(__loongarch__) || defined(_M_ARM64)) && !defined(USE_JEMALLOC) && !defined(USE_MIMALLOC) && !defined(ABSL_HAVE_ADDRESS_SANITIZER) - EXPECT_EQ(location.alloc_type, OrtAllocatorType::OrtArenaAllocator); -#else - EXPECT_EQ(location.alloc_type, OrtAllocatorType::OrtDeviceAllocator); -#endif + const auto expected_allocator_type = DoesCpuAllocatorSupportArenaUsage() + ? OrtAllocatorType::OrtArenaAllocator + : OrtAllocatorType::OrtDeviceAllocator; + EXPECT_EQ(location.alloc_type, expected_allocator_type); } TEST(TensorTest, StringTensorTest) { From 712bee13db4471b767caf4776dbd789373ede71b Mon Sep 17 00:00:00 2001 From: Kyle <92152685+idiskyle@users.noreply.github.com> Date: Thu, 21 Nov 2024 00:18:50 +0800 Subject: [PATCH 22/28] Fix Pipeline Timeout Issue (#22901) ### Description Extend timeout for always failed job. ### Motivation and Context --- .../github/azure-pipelines/templates/windowsai-steps.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/ci_build/github/azure-pipelines/templates/windowsai-steps.yml b/tools/ci_build/github/azure-pipelines/templates/windowsai-steps.yml index be8569abf1bee..fb3ebdc760a7b 100644 --- a/tools/ci_build/github/azure-pipelines/templates/windowsai-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/windowsai-steps.yml @@ -11,6 +11,7 @@ parameters: jobs: - job: Windows_Packaging_${{ parameters.BuildArch }}_${{ parameters.Runtime }} + timeoutInMinutes: 180 templateContext: outputs: - output: pipelineArtifact From e4307953328847f0c049aad6d899f54aa92ceb68 Mon Sep 17 00:00:00 2001 From: Aleksei Nikiforov <103434461+AlekseiNikiforovIBM@users.noreply.github.com> Date: Thu, 21 Nov 2024 01:00:23 +0100 Subject: [PATCH 23/28] Fix MlasSgemmKernel: properly process more than 2 rows (#22125) This change fixes multiple tests like QDQTransformerTests.MatMul_U8S8S8, for all architectures where architecture-specific optimized function is not available yet, like s390x. ### Description Matrix B is packed by 16 elements, thus new row starts 16 items later. Also, for next C increment index only by 1 for each increment of C. ### Motivation and Context This change fixes mlas sgemm fallback implementation for all architectures which don't have architecture-specific implementations available, like s390x. --- .../mlas/lib/scalar/SgemmKernelScalar.cpp | 36 +++++++++++-------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/onnxruntime/core/mlas/lib/scalar/SgemmKernelScalar.cpp b/onnxruntime/core/mlas/lib/scalar/SgemmKernelScalar.cpp index 62729256dac23..cbec5d89bbac7 100644 --- a/onnxruntime/core/mlas/lib/scalar/SgemmKernelScalar.cpp +++ b/onnxruntime/core/mlas/lib/scalar/SgemmKernelScalar.cpp @@ -83,6 +83,8 @@ Return Value: #endif + int countb = 0; + do { float BElements00; @@ -116,6 +118,7 @@ Return Value: // const float* a = A; + const float* b = B; size_t k = CountK; while (k >= 2) { @@ -128,10 +131,10 @@ Return Value: Row1AElements1 = a[lda + 1]; } - BElements00 = B[0]; - BElements01 = B[1]; - BElements02 = B[2]; - BElements03 = B[3]; + BElements00 = b[0]; + BElements01 = b[1]; + BElements02 = b[2]; + BElements03 = b[3]; Row0Block00 = Row0Block00 + BElements00 * Row0AElements0; Row0Block01 = Row0Block01 + BElements01 * Row0AElements0; Row0Block02 = Row0Block02 + BElements02 * Row0AElements0; @@ -144,10 +147,10 @@ Return Value: Row1Block03 = Row1Block03 + BElements03 * Row1AElements0; } - BElements00 = B[4]; - BElements01 = B[5]; - BElements02 = B[6]; - BElements03 = B[7]; + BElements00 = b[16]; + BElements01 = b[17]; + BElements02 = b[18]; + BElements03 = b[19]; Row0Block00 = Row0Block00 + BElements00 * Row0AElements1; Row0Block01 = Row0Block01 + BElements01 * Row0AElements1; Row0Block02 = Row0Block02 + BElements02 * Row0AElements1; @@ -161,7 +164,7 @@ Return Value: } a += 2; - B += 8; + b += 32; k -= 2; } @@ -173,10 +176,10 @@ Return Value: Row1AElements0 = a[lda]; } - BElements00 = B[0]; - BElements01 = B[1]; - BElements02 = B[2]; - BElements03 = B[3]; + BElements00 = b[0]; + BElements01 = b[1]; + BElements02 = b[2]; + BElements03 = b[3]; Row0Block00 = Row0Block00 + BElements00 * Row0AElements0; Row0Block01 = Row0Block01 + BElements01 * Row0AElements0; Row0Block02 = Row0Block02 + BElements02 * Row0AElements0; @@ -188,8 +191,6 @@ Return Value: Row1Block02 = Row1Block02 + BElements02 * Row1AElements0; Row1Block03 = Row1Block03 + BElements03 * Row1AElements0; } - - B += 4; } // @@ -295,9 +296,14 @@ Return Value: break; } + B += 4; C += 4; CountN -= 4; + countb = (countb + 1) % 4; + if (countb == 0) { + B += CountK * 16 - 16; + } } while (CountN > 0); return ProcessTwoRows ? 2 : 1; From a28246a994683cb4045cc279c0a18312521eea31 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 21 Nov 2024 18:12:28 +0800 Subject: [PATCH 24/28] =?UTF-8?q?Revert=20"Update=20Gradle=20version=208.7?= =?UTF-8?q?=20and=20java=20version=2017=20within=20onnxrunt=E2=80=A6=20(#2?= =?UTF-8?q?2914)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …ime/java (#22771)" This reverts commit 632a36a23394a9deacf19c221db4ff89287ac152. ### Description ### Motivation and Context Run E2E tests using Browserstack failed due to this PR. --- java/build-android.gradle | 6 +- java/build.gradle | 4 +- java/gradle/wrapper/gradle-wrapper.properties | 4 +- java/gradlew.bat | 20 +- java/src/test/android/app/build.gradle | 10 +- js/react_native/android/build.gradle | 2 +- js/react_native/android/gradle.properties | 2 +- .../android/gradle/wrapper/gradle-wrapper.jar | Bin 60756 -> 58910 bytes .../gradle/wrapper/gradle-wrapper.properties | 4 +- js/react_native/android/gradlew | 263 +++++++----------- js/react_native/android/gradlew.bat | 33 ++- .../github/android/build_aar_package.py | 4 +- .../default_full_aar_build_settings.json | 4 +- .../templates/react-native-ci.yml | 2 + 14 files changed, 159 insertions(+), 199 deletions(-) diff --git a/java/build-android.gradle b/java/build-android.gradle index 9c4275b74f626..d5839f9f27869 100644 --- a/java/build-android.gradle +++ b/java/build-android.gradle @@ -82,7 +82,7 @@ allprojects { } android { - compileSdkVersion 34 + compileSdkVersion 32 defaultConfig { minSdkVersion minSdkVer @@ -108,8 +108,8 @@ android { } compileOptions { - sourceCompatibility = JavaVersion.VERSION_17 - targetCompatibility = JavaVersion.VERSION_17 + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 } sourceSets { diff --git a/java/build.gradle b/java/build.gradle index 845121dd17a48..34ac93cce6f4e 100644 --- a/java/build.gradle +++ b/java/build.gradle @@ -50,8 +50,8 @@ mavenSettings { } java { - sourceCompatibility = JavaVersion.VERSION_17 - targetCompatibility = JavaVersion.VERSION_17 + sourceCompatibility = JavaVersion.VERSION_1_8 + targetCompatibility = JavaVersion.VERSION_1_8 } // This jar tasks serves as a CMAKE signaling diff --git a/java/gradle/wrapper/gradle-wrapper.properties b/java/gradle/wrapper/gradle-wrapper.properties index 381baa9cef1ec..4baf5a11d45a3 100644 --- a/java/gradle/wrapper/gradle-wrapper.properties +++ b/java/gradle/wrapper/gradle-wrapper.properties @@ -1,7 +1,7 @@ distributionBase=GRADLE_USER_HOME distributionPath=wrapper/dists -distributionSha256Sum=544c35d6bd849ae8a5ed0bcea39ba677dc40f49df7d1835561582da2009b961d -distributionUrl=https\://services.gradle.org/distributions/gradle-8.7-bin.zip +distributionSha256Sum=9631d53cf3e74bfa726893aee1f8994fee4e060c401335946dba2156f440f24c +distributionUrl=https\://services.gradle.org/distributions/gradle-8.6-bin.zip networkTimeout=10000 validateDistributionUrl=true zipStoreBase=GRADLE_USER_HOME diff --git a/java/gradlew.bat b/java/gradlew.bat index 25da30dbdeee9..93e3f59f135dd 100644 --- a/java/gradlew.bat +++ b/java/gradlew.bat @@ -43,11 +43,11 @@ set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 if %ERRORLEVEL% equ 0 goto execute -echo. 1>&2 -echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. 1>&2 -echo. 1>&2 -echo Please set the JAVA_HOME variable in your environment to match the 1>&2 -echo location of your Java installation. 1>&2 +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. goto fail @@ -57,11 +57,11 @@ set JAVA_EXE=%JAVA_HOME%/bin/java.exe if exist "%JAVA_EXE%" goto execute -echo. 1>&2 -echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% 1>&2 -echo. 1>&2 -echo Please set the JAVA_HOME variable in your environment to match the 1>&2 -echo location of your Java installation. 1>&2 +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. goto fail diff --git a/java/src/test/android/app/build.gradle b/java/src/test/android/app/build.gradle index 5c6a88792207d..ecbc4b90612dd 100644 --- a/java/src/test/android/app/build.gradle +++ b/java/src/test/android/app/build.gradle @@ -7,12 +7,12 @@ def minSdkVer = System.properties.get("minSdkVer")?:24 def qnnVersion = System.properties['qnnVersion'] android { - compileSdkVersion 34 + compileSdkVersion 32 defaultConfig { applicationId "ai.onnxruntime.example.javavalidator" minSdkVersion minSdkVer - targetSdkVersion 34 + targetSdkVersion 32 versionCode 1 versionName "1.0" @@ -34,11 +34,11 @@ android { } } compileOptions { - sourceCompatibility JavaVersion.VERSION_17 - targetCompatibility JavaVersion.VERSION_17 + sourceCompatibility JavaVersion.VERSION_1_8 + targetCompatibility JavaVersion.VERSION_1_8 } kotlinOptions { - jvmTarget = '17' + jvmTarget = '1.8' } // Conditional packagingOptions for QNN builds only if (qnnVersion != null) { diff --git a/js/react_native/android/build.gradle b/js/react_native/android/build.gradle index 44fb9dd7c433e..825990eba0fb8 100644 --- a/js/react_native/android/build.gradle +++ b/js/react_native/android/build.gradle @@ -7,7 +7,7 @@ buildscript { } dependencies { - classpath 'com.android.tools.build:gradle:7.4.2' + classpath 'com.android.tools.build:gradle:4.1.2' // noinspection DifferentKotlinGradleVersion } } diff --git a/js/react_native/android/gradle.properties b/js/react_native/android/gradle.properties index 8fe6e40d76911..465b04d1f5813 100644 --- a/js/react_native/android/gradle.properties +++ b/js/react_native/android/gradle.properties @@ -4,7 +4,7 @@ # Specifies the JVM arguments used for the daemon process. # The setting is particularly useful for tweaking memory settings. # Default value: -Xmx1024m -XX:MaxPermSize=256m -org.gradle.jvmargs=-Xmx4096m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8 +# org.gradle.jvmargs=-Xmx2048m -XX:MaxPermSize=512m -XX:+HeapDumpOnOutOfMemoryError -Dfile.encoding=UTF-8 # # When configured, Gradle will run in incubating parallel mode. # This option should only be used with decoupled projects. More details, visit diff --git a/js/react_native/android/gradle/wrapper/gradle-wrapper.jar b/js/react_native/android/gradle/wrapper/gradle-wrapper.jar index 249e5832f090a2944b7473328c07c9755baa3196..62d4c053550b91381bbd28b1afc82d634bf73a8a 100644 GIT binary patch delta 19947 zcmY(JV{;`8*Dk}!#I|kQwrz9A_GHJlZQD*Jwr$&XGVguPx930fs=8KJFKvM5PJ!3c zf+kg+AT^cPiT;KpDq}9=O}chd0B)<|s-ykew&j4G{G}pAmE7vz$_^n@snJ|qV;5j$ zMU|q2RKt^Y#9`ZYbzlMhjr*~uYWPv8s9@-O9{qbc<3-+*$FJusdgknHZ}g4%5V+@g zL+eQ_ihmjp`q7)e|ePD*K0L-`AIlv3WVTwiu*9K?I2tPO;%fK#=i@=j8;c_sV>VpJ7ghL*sO$(WwOL+Zq?!0CulF)v8%z zn~m(J+ztvpiToqI*%Gt}2Ru#cLh_^=%cTyzi`OKnmG|02$Eh@0-?wNCwms76R>qM#VQcf1}C(YR0wWiI)#OX0chV;GYnaoS1 z&}-mlCQ5Fp=~;V-?m=#8mtk)fx7V`38l?C(wwht<$1O5~(qD~=!`7_X6u@6&yc7%$ zC_kZA`GMDIr3>{B0FCh)&U(C7`ietj(^;74>FZRq+X|-Z6#@7(f4mDXe!XxmgLkHV+pJell+Ny}Wr(JqC1KL*|Zh$&RI$%a~QG~e< z0XAjqoM&EBlZ8uH+5uRH&Q#4_e)=;?43w5qS#1?|_+3kI`mVSm!oCFs+6r*`z-ebI zEBNt_0(QBVZ!Vz+BL3+_gCh1^9Z=b42GZmU-O$v9ROH^GZXW4Y5z2S&cfSB{-v|j` zaVbMog0r$O;CzheMQ04HUa30@x0p}nG|tXv_#7tIRpsAzhL>W2;1}#EHBB2&NS^Vm zHY9TMa_$G@XaL{`jk)6~@t)y=B)*z$_<{&7ad*!AAHDG($@M>qQiA!2f(H;8XX+P{Me&e zwI#!H=D!D6ex#pyt_L1wcTf^{X)?!L2{vR802wvPC~3+(jhJ!!;xu(>=9VOB08Ist z75FKfz7~8k<~CAGHw6{#IeuJ9^9^neXE#@7ZBJ7qf0fmbQPDFp`t>co&Y$j3j8PXD zdaa0nO>kl(*SKtO~+E>k|~vB5w<dSZcb` zdb3s!Ji5c$AP7EOT8S#L(RljAP0}Fks{i1w($#g?sMWv?A2F1 z+lSSSX1)5q_#wdk)YeA`KONiVebp$?j3CSi@ZBW`A4D9 z4#zZ}7_-bK|C&Jkur`7kdhk!5LiA>7En{eEtQ5PEnfXY0aQCXEWz9UD$wfIazH_wx zN*#(w2U596kBiX6lLO){Zs}?rD`O7WxNu}GPpv?aXkgqm5KpLL(q%%y$6qKvwOA!R z=nJi75ucAK^dBLapd%VD#(xT8Lg>Cn5BG>``=%&=^$ok}B#ja8QvNL{A3y#}dfzWk zU^IpAaSdMFd$1soJzM6?Fz-*A_$_>zN9M#vlp~l*Vf0J&&-rPy&L2bD_PQ-@=KR~q zD|-{3((TB5)^?1v*79et+JSy}@X692kpCqpne=vQ9uWkDhX@3O`2P_^3artF_QV}^ z^NRy%kahI>ok%6zNT)?PyqM^g*l3baNG8=S7N1P4otV~_7z|;uKP(c4x|x8w?qu>F zo+ArL=D3t&512Wd+?>?%e_v#1i+(w=77QTar{LIM`e2_A~nf5>Hr~C}b z0%d@ubFbaS^Lak!jAx8JPk;~Fd0)f3uNJGH5n0}I2lNl#5Wl^W;ip#v9kG9VC28k` z!%v9fkBT(kO=$+jK;<*T;m2LE$J;_LdA5K1C241FjNg>sX;wd7Krrurk|qD17vj!F zVZOym0F2A5DM@cteAG!9A~!UeK0oZcM~S?c!4EIR1Ds{1iC4l3^q zDV$Z^;>R@uBX%O6?kOJSTcyrj6TIrZy2tu7f5qv#CHJ>sJeUCeVgvO&hu$>i#pxP1 z%o5M8TRWz?(nIq68x9$!sR=S}qbjYS5)_0|mb2Fin}-quz*uqDO^61I@>e5=-TfWZ zL4EK7BSXfuPQ{C|=au$cEF1WB4LatP8MS2qg-UB~eb}?-hEhjTMWdQH+MeZj6Si9{b!@}fsaI%)ZHu_+{h+~7_^klVV#cz5(UToarJ}IqLX|z?Ph$Uz6 z6bes^&WbA6GS2;D#Xx?laT8=VJEKKnacgo>o;MoUv6Cgc&NOv&!&b!9SjwBqWCHw3>AWGx&GbEBmgL72CT8$~)of6RbGo%W&!ermUVTC_Ro4#`E2j4Y%w^L7<) z>OTio(3o8&sPcO+tjFNEEs-!^G&!S&Zu77qxi$|?t@Jds6SGr!v(d&_ttMBxWi0cy04)=)1uqaNj!f*zV0>vAWvc+o>=teimK!NTX5Ru2z|)aQs>bb-b~Gvm9kY!Lh>GIuE#Y@;L=8ri@K1N zxr5oLPY91JDq`Vv1aa@5Qp-JIR@J7eWl{|p+SY&&KZMT!(3*ssK#Cp3my3_hpMmjH zXO^*f2k+F*do{9qCmU&h@=>*(z|R1Mq>h4cdV+Q;=Ec2Xkg?jX&16-G($=~MwsBtP z+)7j5o2U3CL-q_pKG|+gMR5=vr4EG&7S6z*swL9SBS!{(v8*y^=MkXUB%15v1JFW1 zG`Pr*c4;%~Mxjf}nrkGw`noFW5bNZ~U3 zi(ujad^5~3iZES*{bfjd>3z(!In6QU15PoNSogZ=VHv%?EQd!kfZy5kssVu^^aEGH z1EnLTzg4H;w2hi<-@W!z?*}Wqr;Vo3}$KBg4Me`-a-(h8i&6&#qGcXH#iz`OV9Z zaIYF-0pj)O)}8hQd_JcAdnV6FE7>PUhb+w+p8vn#A{3ROjE`r*E|>bGJ9gpC*_U=5(+z5by(vT+mOelok>&KSW z=`qDBdqMe6&HqxxA=>uU32aE5Sjl%fhs?kVvv!xhx4GXoE$?|(Z{Oe4d4cI{Q+aY; z>G82uLVG~b1Se7dlE5E7R&z_AZxF|_vM67rdER7>A!g!A-@1}G`LR#e2YK>aJ(;_I zt#Vq|!l38(OVz$(9p_RjS`oe^=Xu?H6hO9>uFvOX+)-@v@O+vpE!r{di@#4$pvJfMj3;D#9g~WmKJ=TZucB0M3@i zF1H~}WOcfi&0}=tO0fNGW986^L&ny%D^yKB6Ek^v;8!vU?l&89$~-_v!t%`4&llkk z3UJ?@pYa%`tV*``b3b6O%_NLlR?6bZzX9_nHB2!uO=*CUsh$d3h6R^b-Ezq#B+7C!~;Z$>w$BvB`0B-1;zLa|E26 z8{n8lqlZzPKVU*zm0w6S_)94&ySW7)uZ=CbmrsS;Rgjm)l~#`h0-f$6yC?u%T90x;5A8Oj*H@moj# z*vF_ z>b3t+*lY*X(T!BmCqAxcDOhhq?0YOxH2eqzzZhvl80~{;TQQ7U#e|E6U1{7rgx4FBwNz zaxYZ=K3(czULx(IS*QfctTQ00=SF!xlk1;w2I9fQxgTR8amc5f(mc^w*cj@M-|p%W z9c0ySRO-fzyM=jWhC-n|3RUNaPf>*k9rS5&XWZ}M1{{Qwp1e=rC>=)Nyf%|wu-_~_ zAdKoRsDBa#Laxx*A2!vv;my@yYLu+XMDjGwSYdj~=VW1es^^#?4NQQX@fMjEr^hDa zEuHo_Bedl2{$7h~rt@Uns?~G)${_;0F<`ay!+T!=kkf1&UbOg-%jJ=E5(u??5Q4Bz zfZW2I{ZP2=X)E{rSB$Tleg{rJ{G=tX4xh-*f!>~&q~_aDxXLs`|^g#ZxjMs|xUx8!C`*s55Zc{Ju(h(&&w+ZFlbBDw_jo{xd1#( zDoNZ4Nqqk3az`ywMlD=?LaAI$%&3$%tcY`B&$ z{CXm2jbVaFc%t5$S6;@*Plo`{T(D3^%w2gW@7btRVzue#hzpqP&E$e?e>tfHR7|>6 z3jO6vyqadkBr4h?-)qb)a5chp^{|Z#)-b(2m5yk6CZ*f4d(Aj%#&u==gg-s^$B63z zoyQ%URxv!~6|?TCs{_p=OE**&^DM$Mh4}27>g|x~ISjezFg6Zn4_l<8f~u~BstN;b z6(M%ic`Yv8biS_+nK8ul7p?q74f;uzI9w%1Y^)o%uN2;Drghm#EFOy zV3?LLj}*Qe@AJ5y{wCkSDOmZ^cJxfb>S5b$bRjqikk&oJfSEQ1CCfyV#=p-Gw>!$`VI|CJQBi44rq zg7QQgMgM`yX)aqPDL}op5-=5_R1T*86=gvTE$v7o1V-ZMf7~nu<LG@S}O!gH7P|wNDG85djI4 zTVTSPTOl&sbaZFSy;ZZvO+!Q00S25^zvF|PeLaNq>sCUUsq#cNxEhuH@~jB-QCpH3 z(b0>KVpP3%?iT5%RiAPluT#0V-l8?WO&YX0y3;{_J#>RHxE;m)@+^W0;H36!iVX3L ziiGs63T&&;q657d1&1McI=rSC@C=LeIM9E%+;;Yi!`rzW6&GZvC?EPf`T~B_2>2sb zju~kU|0YnmXOckomFhP~zjP8G)^EQU4Lc5vd%IVLBuvU9OpD4>x|jB?gvlGRMB^jj z7NjMX{=pMq3}Y;RBk3(Zn0$*2tgBp$t%IJrSle8{00=hLmHoL*n7PThmhAL+b$7c( z`7Ne!R`y)lo{ML7(NLr1Yy=GIThd_7XnZd2F^nsN4^SF^X?@vAt(Ef8MJQvKY_v4g z^l^ygsq@!qtS~X9!*1e)O%B0*fqm1N_LHfK7)l(ebv;Noe!dtz2vu8%zPSJHL{EC8 zo3}(9Q2~=BDP^ByGdllvDmsrYL4?QFPz__{-q^FPTYp+QTE& z&6sKnO_MmR(g#?lky>!rMGeE(Y9MM;U|y#uB%*;1v&eVROYB4v|Mwt#X{Fa$vJu!= zv!g=uuQSGMU*92>vshCoqDt9o>2?>K>P>K~@R>Di|lD?w4 z-w@H9rQ6!_VsL7?VR~ow&c45g?UCB4^|0axGNNlPWSmnBl!kVc@MbouMc!FTLuA*! zoGeO~=+ls;2kkf7+M;X%olh{4d}he)zHMg+9)cfG?9$e8R)MM&9EWcltT|T>ZFHkQ z75uFP{2i)<&dvt?oDdozl(E(kmAVuy=MuzfK2y!;>|0N{+RnWiQUP+{h~XTMaxC^2 z-#9OYl7pxTD@LRx`&4JEb7Ey8gPiyDAAjGJSi&e=?yzAR6`@5dKGh3!GOnQ3(SKUTHioCUU4lr zN|!`KG>s(s{!JSsgt*{OZ)?;ju~jJko9?aXXa;`q6-wXCpdTJ94E$*K8?t?& z0~hZ+u(yDFnW4Y~oXNqQDOj5XA@%-L;Qp@jt|`n<(Z17{W&siL5Sn;0V1RN0UAX{S z{4GO~lS}w%$y4D>` ztW0}u@ij4ev5XI{@vjhKgy2#vCeuvOo$<`B0KLG&Z}cj%!o#Rp%FA@B16&8BHc%^4 z2UETuJZ*{Wn7J|2mfBTJ%^0aXcpl7Z?rk%?HnR`u^KA^y}m9^A;{dJ-X^d5E+jRZ{~BlB`cdeqlbR z0~d@ucNeXr%#+;TLi-FL-KuH5YB7=(CH+mfH0GW}4FHGK12yZpHU{LY2`*8m>ZNOt zF$D3$c9lukXla^FJf$Z?9;FLI&MG=>o_O2onkOf6oefV6-Q=`p>m`%CsTSCxPWn5T z)oAalVU+0lj8h-ub|sqDBNYlYZt`XGK#obaOYyApUJ#5}>O2#vn)E-l73r3PTIanC zVV4B&22Qu)z8A^)v2`d8FV8Dl@8M|U`s6N{@@<}y9B3GizhTquc*J|cRSkG8PkF06 z=2&LOe2Nb7v3qmR^7r*NV^jhB3Q*Hyryif{p`k%$d-IBCa9h^|JAy|IqqNSfK)x3l z`K{O#vy<_WD?E-$6%HdVx4(16%&?Cc(D+m(DuB(Y-rj7vl;VS4RTaZ?%$J-7f};xN z?S}a%0psf#w+iy4!=tFcNXxt}1I(7#z*Ws5HpS8~&mL$(+qJkX;(uuncf{ep-9?Nr zEmf6RHY zK;haczzfv(q$HE3!U?-318)D91$#M1tckE;y)#;%>0YUj6&7cliLv3FV6<*%gB4m7 zbTe7dhf`A_4r2UR$_}pL=f8SrmB)Da&sTWvy zoYD4sM}!8Ma3t%K1*Q^HPZCD(Rpf{L?aD==Qs;aDOLCYn7%BdbK({4knH0v}>b{os z=1P;V)Q*1)804$5-A9$qUtl6@+~KKoyL6KKaH)?D;`!9|EJ3h(^|CXqTU<2!}6KX!Ce$@% z21n;Pbe&(!VJ2^d(~;VF=&JmY*J5=A<;GW3u`}a*?H6>}Ml`auW&aS9g0kKqxNXr6 zl7QKaKrJs{G!OKDKaHbwNuUc#BA8ZLI<_v1`!vCWA|lLoC`81;5XCuH2wB8Ute01G z0p3b>HIhA-Dc*Tn;w5XgBJ(4kLN+}P^BOgh{Fj6;s^WhfEI8M<>8P3WW`AZpzIQ%* zUq9t%zE2CnK&uA?PmICo>=U=T<8iaH&^TkGff&W)cnQb@;lV{LX2o94(UNUpcO*B4 zQ?!ixCnZ~WrzZ&5(A{zpoCY(~IggH*2K_}{=G`cDCW)Gpp71x&`z>-Gok#|=jXOk# zF`lS(-5q$Z2lR4p8o9kSc*@;9c+A~FS@TFYhsPcho|rrIrtvjWd;DA7nggFAp1|LP zz~B2p#J*Azr~*^CgvJ0$GGDb3o-M{jXhDkoLlgy>w_u@RP>TBe z!+LMAm@|#wQ(VZ242sgSY>sUVt>mor52KBF`leNm(sa4$h$r_p)J1bSjFu@;Z$7&! zIZCBX~N)<#aW$A;(N83>%U=hl&n*EFUbk5OP5=GAirufqPu(R zMLAn)T}_mlUdw~`64F}TDeIX4~?${v>N4X() z`#8z@3iot9)%x3*srPwddZTWkAu_W1lN_B1`^`VZfLErGlBKf5Ff>3~JJX=C>RLa(jHxP*MlJ6`C&ns-oN%Kb@i zNr8fgjAD9V>A~e1w01+4@{<(`S#6i&)-w6lqlF4=(7~PT?B*HtWY2ZJdw=(DVR8qG z`xXGVZe{Y4idL#3>zb~?Iaf!=P2{uy#*yg0l%_z5y$@NrdA&JcQ%Tf>yD^W_e1?IQ z2OEuEYR+S#jdE^Us(~JL0f)6s<>5(fUua@Vt65C8a-J`{9@*(AyJgx$*~^1ap;ubw zTx65u2Zzx*MM}a*CW^VohziEZ5SBBYgY@1;ri%IBcE7aCidMWiJ(mi5$me9sQ;|lO zOQQ*vh1gbEx6m;lvo%{~$yuR}w5Gc6jHc2)^NCVMlXaH1-Jq>CEcZJb_m29ME>CKS zSCnZ+*j276wPaD%R@u7y4`f4>!pYn_8+(G~v-L{1*GwiXaYK19f{03@<*$7&C+cD) z{~%@tDzqtc@+HMxO_W{ro>ql6C;5Hwg4Q>?WZsobOE)XvIhKdkeLL(5n4=|Q`g$LJ zZ#h$Bu<>x2yzXSFU1k^Hnmf$4TPi2ZU5ko?y}NVFG^B5zD46P%diPc9sgX`*(l@-; z$Gs^k-BN%+LCW8&i)_Si{-7QFaY-Pi{p)c?{55+|eDzwaznRwL z2}Y~4wS5ZH3^5SE+OA-PmgVKzlYeSnPOep49GV;) zgCsv>Fyd*bS%nRKOch0a=s`p4Y)&>$l-oC7rwJaXoJe>O331{R^Q{N78)vQ4zcA&X z<=|WP;Ifvp0>sZhY`A3IqtLsg!4HSQx4h8Xl+e3n(A#k+y@aGw{LM11ga&;510we*h?m4td*dR>tvLPzJnv!BHOiSHL%smed$gA*;DLX;zplORIwb`ya4wO_9FdT`poYa?8&Z_JGaQd9wc2VQVNpK zubMD&wr^OdpP1ju!lWF6&^&4lDGLlYCHN%vIDyVwLC<9}7Ig>6W{OPYp&P_$2BXGp zgrWO4!3#*&Vv5NBkPzq8j0ZLI$uv#8kN21vo3j?A?bl&Hq;gi2|7BAxZv+5E|GJ_Iz@Ak-kU-ehrEw?s= zkU!N6op-HzH=hHRYf}blrxWmX){qp{hy)HCA(kP@AqF_h^Q}0aYG&0}OT(A$c8Z=3 z@3~ca?6x-=?WbdW-Q}w@a+iLat<=VAW4X7E=@<8u3flmF^Yq&wIIBt#poMN2f;wJl z;Qkx>J~veCnq|0!%Pp1)FTFjX!=uGoa=$tyY}GJ{D6x-uyP3EEMwfpgge@HCnIMj3 zK0EXV9**k|1o`ZS${#lt^My4)Gr#Uw<2HCJ2{DEJL287u9;-2l`GC;E5ZcX!mZer_ zAt@?qJN=+&sO#!xRu8E$unmjiUz~E5T0dNmGPL(S$(R%r+oTjP8wC}mM?R{|IuF}_ zv6AADhxQNd)2s};0yD#HJ+xj~2X^%U{4kSQQqXTN4u2-MSfuudv0tJC4Ccx>qvc;x ze(z{Hy^fJ*X$Tae}2IL$`)e8ybk+A|PDC5o;*pN&5 zUm^C%PG&{3v7w`zW^HFv)3uG+^=HG+uSEXe zX>Z0jnb+^P&$p1zovzm{m)Qrw(_Ej^uzwx5SjhDIf1v=3r?RR<)7ZcTxr9`UQ%z0#>^WD~5LzmK=r)H2@sU4Nk&;@oSenx9&d_==twfQ?K{rZ%0*4H4yrKzsDfNBdALL57NOqN?!ebG<5S{QH*yQB5}j4- z^%u!c=x~y)mrp8n&R~Er8JZBqToA9Aa|m94p}Swx>I*rh&TIkqKzcJ=;<9t9mH&d( zUdK(DG=nWkt^nxvd}-6lY2W3jFZ$S81K+aQ#%^oh=_oth3NHuwUqhXSqpnQ4qrGga zo8WnBT?*|6y^q=Efi<6uz4u4%$EvVtu{qs>jiP#{Qeo06E>oR9b$;7UM?J`A`Ws0Bv+?h#Dbz)gpIG?FHMjqmz^rrQaDeuA64XnL)uWabTfJvtq5Y z2?rMmg<>b+8fGkh#Jhdo(nWH&CAXS_Zz%dlWe}0 zAm-~7I zFDBv`!nYaK)1!xYxj^LVvLwn@=sqzG9se?_Vh1kI6W32|>Q5e>v-r#t1~~981$r?< zjMIe)FWAazBX`wVqs~EF)ke8yO5=9(;e!PYTvfu~-z3MR$dOXauWUfw;l7l_>?HeHypmUA<82 zDGfYyHL$l2{4L&Lq-9C5SuNFeXzJiI6xf=C|AQ5$!7h=E9f)uVAAtqO>1NjVC-jMv1zMG8c-f2U5?{Es#<#?APg%-c?~3xuAC>;D&}6m=DbP{8OF>BUd6Hjp^{## zy@h^Er-@Nsou-&!BxmIA2-?)xQ&Ka)K@%}RqiQQWB zyIYw|@b27RoMe{y1cR0s(7}No!INDdv4Cv~Z8n65jF8o&+{JSVWG^C)C9hHAj0Wb) zvh;(h=#yZu9)t_xi$T-kyWtqZCP^gnk~-O%&Mm8|bi!JbC(4I{IOug@vy*S@FuZ^k zq{>xXL#iCm&aFVY(V_MUn6~NW-htfUeB&@E48Ka`-)Gc1rCx&OVYk)vR-+~dFt@;H zR~tx>)%1g};}Q2K8aHy68L|}BM*@F0Mmnc2kE$}WW^M4x&^dHbe@3-(CecKV)ZPds z3KrZ`f-bj;c^E2`n&@a9WcF{FyX60B@UQdD*C zh%eBkT8!7DO^U(1!gG>_T+*!bP!C#lFzL8_L_1%W!_}cvp1LjAChfPVcmmaP5$f~@ zlV(%LZ}~#T%d!BE-f0MRjWQbAe>SAX;LGj-8c~=zt)1!kwi?(w5fO5by-H)Q-q8fB z=N?Z!IRyDoZr&LN^XEQK^$MDElCB>}f3(W{aOTc&dJ$mrG?r#;VWPZQ%8?i`1G-2#!?(D2t2c=i-3U)SCk z!MOZc{Ap+tNB>%1UZC(P<3pP34~t>hqwo)mbDR%$;k~BY4-QcORMa|G+~wh%Lefk$y1cZME)05&oScHkPC6+4#liit*JGkj=Jq8rf+1K7G6T-+40|M6si z+lso6VD?bI9%aY9foYIka;Y0mW2)6YU4vPqd)oC*kQ3+lcLv-SY7vqWjikpNMV4%! zp^Y+NM)KVt=62C5{_k+}{Si*oC7f)!g&Shm6xiwKd%8ki*`}MHKjG5*CKI*Cb$lb` zLaDP4*Ze*Q`<8KE2k_b@@^JVb!+$e{!s02UD_VBiu?jFU*ou6aNSqS<;I#C-+`-vQdZ_m| zj$-Y927~xj0)5O!(VW-2bC`30Ly9m(WySCJze^fZ@@5pH#Jf!tBO3F}#w0WxdH$)5 zS^RZFF1w#~^$P{oJ~k!tMva`LH$zWcpj23OMdBrQg!jwk;NB20xDMh&SMkujpJL(& zmOVZmV9_V&VMz{`-NW+y9b*K1HO|!CRq^~w1cvMr0LuiH-dDeHbAXe7MWruvkk6ki ze|lzsNRmGZ|B@T-(i`PZSIM5gwYT_Ono&98%1=v=OZT^k8~zy%;RBb?37-^m{*4%d z!6;^bWng5R;0HHyv(oiPCpwLv6OOP7Xx2`6Cjmxu@%&w!5_J_$95+_X$-T1=cZU*| z;jEvSf%nV<%G@msi)d{sYB?aqR}28F__D12Eqie#Fpta^scXu#u(?VmJ zh|6V>H#2w3&O6uIfSr{bnSQE_O;`W{3o(iUW5E<{kxW~RyF%* zmF??L`W8$cys`I!8V}~a`;-TheN()fCshh zWTpu(H_R+m1+Hvf4_75ugS-6^C~rc7XcXC{={AQ#%@0BgX?fBjmPOa7&YJp{?R2je z(E7ziZkpeP}-h^n3g z=PcvekCqjWT>DRI@oaL@n4&)BUoYqj z4IuWK3P~>>lh4>3%+7@G%9MdaU5KE1pTtk8A^~ zpLm#~8PHouXV8LN%>{wy?#SAaN4PlyQ9|)gW?y{-1OvX)6!^LJMq!v%wCLF~$zM`} zBB&D+aq$`&PQhR1DUwzP#w`P*^9q!L8y)G`rPQt%X6cem6<7|B=Q`2AWNyB5)F)|@ zX9|`EZ`6v1rL3`I;afLXes8hNA~Z7i*_PvPHUMhoxV-u6oLKjfrjO{3A5Hno_+hlQ z;0I;!R}foC3EdImJIw?wRC#?~seqKYM{Hsf8c%9~3#3_1F?KUjBBP44y`#qDL`SN9 zzM?P*VNK!p6TlIAfu$3O{PJvMkNqdqKGcFW|J3r_B|lS^^&F%7{aj<}#pJ{;J?OB# z2yr*27ewL(Q;t`rKt71Ar^MIag3U|A^O16~4Q#nr#9^m~t%>F6vMhtPJ7h;U^hXEz z!3(C~s5(gWZ{=0lI+S$X%cShsKOu3!9PMlTV#i64>53OGjW<}}H}P%5CO}yLKBPP3 zr-YD8#-*wyenrvJ%c1C=c=t&Bo6W&;j8cD?x9;w-zkt%L>hqk!MTy2mkv&Lr z+zzZzFJb*i}oikZz`{GAnGv``kV`_UGZu8 zAr%>XV2l9W%(bz*2u)48q>s22jALTu1r;8RdXO!=KzO(;-icGuB$w3%-AJ&QaRf^=Nk$;hBeo>z?LEhmV1SD)=?hd=hhMNW)5cOy#!Oc z5=KbRO8e%BJEaROsBC^-w+ns3i5!6OSI>_o=H!tX)U7o{KYE8-O}wR z6>Ybb^#aw2RR3V%T-d(|__N)7b0ha-f|_BJHR%C>XHcD>(M$@fo@SM7FWV5*=#bt` zKrF-o3gz!vZXs`Rsjjq@lLS;>MvZ9qo7Mh*NN|2oXy3o&!TmA82dap4=yy0lg0W8- zN_~Oozi_D1=3mc|sX)dXBkkpQ`TR2uPP(gIa&kZCNnz5GEPqSOO+t(?$N2dRhDKPx z&hf8<9-By)#U1=JeQIyA>+|;`{}5f{nBf=N84OZeQZcxT1+82kk`GebJouY;p~DAx zx%V9C{f1X-xXUMW%I)O~{=ns|Gzh9AD1{5u{dN9@@otZoFAu(?7Sr8}-MRyY`|)|6 zL%$plRzX#aZtG@3j%IaJc9Js|XL4XC4+SV4bcZr%P{k|pcvg*ay+T0@Y0$I^YR1Le z)pryY?2$AImlV`rQ^dp%bhPvxde9Gb^@@0tkQ?@jo{a7}K*^_J=lx0b1KZaH-YTykugF1=?-P!{s_#b$ z101oUUP2fa?pq|F2dUhf0rtfk1|k$#Z%=e+`n3#DtD1krv0I8)I^&v_$OGxmk`?>! ztjTyOgZp0~y_xmJT=!*7ti1!Al67tiTm7sZ=opzkD@bpnjvfgf@Fqsw15jSE!+%`T z|4aA2xy_?15W<>qc;t)I)TScGKt;TBDNv5aZ)wlf;|7otZ#7b@m~_J>Pbq-28`|h= z2&c)^LK;&#U}6aInyo8YW2fA%AyC9`C{k(gQEZySud?-DlUi@+Ln~Ex!;A?Qz>u07 zbR7}kn1m5?xxsKcCfJPXW-^q^NzlWpOhAgZV0G~>m0&m+>e!=0X8F8>*GiUABIRGq z64~Ig`gXB}=b$C>Vc{X+hhbpj;8g-XMPM2gxH~hY#Q{{gRB}~(9K_OPnSJoUVvsBJ z!I(8RGl$SPXrnTHNcTbt%sbSXN<>9&5L^+P6h}aNMtNW7$up~tP5GuMS(0>8`m5$@|#B%W}sN)QW#%r|TWl#R@ zWcLnIqQmn_8aamVzw8ClMZhRy;{=}i7Fn1?ObImNn|y&&IKCjSTSL|kzppui%O97Y zS+rK)?Z~0h(x~^WdN`i3gSc=ss?OdSl*?!##OIBw3S6=!Q5NWA$USliTFx@g=oDhT z^onX1VyVyASkHP`(;*V+iv26Qs2!vkd5D#@bDedfIWjJwgxUkv90Ce1nPur}Qc_GE z5aBde{91|<9bMz+Wr8EeQcMD{Fc zhLjIcxGFm_QWRy0EGb(=X-u~4k$vq`;|uv;^DX6n=b7i8bMEh)`_6mc_j%4U_kE9= zn!8C<42C>=E)wfBHS-=zaUc7Xlh+j>{fMG#9-{k?OwUA2?{V zsObP4UtD$rP-dwb8D!R$sS`rwEYcG9= zi!LP@3_VC76gja)uF-Bw1YEXjEfIX#-JOwfG{}Rpyz_-ilcvj-D4!D@!|Ute&|Poi z9xH1-IU|r z;}+Kms#AB874av^zOH3UJqnPU7qPBrllSlzJy+~jl+$tCWeriN|InqwPDV>+_=`dvjd?CDp}fe;@B6Tihr=M;4W&!eqg~nh&bYAvPmO9D0Y<}PnYMjMU!ivloY_X zUy9mg&I-lo9-{SECr;McxJYa}@-r!5wE6pD2vdeS798|y#f|}A%DrrgOpQrB=hmv} z6C!2e8d&sxRJbuPIhg%nf*09kzB@km>aE{pCk#ui_F5UtnOBI$C_QE z;y1!P(2XAo2H%AFnP{FLF#k*3=1^V>S?p1_X8G_3z7F|ysiB`|UhdxTaqbm$UonNL}{Qnxx1bT4#vXKPty9PmH5o9eI8uO@%cq}kFQ)W@uw&1msBuYISN%Vl#m zOn&-ajlgYm`7okqKs2uQ^>7x}Cw{Z`dQ6yAv-p|;Q=ZF>vsj+?Vo~hv9{rBVO6gI~ z^%J=oa?Xr$Yik8@a&en3haFS>IUNyGQjfPS`Jtsfju+5PTJGwD8d|IOy+zc#uz zK7IYX(fq0H#nQ&7&%N3n->*e=bX$I@OT|b=C{*jiBGw;Al}YN<_~_PHr==DioXd4g zzY-kb{Rm%D?}@pQTIcgi;QOJ}~AK?)A( zmz=dA1Q~SRv3TAyZOw_Bs&3+aI-umDuIQTHmUaOX(qf^kY;jbnd~k8LvsBS5(``=E zr`+p#Kh9#gaoy%-b#uf>R_CzPIg$cbzq;CwGOOE7!V{>uWqMY~vgb)wnKr##QZC zlMV!Bqi+RcB(XIX3@w`mry$#ki0C_$j*g!+-u7{@*>r>~@>Yg*IG8{%wN3hyiI*cx zznwRmYh9SvYulzB3^Q!U%(!C{q(_%WI~4KPtlg&1=G%>RyY!L_>H5;vas7Ys;?HOq zbZ*j>i>(>)Ho44OHlQayZplToZ&t^RcMvvZ@Zz5-iZk-2JSVg8W~~Bl+iYCC<&&n0 zKNAP&HeM{fy`GWzF@bl~>Zb8Bi&eVj9k0FvD4(-506kwl{MMz5?jZwtI=EOI=2S?_ ztuoweJ*BHH&kE0XCLaCn8<=&yX~Kkec(XLI>DlnGkL-_2m*8{0D>;f|%|}W&nk0J( zIUx@Z-QqchYaMsrj^R*>CYTQe-pGzO=h9P>T9_LMo}yo0_C0>|UVX{Tvbx<({G+_E z+mv*~8@(*}HzyozcJRe=x&DfYtUE&lbkBp}l9FeX;GG@wDGv`mwUele3-4s2tQzW1 zEq71N$zc(cDr{3dd3+RY=#>!IX%<&U@RYRv=#Jy?Cw_b9cglFIQEX(F{Gy>OHJ~Xu zydW}g7q&e7qj^N5VTC6yDR38lf{(6Etk!Ait4KXJX0u=WSz--YpsUK*siE&e{%$dvp$8>=!Vt;m z3Our_qOXFdHJqHEIApiQYI|Shy@dK$)Me(0nBy!o0tJ?QpR0bZm4&3@KOQwekol|4 zkUgXYd?sCH%sTS=fPLTX4-zD1X5lE|uym0jTywZ_Z#N8g@|7^LvyPp%mBTOpNp!DC zRg}bilrDpwJy9q3`R!JAvJ1Nx!FVg(4-`$A$yz_fwXAfYV%?49_0XnGLF|W&SIVuH zKc(z--Y66Mkg8sy)+Da^2hM_;I$Y{X8@Ws?bNqvad_#qchMn#AKPV&05C<5vl{PFF z#+cEkT$#7cR_?&R{qHQLyv8A}GZVax^H=L43)dr*4%h3Vw$<0O2QW3O4XkS;2 zu;OLDDJ3E%TfONq7A=q;7w3XEzo@`vd*Gl>5eiJSLRZTepio@06(-_PMuYg0q%DmxihFyzy)3AfWCX!g+OOFA9op7A1@FK=V$)%i-#p65u*Ev$8Z5qdjwgj z@`-n^>=P379_9gG`f-A5Gm3yPElBF%f>b29HOdOog|IS1-qeE_oy173+9Dsq2yV`> zqNtUkpkgkAP&T0TF9n^6(f&+SaCuf4wZH1iFNzp_5b;k97)jLr!XCdvA*>K&0^AU! zvMlqOK!^~vh4U$bpf1$Hw0R^DDM2-@r6a-dc~;cE?LfbRLG99%`Ul+G9V7&0BUcwsTEYKCYBurL+|tBRi;Az5cK=95Se{Y`AI7DY5{}#@AW)H2=0R}YyA_6 z0Yop#1J+hl717AN_hV4hyq5+BTZPu09N*_hYYSyshN%6?*Drz?J#&tlib*cX1I{i` zP(?6l2?GRN04Lwcfy0Ze;N}t%@bQLJ6ez#U3Z5QCp#B?WP$1!7!Jv+M69h`bMZx)D z3?Lp1X2YdG%@GXX9}N18r~!GwU><@Olwn4KBajsn3aRRl$O>AHB7xy6V8EyY5O5Vd ztBwMfMo$1FJXM^XMuCoFNPy=CqzZ%OD2BaJ1bQ9$eTV_EAZhFZ@E{&KNq{cnDnLde z6@NLd3#=wnMTrShARnZP*%Nv|S0+_tnA8Q{=0Kt(=&8gFCQV|1o_wn6dz=+CnP3I= z)PBFycp>%TRW&w2JtA3~xW7^(TU>#->$@TL?p34@M0e> zCcZMNi9DqbSU!V9G^jBx50tln0n_F{Ll>yvDhry9L7naK`<*A=g92YXfT|Cmq^H_m zze)W6eE#=Y5TpO--8Z{G4lAJcrt`lCF?wh}@EFd=ZDvFVtyzM>l%dZ8G?w|vd)og1 D86?w= delta 21827 zcmaI6Q*fYN6E&KNIk9cqwr$(C@x-=mTN6)gI}_VBCYk*2`7ch@S9R*#?W)~feY02h zTD^AuG}!V6SR?HZ1SOZQtQr^)5PA#{5So-EVT_dVHO!PqS_Gijr8tVDK%6&qZeU7XO?s53M}(nQWu(T*V4y~Q+IgZu`Cg|- zA^NxO&)4z&XPTQ4T(q8r1kU$+3v^INRW5@oYbsjnN+f1%-qEOBTa80WAz(KZ|xo} zjmJR^sH^9dtu)jPdVc;q{cZ@*{lgFH-^|rx5jfrUv?zo&7@6xf zqo{2J?XSS)LMbs4-JhM+oux%=2gj-LDutG->ubB)2_?)EB{+^WyZB+!7mT1{rLTY= zhBe$m_UQXkTYvIm@mXsLzO;ZaX-sd*8TOU{+u|RaQ4=3OA)fBB{i4Ff0M>x$;G=Ma zcigTy3Omv^$`Tq`q03>8Nu_CI-oZETO1CF?vujdca}q^5VwW%3jU=l>GX0P9$&0ck zdq~l*>>GvgA6Taz%F7GuSNLUoa04^fN57B& zyco@qy^}+xizUm!uOdF30KJ;UbaUDoc=X2i5;;X{GYa;D@a;d{4Jo$4RP>X?9tClm zA6c=cM=%!VTMjMk)s!gqqkA5#*o0Q?bWlKK)^^(tV3HwlK-L%B09T73kG}(|+OA-` z^lVb)kt1ER>-6ZSFd(c;kIq8KC)S!(aj2|HINyl4jgt?mD+-z(UScExUcp0v(;MW7 z^8L9qVV11`VMH~qbKYDhqq0|Re9{>1xW5V8Te9E%M&PXgi&h{f0k3Pc{q6jZ%?}_H zoWB$Olp082{{&B9j-g0t5mkg|jl>CvE}(wv3^&}%z#;L<4oA*icEVHCyrV_v8+8Of z@$FclzI0)mRJ~!yEuXL@E9{#QcM1j)91z>dP$XitO{IHxC-z@Kr}#75o26R^MTDIIu@^Iea}0XB#D?J(~_3 z7`p8Cq4U-63wntR0PH+uXw0Ih;)4~DCi1CH(GY9g!eTZolrQ9m9%L3~7}SPu?8-EC zcLo2{|54{e>ya;Y@!R=eD8mVSi?8FvUqHLI`qMWi=TI0=`Sk{KnuJ zjPi7bc_|V4WAV6OZ4_+Gs@1fbVqp|C;%OwH*_Dv0RWBbc}nZ%#zdQ64Bn# zl?%gu(t1RXAtW~S-c)6?VYP87Jk5m*%|R&;Y&h(SucL~?-dNofI3vkQUv6EhQCS#W z3oQ`_l46?W%km>bXxOW$0R5^Gi^cGDmE6>LTAV8rOKNLot}L95UJ+~aCnj&5ch`>B z%WSQ^g0oQ(0n62u2eV_bKAMLr`Suk=n|uk4rL-}Gb^Tlp-1LJADI<||x41^E5S1Y~ zb7f8!!V(lgB-nf2JU#d&oX%P6hfn>GH-9-3)(&PHu81o8+t8XiaHwuT>63bDqrmKr zMiqXD8pL&!CYDdL1$)zZq8^CPAH%Od164X8Y8J3`VI&}a99NeerQ?-0Io8TFlMB8^ zPoTgFCd2Alz9-gvYLJiKe6@P)uiO%wRLS6os1f{`BeE3zD`Wb2X;VgxhN4R0*j>M3 zi6e%iMl$DI0RDmkh*e}N)fg7o%$!@|Qyy=a*dHV66Y#zA4Zkt|uz&I}?9a`HKwBu^`J~UHFKq*{b z|8(%QtrwJn#0buu?cY8K`bV6=Z;+I8-K42=@Y2A=s@P@?oHj0`784JhgLX2=du7hZ zEd+_s#I?;ll}t~lNl)I2K&+&9G{VWdktxQ&j9D;#q^9vLwWP}@q};;cTh}+ z@l6hvdy{YvPAQxjeFbbmzZXyjBC(adii z&Qv@6@yWf)RPwzzhOqy@*n1CTsjg{ZQ{7+RL3KP~SyibD$TX!~%E$<@B+)$~v!iXJ zk9RI`3`JpEvSmh@x}~d>rRcH8@S3OPjSXPg+4Zu3-J{cJU z;jr?$h8jO&537S132!9su=0}hkqRThWP&SQWwjYCUD2l(^+)^^j9X;yY6%`K6DDmF zbWI~c%|Z}6_!EUmQ~Yfn0+SQ#tP$#s80yWSMXqV)tSK#lL`}#}WDL^JeGf{%jCTVb zIWbwl8Cmj;Jp_lKv~-B7IR9_aAy((h0oez$&~k!{gHa+fbB8PRkWyt$n&-q2{4w{2 z8y+RqHJ^P9$!O#-K0hc$-#eBx6px6u_@};{nutFw*mH>$)(~v)8Ipz>GQ|LuXWNw! z`gXl&#i7zX`e7#MDeVClSzkQQ&#DLFOpR`UIM2`={z&F^H>`&a&eB{vE955?NfPox z@<|Tub!n#hr!Kw~e693;xpM6cW;>bT+fXdPV0cjxX+a{`#s#eC}2O3AI)1&>X zv4t02&WA?qK{~D40-BYA@gsjGWmJ%^e@0_jLuHXKysqSZDQ#%=F-aSY9(2Ww4X!xw z7edknLe+}YVZ?)EO{TTfehQ0Zz8RLF03<<$9o32$Q6)0Unbv-5!0e33Vethrydn5+ zGS`SUyJx;dG)%qiC(l$vm>ieqbJb@}uwy}RTtbQ30RDhNn2h>6hCJ`qsTr8kCK8pb z@!##tV=X#LUX`;%i-aQ8L9JADw-6gCDCPp;{Lq%w2{BD;Odql);(tzY}Z9jw?UjauCZ@ z3t=Pk0QZ{}LQsEEz3bjLq!K8QtBi z?HIxS3jnbHKQ62t+{|4ZjQ^jA|1AynuW0fE6a<740tAHO|1VL;+DX;U+KIu`&e+v8 zOifpHNeJyZ60h(#nzvxWENjsTnd`2P}KL%^4&V5;wNMJi| zXQo_sGLD_Y<1-myd=#K)baM8|VFfvIb@v%NPag}}>N-G02#Gz$UjfkS)tkD+>0Ye0jar=7%=EoiXE!Hdk5ucsnxgz{njwOkA5>#;k5*orMm!FJN=e0&==H= zaYJmmFJLj=^U*DF3Y2_=%zKnr$)*oh4xXs#b8}l^bf4K4m~I*@{>q{^m=LH7ofGa|Nc4 z(^xQDF18*5=BBgx^Guv@!U9hWVE6hdVRGr&KHnIh&nPvse*UD%He0s!iIFWZ@OINn znV^>ZD~`;H5FTEoz9{?ZiwivBXn#2485!Q*8T|wyX;H!ShGH5a*X{bmRNjn(X>Wsm zqHOI~oqVf9zsEI4S8a(q-1Q~XqGOW6@67>0A@?6+Ndqu$3k7n7&BMb(ROcR@u`5p2 zA{TBp$&d~09Ws`oXJ-vdH-AJYEiM)rw&4FThDPvi)$L~k z2Lbs5^&g1;uO91#hDoW0p#*kSan;fOIdJ5JnWL&mQK9JwZQ_8EtJA_-+v*bG;K-1p ziPg-KcOq;uba$)^eTNIYEobzer7U3@@{o$Sm-{be{UiP7vw)qq;4H!aiW1-k%Y~mZ z(aHI`<=T7OeR{P`2>@Tv{j_i6VxW#}#ppweu~I4Q6S?;N+^DDb7658;2azU2c1P#} zMXd3b&}_dhMX?vJi!s54DM5!bJ^QD(;#cRqvO3tx0YwHkx) zhfrYE<9d&8=y>@DIFJw5?3hl>caaul>pI{ulD4rJd}sL-A&yKlZmQ{1K?8~xmQ%={jC-&sMHm8m%MjbPTU5tDgnye~P=vVMAk}U_- z3}`%6_aL^`i?Mf$I)0g9{KgqMvYTM(O0!e~)j1nfhqLFhE5gUeh%a0kq=rYS5eB=} z%9L0bgvo6^13JA|`eVavGufFa`EM7SZiI98BX!)*&RCb&IU6&E+f+$ralPgS|8_X+ zgXwYJ5sSV6YI$O}d-C*KXyiP3|4ywOA?e9!mz!>vL=aaj+_4qbaHWfsec#3Jo#i{o zldd<9J1RY~gsKj4#UWEnG zl8o^+z$HhKQ&zV05z34;10-2 zQG1Y{kvdAve5~~T$iAFMx!2GsKdn)Cm(Fux036D@eb*MSIP*q``Kxw^q)jjsE(-Q5 zk8+nejZm)Nq9UwdjT!Qm&(Us6yv4pD7v6vR<2Q{VqP-y@w_dOq2!X8wNTiaOB`4;@ z@J-O++F07}w zsu$PWW~)t{iS^Vz@0|A@lF$2HrXb*L2u)#bmL-haO>b2!TV7bf^pwv>c1HW*Ggfml z>I+aMiaZ+r?{v-&uG=gE0|5#6ufwqYza0i*6$?mH-*#0MNBh2(Ka+RhWE+;L(yBsX z{!eg=e-?@tmKGX)821&nf^O#IJsmvnc)6OM3m&oZWEW3!37o?tPICqF2)t>&?V%2> zZ?>kC=ArSP-*9*LxxVD?a(BNjJQf5%I^mFmNiySM7pg zLB*G8uI7rVc3GQuVr3-1w@SPB|I|~(q2A>lYyI;MA5fDt^NAwXbCOLis<7gA>3TWN ze!>{emZyy>wuSYT_DWyGjWI?Cy`J&8`3=lOW%n`Q@3Ms5`oMoyA4)YC#n`B$Q0$(c z9T`BOc>>xWEoQy@K4sN3>{HmlUS!LTo1RCUXVHcB zm(33Ufzu5Q#y5(~_8`RJ1nRher;)dPcgJXLoNb%MGq}4y&)Fz-Q7y`7(Hrpsk`xii z;5dL)UBWwHT}k>v`qTCe0^nC2>NBw;)apU!;D5mFLXj*dBx&N~QyHVxJ^QUV(CZ^8 zB?aLd-AmPfi*|te5y)5OIM13pLZ~%T&=JySbl|9VrTJr1C$iP5giHY_R$h z7}19D(p^at7}MEldBWS2IS`YE25sgtkcNi&V-$%GMSGvD`R}!tQoA`!`ZVV@$M4?% zHQ)E9^ECgl!1d;r;rEOyBgz8JKV|9_U;*$t6Fl$ZJNs(43aFa@_8J!_^g46?NXrP2 z@4H_#WeWkL6aasP4T6?B0Pc}8V{?r}b+XKQGsM}&1 zFc`s%60dhmFUfij|b}6<* zlg$yfM!Qx20EuY7Z&CDDoyOA(pc)iTuC2c8C^c5#+U5B^`TLlvB_MEyPn(f( zY)z}JVkDHw@mn~olvrC~SU$A9IR2U6>83^7+L;{|hEir{p4r&ibX9lsrE0CI18c?~ zG@Y-ntLX0jU5ChfbphuA{Ca(Qy}p3;@PHJ(&eSFxJUB*|`?vGrei_5U)LC-BZ|oc& zmUn;Tbm*jlC>b~UCC#72lpL4mf|5;x<#|~GnF95@PJ#tJYDfn?%KD{}k>Ye{8fpT) zD&#D|1Txk_4D0t+v-(D?7;g6yc&3bK(tf5xd5Y5B!QlE-Ij#o}PzJ%Wl_73|+!9vx z%O|`fKdu#BH!IivzL8ihZaDVl>CAz2z2Y_=XK>+On7>P1QDXQH1x!SV($?))lQJ1BVoVIy4x+0c=BX;0Ywr;KXvDGCY&CKH@Ns< zQ+zqIgjGGcfL>J}z~hz~a^~b5Z9la+u?q2Kgoj#wKXh=779PtzZ#!si<1gpj+0!mW zvt`R6S_5=H-@|uhzqXEaWhXQeUNk)EV)+R>_FcTqKxw%Bc4Dxd;0Sz6Qy-_*RNOEw zr&w`#YUSB}<2<2sZ6f*vgI(#g)O0$3jT1f5Mu5@0RHQT=P(OZy9h)V=QZCsC~U!nkALP)KipT z(nW?c1wms0gfx9hlb0c4e@&dB{dF(iD@Wr%SD@`t-2Z|l9A}oy#87mej;nTf+2iQ3iXm>@Do$U!qbcW2WMN{Q0WaaPH>dd z=E>Ygs>JtPT31iKab(Hgd26ngjp14>2FyYZ22M9*|PrIuWu>B(=Tzyl0 z${#Jj_&s-L$^L=oZ%{(&CRMU|jzqHw52XV&c#0o=eZWjw zZRspiw~!*uEeJ|Hcwlw>mzwxZzOYqs|MeLt(WeL$E-;?)zbR1ZG2WNohkTmr5nBTD z`iBv3v^auv*$oeCZ2x!w(L>3%99Y5Xd*uMR!?E|a+IBINtBEiihemDYpf4X9B;<4M zEQL&o4&k$x&_P9;Px=6v!;1G!Ij?N|r8n#Vk;7Z)&4RzkFgW)->>4^tNtLljlUK7u zkm1Sq3xT7%$Cl!*cu`bLr9&qB<$-|p1lomJqSHf&_91gn3u-YpwZd2KSzOGCH=EWy zs2!&$i-bqcas%cFjPJ4h><*DTbmqN~h+=tc;GdKL3BfUPr32YR%y+bk)(O!O-{&R{5< z&w1}kv?gn6M!+y$)_`M@8W9F3Sd|+I@)*ZHh!s?l5g2Z}$H0vMEgSCDeCva}I#P2+f_?VxJAO1jgkX`Bqg;B--;1BHKGrMnf_{Am@J?gC#xECQC4N~ z4u~6Q9uhwAN@J%=EVNq8#}6Nk2c8Em4afJAMfI>P1~vn`&tpYa_cyI|PhOYZhctWYnhODv(%D z-X`%;54AetMCSDHn*1n8CEhQj6mAvLO?n42%r2!aN8JIHlF{zEy14lj&3;wJ~9~6v>c{dY`Fq8@l$=I zCVaRg3m|;ahIf1TLeP2}hh0HUet@w4B?7MuNz6-~lhk~U*I=%tg4X6%5C5RD%g4t& z*e#BRjwuaJ$pA1yy_+(ADk%Tu6+%~OPU8ywNLKh2CzPEi;5%t*awtK(`AaaWL*nRw zP1u_CJ7N?b9x0AgLO*1|ONOs@#0HjnjFE+)ovXuh_HhD-@bMu-_0w zQ%btKMgtn+KfjA29m*doh2-H6o?#szV<7>%W`vlXqYClG$BZ$L+lq1|#F6*hG|UuhudmseFtaNM7u zCaq5ITs<(;qnp}^M3-c>^7^h6wMnF2NPNU4*`w?FIzefi!+o9bS-NsouTgyR8Js|$ z$Bmk4b#N))NGBUx4$boD(5IVG8u~$HpLD$={iu2e^qoO42v3yq3F+SpWOY_ zFMvWvsYgB%^3*?iyWkn@o`@#|BYL>pfV@ChJk8S|@H(Ojkpx}RupTv%?j#sGnp`I> z)VFWhX>9>75uZKjDJHHsI02&S8tjtj?2SV;ZB-!Gk3HbjIa~kG6Q8mkyMi0+m%Az3 zE0=nZeZEtID8YxH6uC_hOnsqB7m9<9B;ZbD#qgJIcW_SisWpxQ%ocnuI@>bJ|4}iy54^r6wFJV& zEqijzdS3`3e;`I-o;w99i=6YOP`ov%x=NMC-o9f{<3suUJXzd8sZV~)?(sQA6sV_b zn3_L;&+D#}z@lM-F_LvcylTb%qUYv93x31?h(|cMU2M_v)^iwbh5C~7U>zgSQvPhR zNB4A1sd)j<%P4xx**bI^=;xxx?jMyMv(j$g%`5tEQe^A&xyDxSH=@f&@4{rzV1w4V zc#DT$TswyBW)+Q6XqvP0H*i#$0B-v@isw3x=Nl}2Qw z&N}>i-CnV)xz!J|^!&9A&l+hHj@s($csjf~K69d_#*?mZ`A}9trP#Jpd!f}j^VL0+ z=PH~pn)t4=tjlh02f}jdAK9#KS-bApYJIe#8EXaQ`5*AV@XF#T#O=5g08J9RwRauX zWs2GMoway_aE`Y$=B^7hRc~kFXru#1!Do0nfta20*C18DPLvn_0?Vleh(IVLufaU> zR)n)I9KB6z=4&yb8_<*b67^EjOi!@25a?^BXHa`Ew%9CWB@#Ap!>rZ}hhnN`3%p8& zeoN_LZCC;JbKh3Nzq?YmK=9$*nZ*YT(mz+Th1YYUFVbxgd50r$H`D@2&PNuWVRkoK z&UyPDlywcG69FSrbd(LORq8+7@|4i;aNT;cb3qko*~3A8tuOvR^ zTw~ZgVMiy!4w&;(^ODF?lO{;~xFKiSSakbv=YOBTT!f(7*19_K0Rv&Q&P3=8p@SNs zQs`LEao(VrNwjL!O3|V*+_dTp|@SS%jCf%X-0(=JU&jFa_+54-jh>qBdfp3qMWE z5=#(fRaHTW&ALCrJekvhF%U#bSX)%H1Xp`u8jm0eAVQwpz{Q!_v;72leY)O(O_3nD zkWAwT$_HuIXxI+ZYQs3(-0Z}#4;+seC##hK17!x)^K ze)!W3&#nVAftv~6-d&g|5eN4r_M=32c(z_Z#cr6iX}|I%?(94?R=7d&ICJe5t%d}g z=0~1hZ1)5;u+2`xLh-3ivh<9>)rVQ+1J|1#XJdpdB30A)&o4C(QY~0EDZjn{=Wpm7 zWbj#fu8TUUjX{I8e$dB(`>`j=#QDJfzp78Uh0~HKZ?0ZW;2dpM?ZrEv5MSgzzopoK zu>Ah@ zBTbd7MP6epvo^j_ECX+}W$31B$&KOROaxcB(#0NK9$RKv7n%pMM<1YzAin(h#HX?X z*S{1aRaFrfoAKDVl+LS|Gt3#{$^I4B(@8Ip{IQAWp=1fj)2|*wr1TZO+D)sOp#E6t zE~}mZYOC{HGHLiCvv?!%%3DO$B$;B5R%9Hqp|GcLuK4ZTSVc3v%l!|fF8f&Ci ziC_NmTpWr?+WI!oDJ9^3_P0)&vZnnC=|OXsUw|yV6J;7y&=LI(&F>Z|1ImFW{LRRQ zNGb1P>9DQj4z^p9!(t!}H)FnU;$>GkZ#ZsNnh^D0&^&jgALow;xclGOcm_N&h~2UP zTuT_y1Z)aF`vv$nInbO!%fSd}di$Yi;(zyESy*Pt5g|Zy32iQ$BAbk z!n37YTg616pojDJ_wLu)(7&Oo8riV^ ztSlFas6;fT&?cQx;G1@cvc30JN`N2^3Z(aNoJ~dBQotNsi*~{tjg5PLLU2-YIxHc?R z3lGuWeA-JQ3V)>m6!WU;wtBA4=$j<>SgRyvm;xvBxf?wqYwh*-QZhM@gDAXsKjrY4 z!Ul8hGJW25!YHdKx%lG_n*{4NNds#e9kzArxSa4Zi74mD#&k7A>%0*yRXo~-I{a1m zK~iqRW9bT4(k=N`GB0oAvv`W;{Tk}xZ9S~`x?$b*+sY)MiGY2-S?aAcGzEo#$kw%- zk~|lsgAG@XvxGN7@)z`_!E(cx+=}#iX}v##aQ+W9{G?QS+j7*KLZap_l$Y@@jmdZ` zgT&@eYYP_884p$yE$LvDgp*h;Wtak$J0c2nyD@oK4%3+6IzB!qP8zE*4hZ}+wMH=O z4 zahPD@ohXE$Nj>2qB}zc`p5=ubtJ z^pqWG6<{9m9o|Rlg~AF-Ygw|E!Gh0Ue;n#kJ06yYceL_|PHW9L8ry&T7%Z{35kt3N zw+OJ-#cX&Ob1N-nZJ)WY+32O`#UDI&Xi*n&nAlbyuGS0LPAKR$OU|Av_Ubq! zr!mj0mo={$;7hgMs2}P$qtEU@(ruPj_UB@S#2?$k=;`ZLUt_-B!t$?Y zL1f!9Jl6&0KV9jGc(fr>(vu!rb*ov1f&wKHBs$3q)xX@-M=<;#(7T!fXLFS|2W@aq zRdvTFcFerjm>tjc(og7T%?Xi}Y`$X-GdxTaf3vaYTRywjdQSzjV~Utq z!{CROf;YeqJ~pdJ6@fz)Zz)k_)yPy9{B9uYt$F#wvpxDN3^BCppj~;j?y`WnunchB zvL7*F(1PpF>oht6^E?Y4q(Tix&tbRc&uR1CFLyTwYd#0{ci=D@u)2AiERIA6jYvT% z-Kg-{wO^QOuB3Y;?%L0qVKB-6>jiz$IojUC@y)l16eWS9obr1E`NBS6DX$rFs~?h$ zemJmb@w!{h^XWtSpD)#|G$*D3?mCd3!#J4kH*6?3HAQ1(4w3Wk*0Pm08t;Nw+a?ya z)}&3Eo-Nu>(o0jJfn!#;vdECfp6D-9W%WS?lF;Mov>YOdXE|pQ?^4Tn-ubpzAF}^b zAJ`oE4Z{bH2sT-ELzQD@Xr*JWn702Cncp+GR_?hJ7I@Wkx#a*n-7lN~g-g)|&u5h@RKWa^vBa~QvOsQxLF_Te-O!AQi zmOp>-m*_oV=yp@Y_M1)PxgN;x|I=mKe z^p^O|mVYcdekgy(F5DXl9iGFlVU_%RtSrt{gN}D9W5gYK1EV~7#+~#Q`i1vo=y(| z`}k+INxbx~3s1O`&o(0AU_l7R_;f3^)~&AW_IRly|Dh?MUq6>G1R>#*dAlQe)MFXNK^NxNcIaln;G%%YB~uZV1n=S50Q2BVUT7_92X@7Lgk;nO z^Hq7pElB>LKoe+)2`Lah%yEUe>0HAV7(((hj;(5Nw6TK_*%?Fm9-6;u&RC4^hdy7F z^1Whg)FYF=9AU8BhDK8OcIX~z=o7!1en9ee;O{4NBI&=?V=W0fJ zTe81?6?i2?`8vU=mm(@g*=$9FksagV=uaGX#C4z*zs~+tAyWf3tb_ar{*r-{Kk>(n zq;V$e$0suR5loSZk%<*Zj8%r70Q>0xshrw;IWmElW&yqe9P zX$XWzLvpcjB=vmoOK$iehxlVuLw(L_vR`!ezn&Ky1dqOd{IAiqaIAdc``@=RY)BB0 zAN2n@UHB=d0z`lv+^XXzNSO?EnJ#QL(g;=#lDrV3P4?b)BfPn>^@KysKT^kzd9mV! zVOhpc znkOgpkoW2v&d$qp#8Cgz#jd+ahFAZ6Ry(t~8q9VagBe&kEkeEk7ZN_&QZpZGHAz~| zm&)McfJ@Ce7dHG*Gwk$=*Cc^BTi)1{Urz5KOUYU)9T##bT|7LqCVnZ1)<7PBuKuuE&`)NRCR8 zOaU*K-4Vyjvx%PuR! z$3aa^;hl#mvqQj{0Xd{R;H7&-_^>SNl4EALZG^9%--6fJuQtJIo-hu@!xb5SK?dbS zUN@5ZQ$#0oh?Z6`PM;?{bv%dkwKz~(1^x$xEnPC0o^%8jmK$@NDMU9gTbfIib=@vB z;0ZVyVqvsOeNFCcm=6$L6A+HcUktNKSW6>lm_YK&_{S_pQP6S@Gp(``Xc^XnwH}}M zAF=GqgcpwZeY@^Xn2OD9`zFWZk=xPNi5i4_Wfc zojPk9<~tVW)Ooh&R&eH)&bjIr&VS@HlENKX7xB?AdNnBs=NCPq7*wnBGp4MilxZ`_ zS0?^+sVmU5@{5*hSUxB9tA-EgpL1VqSnLGyHUD-BTdYFAR!j{3;z@HVDOi&Rx<=)B z=ntQ9I4@iA#*Cdp*l^3ZMbGS#>*xb^=tyFwa!^YXu2k%8Ow+#wXNIp#x($wNgjc*&Kj0>P*r=wBA9~6Hmh6TF znaZyHuYw8#Q;lK)6N(g#Wa`E)V|idZy?n;9!NG35r^A7kL*wW_adjxwZowu4cvQaVo*mO+8r5LR+Mgz=HkI=ob4Qw_IRllR%n$ zh_{Y?KT>^d$ALWMawfCNtH!xx-sukl(Wx$SeAsoGoMqbY1if3viKKOg-N~c61WzqF z)a*g#8g6v^7L*)$xoC)kYYVfQEa)j;pLtu)2;)xddP?3l$X6fVK^A*kcP?vIde1bY zoTZ_{y#0E$!PcRBE&EQ>Cnums!i}WdYA(-`M$kqju@Y>Ia?qaIdp9|fDb90zjIP^a zs$7DOdRBjN(VjuCxs@EXt_N_dx&SJkQ#_LORf zlPv0}BE>Inpgd$7P{!4aHS;yE8!f=cV7=~*t}DgzlwQ{wh^OM>(~aSg2tl6q;5C;( zQ-e*uS1aCD`KfI6{GxT;wo+vAi42v3#C{fEp||h|L9fMQJO+%H;qb?a>I~{LFDZ~a zy?v6-Od`#}0hM(?wG*8dTi>QnKdiq{kD*38~F*ez-=W&9~!# z@PfeC7*2M|aauscu$f4eVcbJ;{P*GznbHVyHNm@e5DMkAcJ|8TOyd!Ng+h3HVcCf> z-frSi%xDWUkeoN)}RDAVE| z^GAy|AdQ7TvMRTj(UG3tbH_u*wJe{+=aFI{T>iPJ_zIyOEm2DBq4ja;=!(iA#^$3SsQ*CzF2%kqG zZaw$|a}2B+W9!H!$X!M*(1_Y*%~uOx=xj!C7SQF)aYfIJ;Z!~PU?LPW>nT+6TaX=f znHpy=2fvTeF?d|k(}ET(E`Ymr1~Wvi*n?R-8|P^bclWKRa}b z5(aDCEv$Ka&MxIabc zjp&OHJT@4$`a}rnn|Q_P$+%^G3kVR(_J@3ZoPk7~r) z9|aCPkkA|K!%c-*S0l_}N>4k6`@ILk-RggC+#6A{2+v=L>m)ouV4AHx&xoe6OmBs^ zxiZ_`1qc}3h45M3iTV*PWkr~i_p!$AN_qo=CC@SlyL;Vl8!%%Km4Jpo*~3 z`p%nUQUNi9oM#Fj#RI!1w^*OxYLnZ#Wq=)QdkqyqtY?=!f=4!!!x&6i)1nq_|8*CI z%?m{LOrA#LOtXpbX6%a;GV&IBTlZ<&=<=F42~KObJZ>C%?&Zfd6X&0lNYj#S%uqak zmvpd&3pTOSveTmZLN%oUClnJ(F<&O{2s@Z;n8x(@5TOhnhTr^uvLYpm2ziraq5<+; z6Nh|gjA{DBkjh(;fkiWGI#i_)mAuJR*4$s_zFo8M)HQ)}jSA>7rWEi2$&M2KY_SdU zRhjtlI_o?Vxi{2m=tB$b3`tD?k!##fw%;aqte>?5yJ;1tMnXQ?ej<)=V~YWF;Q5m1 z&KWp0LJFU*y3Hc* zOe?*T5t@2y9v8RRO+9_>0a50ZVV;m0V50qgc9~{ff^n>;B}eW0yBL~4`c}@8aLRr0 z+l^C|E!zHBKMmdStyp5L>xzAb{M&VaL&a8-KG&E0oOQ|2$Gv{n4i>;1Zw4V10J+Zg zVWQl_aW+0mML=pq0;Pd5xlVKnnqn7wN3M_R5z>y>vmL8;>GYO)cFSyAHo|fu%Gf1{ zqGUd}l=2O2uhpwBqlk-5zrgu5AK!rAD^J5<>o$^zxT~)3(LUIgOWl>CcyVt2Y`SL2 zSXlj&F>lkQ4}b2iC=5tJSRlYP?CVuX!(PT*#xmg@wfU3wWp~{xWb>-3x6bo%%jd}A zC(wEsL_TIkXKTsXZit?+>GY$;k{`AiV^i*rNf3<63wu@-R&>ZP4=QQPmZN9*Z!+4s z!Ke1)%412!n7I$83zR5lbB3-|IOJ1;;V1=n`w1^HbKQg6k~ZMzBWC4N>=U%}Q|D06 zS{@X9ED~9ves;NIzGD4{Sf1O_Ur9V>CF_b!)wbyhdKZ^qRE~FIk;J?x-EZmSc+$MU$yw`&Sp5gmoAZdqXT_X+erbn6=li? zdpXPpIoJ4$@eO!#N>N#pB;FBv`gKE#W4YyOt5vk!uto=&&#By`?$Kv>;Dl73L}xbm z!+~`H6tM}ZHjUGZv|5U4HrSgiPe{9dLBgO~{){8#@(_TjAvO^=#>Z2~YVo;#y0l_7 zKa7N|$jWYK00n=01U}qtH|)Ww`hm z@%^bkE{~BXgsH@he5@MCP#P0?ZqjqS;6Rfc1ILFsK_7m>STdy!K`-Y)&7bMrEl~@Uplx_)OX#`nPkOiefLKbjI0f7Mnkya#?5a|$6LSR>r zlnx06q*H$T^490~o%5SHbDrzE=bC4pndgs*bKh4W&3K%dz-QVzI;2t^M}6aPAjQtq z(~0EhwU3XvB!-yv$B^OM_k6A0=85z@c7(SL(zPM`81?=3iH0#3EkEw&EN9_umS7K# zpkhSQnov}d#4c^4)i|Fex4kKBH(4J2>l}>w z&i%Xhm8!E0Xwk%_Sq2*u29e=Mv73kJoobaP+BEn2_aB3!G9S$@e5K(RP`O+X(P~m? zu+90r8dGW-FHKhN@1QA`-A{2Q3;B`6*Jp{(RFkW{>3+_q@nJu4scc&s5}CG|F}n%r z$JG=cevr(Gk`I@)3`=DTBi69G_g&}-d0`RWK=!`VD2dt5)cys-Dy=c%+*c0fR|8Ok zl0*1$L&>kyrcGMAf)irTU-iqh_((j{nj~cwGr-vD?0&Yv)kA_=Kxxtnr36vjOO3ok zsh}{a(JOlk>2I|1LQgPg#tA9v4W9HoqvLu>cdQTvkC>06Yy;osmn@@`&IBRmw z7ffk^TrW1vk!IbYMZ?InU56LcKbNJinmx$efM*8=lEzS{@0#?0*Ca6myicxqzM4_J zC60-v#)YzY+tS1;Ur{Bh1aKwK&+=^RR}m*EP^8ytbr#EhS}rkZP5shOyeJJZ?aiS; zL*?Y@(lL~kHbHEgQJ+gbDMnnzkCSe|bF;N59>{A{<|r~Yap@f{Nyut{GO;}-{bFWH zhkKXw)(Y-}PpBedLp7AT#J+f6bPK)hV245LxrO@aU3iUfh}xh)Te@*$!VHvZWbVsd z`7GaxN10L{B8e{nub1K1d>IbF)*%Gj8f^ZhQp3uY2y+WHnq4tbfJ$!La%p903~{jw zltpI-9VB`$9GWi9orKwy)n;W2^fS`uu=b&3w7aL>c&N1A$l$1bFQoHq=emkdWBBwq zScQ9=K?CUS^~IVAA()EgC(hEs)NLPTcE^S$ z{1QJ(1!UTjDzUzb#P>sqGMvazkEAQPnZYy5h zF3rsS@;bQp{f5u`X3;t|!!B2843GEdtilr^{Ko3~&Y5<$GAAvZ|B=+3q7z?e$&;0@h>-ova6lV z=g8j6$(1-<5)S973ze(S>K&m$$1#+N7Kjuzv*fgePy46G?dZ513N5a&&voA{pSi5E z9!QPf&Bm`e{(wvgE9Y)5e(Y#cZvXX2Wr=KVD60)27Cw5P@>+9P@Gr)iAa=V@GQ;CH z6)upex`c)B&%0^eX%T}EjF**64{7^-&jsF5(mG{gD>YxamnXEWO~&JG!fssq3y9T1+)4guHMPDS);3YJE6-iMZy&ag`1hTFyw<@QY>0G*HX7k zo{6BF=sJ!s%;yJgj6IT^zQR*XS~03SuS0|PD}5Cd(WQ9dra8~hGs7HD3n?F-Fc}q_ z&ZjcQP2V#06xOROUWfct%**Y{^2E8xuJ%bL+eBA3hR-bNvrUUha;@nt{V1tSbDZ~R zea&j=THU42`MsFvuo+`fGURM#qlYm>ux=;v^#z;ebX+{n$2{Fh#pL%KBg2? z6ke~u)UekcS*xxR%OQf6{7DA3+w zK^v3WMAUg#=3rcVuD-I|^^5~OzF9n&vV6IjvP0c8E~Y$wbJ29ikR%u_;=SxInr$G8 z)gW)j72tL^Wb48NaFGhh{&~&tV>1Pv!k{yrHo77gJ63Rgmr}VrUdp-3FL%h;v(Oc2 zE{Yr!zuW&}`5a@bkI^frDqdQn{zmpv zmeCYUE#8ytiLbL{P&7C4MBa4Wqsv57OTyC)6zBYyDt z#8a0Lvp0yT-bslq@)lW|@PYu@p2GChf}{s}E{w=Lb_ERT*C^<``6=U^O?p~Q>Ms(c z^R_Q#dh)qdCVfuWgNCC_ME0tfbQbE-U~>>k~Kl;$#`$CRxR zhXV}Eago`Tq$Kn-Fxpi?)V&d-D_6HFqZ`pFQj1Qawm_M@YMbR0^!-A}PVKE7j&bLN zT(N84wDsNyM(kGpw~-o}E?;6xI6Vqr7mOAoy{P8!PZm43*ltS_R3Y z?V9Da>;sn^sh2oIgssCKkfAgg_5=9(w9JL~lnqG;Md+aD2&~f2JOiLxVzhglNK8bu zM)++n3ld-F1KUQ}Kub$n41VxV^MyUbVm9a`lPZ&{AVM&r>Gs(3aTr*q|E15^kd*6) zNLe>yoTVHQBPQYFyznVw>?sNtZ%}%GTH8 zFBE#oES>WkabUajM$xL^M(wi>S%-D{THQpABM zJ!UTZaST23>D(LkML$>_3AYLg3w^dvKx8Y5V_Gwx2y+El)}) z3cLUwTS;R?__}Aw+I3!+pJ}Hm7w%-yp-Pp_*QkzV7M9=EdPdZ%4eJKAB^(~UUoxO_ zqY*hY*4=%$`r^EC98JjDQ@kQt?}6z_;GR-2$#q+9_Ej>RC2( zD~2n{(O)i_TGNAmkqGX4cBFhfv}|KTigQrI8j^Q* zl6Lm`o}6|_LMRyXBk6V20>o;_07VI2R|hteKB{;-}~?=aD5;OWqbEv zc%O{ZW==)fc}0NNhI-mb+Lmhi3)F^Y+HVJ={{AW8{`B*vH`-cCM7?L^VUZhyz)_pZqM0osX=I9hvWZ^8NkB*P~m`2PI)015W!z8Hi3Raj7fBRWmQc zcEnKby`Y>#0SBgJ@z3$Fat@eNilAdmpy|P0WuBV=1Rm(QizSHoa*~FS=_*yP~>$ zJn$Z+MX7TwsqRcBqLl+u>Sd-(e1107=6y+&P8*Uj7B`K`tM%T%-Ky8@cXiZCUTTgd zjRg2E`Y4z`54pzVcO2DLZEXoyybb8yw zf1ZP$|tCEc7Yhyc;m(inMR>IkzFe9uLJLxWb~sK;2kZi zfK(H+a+dh*jQ-nvuhw~)52`C*(;SRiKdZ5?W|XJ|ys~1lbhT$Ws91l-V57xF>=_~W zx6M)w*sN(3)g|u%GJj*8G1tOuHpb9irREkfohPYS+n=|Xnjfy8tq#2(Kn6eANbAFh z8^rEC!%ogBGGLOD+N+3c{g1FQ%DQ`JehE*D?GyDg=r8ObtXe<8H2* zvC4?+O@5m5?4M{B!Q~#(8J+dH9CYSU8;8Cmo4{qgx~uOsOl*u;Xw6!}7bbnwAsF;Z z#>}s*(7Bzs_)Wb}XL20t3!2^3X}l~vgVU9z-=joaET}6qr-AbOXal)x>$r&W`1%T& zhv*=MpUypv<(HaW7l&$SuX98Ek)#`P4#GwtYthgNF0d9`P`=b8Bi)rXO} ziOE-{M5i-t64KPh8s1;(qp_LtdSlR4@_ibAzlC z{3V#l!rybPzg3oDYw}$aVh1zidFac>`smkdy=GOJtH>!)Kyd+gzuta%i<|=y;V5qO zB0&BHJV@lIZ=JH?%Hf@F+WFsDf+|VxeDqDh9Z2Jv=LC!?TxEy-^5YhR{7FdklG}XE zuPAx(a^;R~@Wc%ztr5fea45nDqVCKs@&z7>)U4H9kBiC)RoI} zHPMt#)IX}4jmpOTs@y}`RL^ueQn6BYwjD!8m#W`e5JPsUMmxoWbqcOYcV6+?@>LEO z7y^%f>6y_=;dYHaWWzBesaYEA{ze82T{&J>jeg{fcbFb9Xb}aMe`f+UxLAFDUbeR4 zIbvmCc=VVaC9fdYXTrv^YRapIP4VDiIw2BSiS6*byKw!R$!3?(B{iqH0aMpapPChr z11rWj%t!i5kOS}NYxfBL?$A0zZrY?{2ofXfTh)IUVUj`JG?Pjta7-@LGwX89RcY_w z!T=_Z!7AnY+5g)x(Qe=z!7xz({*T)Z!S05Su>HN{heK&V*D)jzMg!K5nE{HgV6AS=@S~j3HK?Sk7Wd-hoE3VJe2m|VQlb$s*^5&w&1CzcM=Kg zBTnHY2nTJZ5Wu-hr?hlR6X26Nh4eY(AS9E8uonudPs0E~*}uXpVAl*3d`Sq&%AbZf z^I5@P(+EIH@syU$(1MmT7XfjVzo-_#t$qsGXXOE3%~NPq#(COv!7L0Y(JbA>B>(y-{kNIX->1tD*ZKdt`OVtsbUrQ+ z|L1%}-v*PR%%A}=9Lyd-00|y{Q6_ME;3BZ=OQ0N}#uqlSQ$rZg{tGiO>USC}q~Ztb zzd+%?`8fPNDngqdemm$?NIED4|E)OuH<4W^LBt1&4MV|@K^V}H1m '} - case $link in #( - /*) app_path=$link ;; #( - *) app_path=$APP_HOME$link ;; - esac +PRG="$0" +# Need this for relative symlinks. +while [ -h "$PRG" ] ; do + ls=`ls -ld "$PRG"` + link=`expr "$ls" : '.*-> \(.*\)$'` + if expr "$link" : '/.*' > /dev/null; then + PRG="$link" + else + PRG=`dirname "$PRG"`"/$link" + fi done - -APP_HOME=$( cd "${APP_HOME:-./}" && pwd -P ) || exit +SAVED="`pwd`" +cd "`dirname \"$PRG\"`/" >/dev/null +APP_HOME="`pwd -P`" +cd "$SAVED" >/dev/null APP_NAME="Gradle" -APP_BASE_NAME=${0##*/} +APP_BASE_NAME=`basename "$0"` # Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' # Use the maximum available, or set MAX_FD != -1 to use that value. -MAX_FD=maximum +MAX_FD="maximum" warn () { echo "$*" -} >&2 +} die () { echo echo "$*" echo exit 1 -} >&2 +} # OS specific support (must be 'true' or 'false'). cygwin=false msys=false darwin=false nonstop=false -case "$( uname )" in #( - CYGWIN* ) cygwin=true ;; #( - Darwin* ) darwin=true ;; #( - MSYS* | MINGW* ) msys=true ;; #( - NONSTOP* ) nonstop=true ;; +case "`uname`" in + CYGWIN* ) + cygwin=true + ;; + Darwin* ) + darwin=true + ;; + MINGW* ) + msys=true + ;; + NONSTOP* ) + nonstop=true + ;; esac CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar @@ -121,9 +87,9 @@ CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar if [ -n "$JAVA_HOME" ] ; then if [ -x "$JAVA_HOME/jre/sh/java" ] ; then # IBM's JDK on AIX uses strange locations for the executables - JAVACMD=$JAVA_HOME/jre/sh/java + JAVACMD="$JAVA_HOME/jre/sh/java" else - JAVACMD=$JAVA_HOME/bin/java + JAVACMD="$JAVA_HOME/bin/java" fi if [ ! -x "$JAVACMD" ] ; then die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME @@ -132,7 +98,7 @@ Please set the JAVA_HOME variable in your environment to match the location of your Java installation." fi else - JAVACMD=java + JAVACMD="java" which java >/dev/null 2>&1 || die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. Please set the JAVA_HOME variable in your environment to match the @@ -140,101 +106,80 @@ location of your Java installation." fi # Increase the maximum file descriptors if we can. -if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then - case $MAX_FD in #( - max*) - MAX_FD=$( ulimit -H -n ) || - warn "Could not query maximum file descriptor limit" - esac - case $MAX_FD in #( - '' | soft) :;; #( - *) - ulimit -n "$MAX_FD" || - warn "Could not set maximum file descriptor limit to $MAX_FD" - esac +if [ "$cygwin" = "false" -a "$darwin" = "false" -a "$nonstop" = "false" ] ; then + MAX_FD_LIMIT=`ulimit -H -n` + if [ $? -eq 0 ] ; then + if [ "$MAX_FD" = "maximum" -o "$MAX_FD" = "max" ] ; then + MAX_FD="$MAX_FD_LIMIT" + fi + ulimit -n $MAX_FD + if [ $? -ne 0 ] ; then + warn "Could not set maximum file descriptor limit: $MAX_FD" + fi + else + warn "Could not query maximum file descriptor limit: $MAX_FD_LIMIT" + fi fi -# Collect all arguments for the java command, stacking in reverse order: -# * args from the command line -# * the main class name -# * -classpath -# * -D...appname settings -# * --module-path (only if needed) -# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. +# For Darwin, add options to specify how the application appears in the dock +if $darwin; then + GRADLE_OPTS="$GRADLE_OPTS \"-Xdock:name=$APP_NAME\" \"-Xdock:icon=$APP_HOME/media/gradle.icns\"" +fi # For Cygwin or MSYS, switch paths to Windows format before running java -if "$cygwin" || "$msys" ; then - APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) - CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) - - JAVACMD=$( cygpath --unix "$JAVACMD" ) - +if [ "$cygwin" = "true" -o "$msys" = "true" ] ; then + APP_HOME=`cygpath --path --mixed "$APP_HOME"` + CLASSPATH=`cygpath --path --mixed "$CLASSPATH"` + + JAVACMD=`cygpath --unix "$JAVACMD"` + + # We build the pattern for arguments to be converted via cygpath + ROOTDIRSRAW=`find -L / -maxdepth 1 -mindepth 1 -type d 2>/dev/null` + SEP="" + for dir in $ROOTDIRSRAW ; do + ROOTDIRS="$ROOTDIRS$SEP$dir" + SEP="|" + done + OURCYGPATTERN="(^($ROOTDIRS))" + # Add a user-defined pattern to the cygpath arguments + if [ "$GRADLE_CYGPATTERN" != "" ] ; then + OURCYGPATTERN="$OURCYGPATTERN|($GRADLE_CYGPATTERN)" + fi # Now convert the arguments - kludge to limit ourselves to /bin/sh - for arg do - if - case $arg in #( - -*) false ;; # don't mess with options #( - /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath - [ -e "$t" ] ;; #( - *) false ;; - esac - then - arg=$( cygpath --path --ignore --mixed "$arg" ) + i=0 + for arg in "$@" ; do + CHECK=`echo "$arg"|egrep -c "$OURCYGPATTERN" -` + CHECK2=`echo "$arg"|egrep -c "^-"` ### Determine if an option + + if [ $CHECK -ne 0 ] && [ $CHECK2 -eq 0 ] ; then ### Added a condition + eval `echo args$i`=`cygpath --path --ignore --mixed "$arg"` + else + eval `echo args$i`="\"$arg\"" fi - # Roll the args list around exactly as many times as the number of - # args, so each arg winds up back in the position where it started, but - # possibly modified. - # - # NB: a `for` loop captures its iteration list before it begins, so - # changing the positional parameters here affects neither the number of - # iterations, nor the values presented in `arg`. - shift # remove old arg - set -- "$@" "$arg" # push replacement arg + i=`expr $i + 1` done + case $i in + 0) set -- ;; + 1) set -- "$args0" ;; + 2) set -- "$args0" "$args1" ;; + 3) set -- "$args0" "$args1" "$args2" ;; + 4) set -- "$args0" "$args1" "$args2" "$args3" ;; + 5) set -- "$args0" "$args1" "$args2" "$args3" "$args4" ;; + 6) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" ;; + 7) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" ;; + 8) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" ;; + 9) set -- "$args0" "$args1" "$args2" "$args3" "$args4" "$args5" "$args6" "$args7" "$args8" ;; + esac fi -# Collect all arguments for the java command; -# * $DEFAULT_JVM_OPTS, $JAVA_OPTS, and $GRADLE_OPTS can contain fragments of -# shell script including quotes and variable substitutions, so put them in -# double quotes to make sure that they get re-expanded; and -# * put everything else in single quotes, so that it's not re-expanded. - -set -- \ - "-Dorg.gradle.appname=$APP_BASE_NAME" \ - -classpath "$CLASSPATH" \ - org.gradle.wrapper.GradleWrapperMain \ - "$@" - -# Stop when "xargs" is not available. -if ! command -v xargs >/dev/null 2>&1 -then - die "xargs is not available" -fi - -# Use "xargs" to parse quoted args. -# -# With -n1 it outputs one arg per line, with the quotes and backslashes removed. -# -# In Bash we could simply go: -# -# readarray ARGS < <( xargs -n1 <<<"$var" ) && -# set -- "${ARGS[@]}" "$@" -# -# but POSIX shell has neither arrays nor command substitution, so instead we -# post-process each arg (as a line of input to sed) to backslash-escape any -# character that might be a shell metacharacter, then use eval to reverse -# that process (while maintaining the separation between arguments), and wrap -# the whole thing up as a single "set" statement. -# -# This will of course break if any of these variables contains a newline or -# an unmatched quote. -# +# Escape application args +save () { + for i do printf %s\\n "$i" | sed "s/'/'\\\\''/g;1s/^/'/;\$s/\$/' \\\\/" ; done + echo " " +} +APP_ARGS=`save "$@"` -eval "set -- $( - printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | - xargs -n1 | - sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | - tr '\n' ' ' - )" '"$@"' +# Collect all arguments for the java command, following the shell quoting and substitution rules +eval set -- $DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS "\"-Dorg.gradle.appname=$APP_BASE_NAME\"" -classpath "\"$CLASSPATH\"" org.gradle.wrapper.GradleWrapperMain "$APP_ARGS" exec "$JAVACMD" "$@" diff --git a/js/react_native/android/gradlew.bat b/js/react_native/android/gradlew.bat index f127cfd49d402..5093609d512a9 100644 --- a/js/react_native/android/gradlew.bat +++ b/js/react_native/android/gradlew.bat @@ -14,7 +14,7 @@ @rem limitations under the License. @rem -@if "%DEBUG%"=="" @echo off +@if "%DEBUG%" == "" @echo off @rem ########################################################################## @rem @rem Gradle startup script for Windows @@ -25,7 +25,7 @@ if "%OS%"=="Windows_NT" setlocal set DIRNAME=%~dp0 -if "%DIRNAME%"=="" set DIRNAME=. +if "%DIRNAME%" == "" set DIRNAME=. set APP_BASE_NAME=%~n0 set APP_HOME=%DIRNAME% @@ -40,7 +40,7 @@ if defined JAVA_HOME goto findJavaFromJavaHome set JAVA_EXE=java.exe %JAVA_EXE% -version >NUL 2>&1 -if %ERRORLEVEL% equ 0 goto execute +if "%ERRORLEVEL%" == "0" goto init echo. echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. @@ -54,7 +54,7 @@ goto fail set JAVA_HOME=%JAVA_HOME:"=% set JAVA_EXE=%JAVA_HOME%/bin/java.exe -if exist "%JAVA_EXE%" goto execute +if exist "%JAVA_EXE%" goto init echo. echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% @@ -64,6 +64,21 @@ echo location of your Java installation. goto fail +:init +@rem Get command-line arguments, handling Windows variants + +if not "%OS%" == "Windows_NT" goto win9xME_args + +:win9xME_args +@rem Slurp the command line arguments. +set CMD_LINE_ARGS= +set _SKIP=2 + +:win9xME_args_slurp +if "x%~1" == "x" goto execute + +set CMD_LINE_ARGS=%* + :execute @rem Setup the command line @@ -71,19 +86,17 @@ set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar @rem Execute Gradle -"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %CMD_LINE_ARGS% :end @rem End local scope for the variables with windows NT shell -if %ERRORLEVEL% equ 0 goto mainEnd +if "%ERRORLEVEL%"=="0" goto mainEnd :fail rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of rem the _cmd.exe /c_ return code! -set EXIT_CODE=%ERRORLEVEL% -if %EXIT_CODE% equ 0 set EXIT_CODE=1 -if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% -exit /b %EXIT_CODE% +if not "" == "%GRADLE_EXIT_CONSOLE%" exit 1 +exit /b 1 :mainEnd if "%OS%"=="Windows_NT" endlocal diff --git a/tools/ci_build/github/android/build_aar_package.py b/tools/ci_build/github/android/build_aar_package.py index 1b34b3d302e57..19f66245a45e2 100644 --- a/tools/ci_build/github/android/build_aar_package.py +++ b/tools/ci_build/github/android/build_aar_package.py @@ -23,11 +23,11 @@ # Onnx Runtime native library is built against NDK API 21 by default # It is possible to build from source for Android API levels below 21, but it is not guaranteed -DEFAULT_ANDROID_MIN_SDK_VER = 24 +DEFAULT_ANDROID_MIN_SDK_VER = 21 # Android API 24 is the default target API version for Android builds, based on Microsoft 1CS requirements # It is possible to build from source using API level 21 and higher as the target SDK version -DEFAULT_ANDROID_TARGET_SDK_VER = 34 +DEFAULT_ANDROID_TARGET_SDK_VER = 24 def _parse_build_settings(args): diff --git a/tools/ci_build/github/android/default_full_aar_build_settings.json b/tools/ci_build/github/android/default_full_aar_build_settings.json index 1c7769c623d41..b0eff75812673 100644 --- a/tools/ci_build/github/android/default_full_aar_build_settings.json +++ b/tools/ci_build/github/android/default_full_aar_build_settings.json @@ -5,8 +5,8 @@ "x86", "x86_64" ], - "android_min_sdk_version": 24, - "android_target_sdk_version": 34, + "android_min_sdk_version": 21, + "android_target_sdk_version": 24, "build_params": [ "--enable_lto", "--android", diff --git a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml index 29c5f6bb34d7a..d8ea1c35c89c4 100644 --- a/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml +++ b/tools/ci_build/github/azure-pipelines/templates/react-native-ci.yml @@ -261,6 +261,8 @@ stages: publishJUnitResults: true testResultsFiles: '**/TEST-*.xml' testRunTitle: 'React Native Android Instrumented Test results' + javaHomeOption: 'path' + jdkDirectory: '$(JAVA_HOME_11_X64)' sonarQubeRunAnalysis: false spotBugsAnalysis: false displayName: Run React Native Android Instrumented Tests From 369d7bf8875d3104ac5e6fd2ab35774634209d3f Mon Sep 17 00:00:00 2001 From: Jian Chen Date: Thu, 21 Nov 2024 06:38:39 -0500 Subject: [PATCH 25/28] Update the Docker image version (#22907) ### Description ### Motivation and Context --- .../github/azure-pipelines/bigmodels-ci-pipeline.yml | 2 +- .../github/azure-pipelines/cuda-packaging-pipeline.yml | 1 - .../ci_build/github/azure-pipelines/linux-ci-pipeline.yml | 4 ++-- .../github/azure-pipelines/linux-gpu-ci-pipeline.yml | 4 ++-- .../azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml | 4 ++-- .../azure-pipelines/py-cuda-alt-package-test-pipeline.yml | 2 +- .../azure-pipelines/py-cuda-package-test-pipeline.yml | 2 +- .../azure-pipelines/stages/java-cuda-packaging-stage.yml | 4 ++-- .../stages/jobs/py-linux-cuda-package-test-job.yml | 4 ++-- .../azure-pipelines/stages/py-gpu-packaging-stage.yml | 4 ++-- .../github/azure-pipelines/templates/c-api-linux-cpu.yml | 7 ++----- .../templates/linux-cpu-packaging-pipeline.yml | 2 -- .../github/linux/docker/Dockerfile.manylinux2_28_cpu | 2 +- .../linux/docker/inference/aarch64/default/cpu/Dockerfile | 5 ++--- .../aarch64/default/cpu/scripts/install_centos.sh | 8 -------- .../inference/aarch64/default/cpu/scripts/install_deps.sh | 3 --- .../linux/docker/inference/aarch64/python/cpu/Dockerfile | 2 +- .../linux/docker/inference/x86_64/default/cpu/Dockerfile | 7 +++---- .../x86_64/default/cpu/scripts/install_centos.sh | 8 -------- .../inference/x86_64/default/cpu/scripts/install_deps.sh | 3 --- .../docker/inference/x86_64/default/cuda11/Dockerfile | 2 +- .../docker/inference/x86_64/default/cuda12/Dockerfile | 2 +- .../linux/docker/inference/x86_64/python/cpu/Dockerfile | 2 +- 23 files changed, 27 insertions(+), 57 deletions(-) delete mode 100755 tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh delete mode 100755 tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_centos.sh diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index e6d7a1fa1b1f7..75e8b1d7fcbc6 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -41,7 +41,7 @@ parameters: variables: - name: docker_base_image - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241111.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 - name: linux_trt_version value: 10.3.0.26-1.cuda11.8 - name: Repository diff --git a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml index 9b103715e734d..bc33aba57ec93 100644 --- a/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/cuda-packaging-pipeline.yml @@ -98,7 +98,6 @@ stages: jobs: - template: templates/c-api-linux-cpu.yml parameters: - BaseImage: 'registry.access.redhat.com/ubi8/ubi' OnnxruntimeArch: 'x64' OnnxruntimeNodejsBindingArch: 'x64' PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' diff --git a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml index 003c0967c63bf..2eb2839cdac02 100644 --- a/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-ci-pipeline.yml @@ -54,7 +54,7 @@ stages: parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile Context: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=registry.access.redhat.com/ubi8/ubi" + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecpubuildcentos8x64 - template: templates/linux-build-step-with-cache.yml @@ -149,7 +149,7 @@ stages: parameters: Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu Context: tools/ci_build/github/linux/docker/ - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=registry.access.redhat.com/ubi8/ubi" + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecpubuild - task: PythonScript@0 diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml index 9860d83f2a45d..7bb1deb60c6ba 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-ci-pipeline.yml @@ -49,9 +49,9 @@ parameters: variables: - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241111.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241111.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 - name: Repository ${{ if eq(parameters.CudaVersion, '11.8') }}: diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml index 2c51210a5b5cd..9d60c9ea17cd8 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-ci-pipeline.yml @@ -40,9 +40,9 @@ variables: - template: templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241111.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241111.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml index 79254e2d6362a..9296928ad97e0 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-alt-package-test-pipeline.yml @@ -18,7 +18,7 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241111.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 trt_version: '10.6.0.26-1.cuda11.8' cuda_version: '11.8' diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml index ed350a40a2ced..307415b7be16f 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-package-test-pipeline.yml @@ -18,6 +18,6 @@ stages: machine_pool: 'Onnxruntime-Linux-GPU' python_wheel_suffix: '_gpu' timeout: 480 - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241111.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 cuda_version: '12.2' diff --git a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml index 3ed24c6365ba0..716383fd61dbb 100644 --- a/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/java-cuda-packaging-stage.yml @@ -148,9 +148,9 @@ stages: value: false - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241111.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241111.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 timeoutInMinutes: 60 steps: diff --git a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml index a81787ddeb967..47092393e0039 100644 --- a/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml +++ b/tools/ci_build/github/azure-pipelines/stages/jobs/py-linux-cuda-package-test-job.yml @@ -45,9 +45,9 @@ jobs: - template: ../../templates/common-variables.yml - name: docker_base_image ${{ if eq(parameters.CudaVersion, '11.8') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241111.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 ${{ if eq(parameters.CudaVersion, '12.2') }}: - value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241111.1 + value: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 - name: linux_trt_version ${{ if eq(parameters.CudaVersion, '11.8') }}: value: ${{ variables.linux_trt_version_cuda11 }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml index 9056731f22049..947e4f99b984f 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-gpu-packaging-stage.yml @@ -68,9 +68,9 @@ stages: cmake_build_type: ${{ parameters.cmake_build_type }} cuda_version: ${{ parameters.cuda_version }} ${{ if eq(parameters.cuda_version, '11.8') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241111.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11:20241120.3 ${{ if eq(parameters.cuda_version, '12.2') }}: - docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241111.1 + docker_base_image: onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12:20241120.3 - ${{ if eq(parameters.enable_windows_dml, true) }}: - ${{ each python_version in parameters.PythonVersions }}: diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml index c5bd4b93db947..d3b3315ebb04c 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-linux-cpu.yml @@ -5,9 +5,6 @@ parameters: type: string default: '' -- name: BaseImage - type: string - - name: OnnxruntimeArch type: string @@ -50,7 +47,7 @@ jobs: parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile Context: tools/ci_build/github/linux/docker/inference/x86_64/default/cpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{parameters.BaseImage}}" + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}}_packaging - ${{ if eq(parameters.OnnxruntimeArch, 'aarch64') }}: @@ -58,7 +55,7 @@ jobs: parameters: Dockerfile: tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile Context: tools/ci_build/github/linux/docker/inference/aarch64/default/cpu - DockerBuildArgs: "--build-arg BUILD_UID=$( id -u ) --build-arg BASEIMAGE=${{parameters.BaseImage}}" + DockerBuildArgs: "--build-arg BUILD_UID=$( id -u )" Repository: onnxruntimecpubuildcentos8${{parameters.OnnxruntimeArch}}_packaging UpdateDepsTxt: false diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml index 8972d55f6e190..7ac2e3a8addb6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-cpu-packaging-pipeline.yml @@ -29,7 +29,6 @@ stages: - template: c-api-linux-cpu.yml parameters: AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} - BaseImage: 'registry.access.redhat.com/ubi8/ubi' OnnxruntimeArch: 'x64' OnnxruntimeNodejsBindingArch: 'x64' PoolName: 'onnxruntime-Ubuntu2204-AMD-CPU' @@ -40,7 +39,6 @@ stages: - template: c-api-linux-cpu.yml parameters: AdditionalBuildFlags: ${{ parameters.AdditionalBuildFlags }} - BaseImage: 'arm64v8/almalinux:8' OnnxruntimeArch: 'aarch64' OnnxruntimeNodejsBindingArch: 'arm64' PoolName: 'onnxruntime-linux-ARM64-CPU-2019' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu index f2cfdefb6a376..d2d3aa1675c2e 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu +++ b/tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cpu @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20241111.1 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20241120.3 ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile index ca00050121d67..246ef09f7be25 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/Dockerfile @@ -2,15 +2,14 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=arm64v8/almalinux:8 -FROM $BASEIMAGE +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc12_dotnet:20241120.3 ENV PATH=/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 ADD scripts /tmp/scripts -RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh deleted file mode 100755 index 596a5ce436c57..0000000000000 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_centos.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -e -x - -os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) - -echo "installing for CentOS version : $os_major_version" -dnf install -y python3.12-pip python3.12-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel java-11-openjdk-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran -locale diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh index bf08a853fe7f4..70bb373efb23f 100755 --- a/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/aarch64/default/cpu/scripts/install_deps.sh @@ -39,9 +39,6 @@ mkdir -p /tmp/src cd /tmp/src CPU_ARCH=$(uname -m) -echo "Installing cmake" -GetFile "https://github.com/Kitware/CMake/releases/download/v3.31.0-rc2/cmake-3.31.0-rc2-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz" -tar -zxf /tmp/src/cmake.tar.gz --strip=1 -C /usr echo "Installing Ninja" GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz diff --git a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile index e69ee81c59eee..43dd3badef387 100644 --- a/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/aarch64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc12:20241111.1 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_aarch64_ubi8_gcc12:20241120.3 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile index 2985d6139b29b..fffe92d2583a2 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/Dockerfile @@ -2,15 +2,14 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -ARG BASEIMAGE=amd64/almalinux:8 -FROM $BASEIMAGE +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12_dotnet:20241120.3 ENV PATH=/usr/lib/jvm/msopenjdk-17/bin:/opt/rh/gcc-toolset-12/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin ENV LANG=en_US.UTF-8 ENV LC_ALL=en_US.UTF-8 -ENV JAVA_HOME=/usr/lib/jvm/msopenjdk-17 + ADD scripts /tmp/scripts -RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_deps.sh && rm -rf /tmp/scripts ARG BUILD_UID=1001 ARG BUILD_USER=onnxruntimedev diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_centos.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_centos.sh deleted file mode 100755 index 015cad4045fd2..0000000000000 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_centos.sh +++ /dev/null @@ -1,8 +0,0 @@ -#!/bin/bash -set -e -x -if [ ! -f /etc/yum.repos.d/microsoft-prod.repo ]; then - os_major_version=$(tr -dc '0-9.' < /etc/redhat-release |cut -d \. -f1) - echo "installing for CentOS version : $os_major_version" -fi -dnf install -y python3.12-pip python3.12-devel glibc-langpack-\* glibc-locale-source which redhat-lsb-core expat-devel tar unzip zlib-devel make bzip2 bzip2-devel graphviz gcc-toolset-12-binutils gcc-toolset-12-gcc gcc-toolset-12-gcc-c++ gcc-toolset-12-gcc-gfortran gcc-toolset-12-libasan-devel libasan.x86_64 -locale diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh index 0cc48a720b8f4..be906bf21a4fb 100755 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cpu/scripts/install_deps.sh @@ -38,9 +38,6 @@ mkdir -p /tmp/src cd /tmp/src CPU_ARCH=$(uname -m) -echo "Installing cmake" -GetFile "https://github.com/Kitware/CMake/releases/download/v3.31.0-rc2/cmake-3.31.0-rc2-linux-$CPU_ARCH.tar.gz" "/tmp/src/cmake.tar.gz" -tar -zxf /tmp/src/cmake.tar.gz --strip=1 -C /usr echo "Installing Ninja" GetFile https://github.com/ninja-build/ninja/archive/v1.10.0.tar.gz /tmp/src/ninja-linux.tar.gz diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile index 4a3420b7781fa..d386db7ab7bd8 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda11/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20241111.1 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda11_x64_almalinux8_gcc11_dotnet:20241120.3 ARG TRT_VERSION #Install TensorRT only if TRT_VERSION is not empty diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile index b862f12f99d0e..ba6f28be4636c 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/default/cuda12/Dockerfile @@ -2,7 +2,7 @@ # Licensed under the MIT License. # This file is used by Zip-Nuget Packaging NoContribOps Pipeline,Zip-Nuget-Java Packaging Pipeline -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20241111.1 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cuda12_x64_ubi8_gcc12_dotnet:20241120.3 ARG TRT_VERSION #Install TensorRT only if TRT_VERSION is not empty diff --git a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile index 987bd83376c61..857fc445ef74a 100644 --- a/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile +++ b/tools/ci_build/github/linux/docker/inference/x86_64/python/cpu/Dockerfile @@ -1,4 +1,4 @@ -FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20241111.1 +FROM onnxruntimebuildcache.azurecr.io/internal/azureml/onnxruntime/build/cpu_x64_ubi8_gcc12:20241120.3 ADD scripts /tmp/scripts RUN cd /tmp/scripts && /tmp/scripts/install_centos.sh && rm -rf /tmp/scripts From 1e605be166a6855cb37b5df61dd0bc26de6326c4 Mon Sep 17 00:00:00 2001 From: kailums <109063327+kailums@users.noreply.github.com> Date: Thu, 21 Nov 2024 07:25:01 -0800 Subject: [PATCH 26/28] bigmodel pipeline update cp38 to cp310 (#22793) ### Description when updating from cp38 to cp310, there has some issues for bigmodel pipeine. there are two jobs failed: stable_diffusion and whisper. 1. for stable_diffusion, we are now using "nvcr.io/nvidia/pytorch:22.11-py3" from nvidia repo. it is for cuda11 and python3.8. and they are not providing python3.10 version for cuda 11. the latest version of this docker image is for cuda12 and python3.10. To solve this problem, i use a docker image of ubuntu22.04, and then install all need python package for this job. 2. for whisper. the original docker image is ubuntu20.04 which doesn't have python3.10, and has to update to ubuntu22.04. --- .../requirements/requirements.txt | 4 ++ .../stable_diffusion/test/requirements.txt | 1 + .../azure-pipelines/bigmodels-ci-pipeline.yml | 27 +++++--- ...rfile.package_ubi8_cuda_tensorrt10_0_torch | 2 +- ...Dockerfile.package_ubuntu_2204_gpu_ffmpeg} | 2 +- .../Dockerfile.package_ubuntu_2204_gpu_opencv | 64 +++++++++++++++++++ 6 files changed, 89 insertions(+), 11 deletions(-) rename tools/ci_build/github/linux/docker/{Dockerfile.package_ubuntu_2004_gpu_ffmpeg => Dockerfile.package_ubuntu_2204_gpu_ffmpeg} (97%) create mode 100644 tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt index 8ff5990b7815a..5bdd422a11750 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements/requirements.txt @@ -10,6 +10,10 @@ packaging protobuf==3.20.3 psutil sympy +nvtx==0.2.5 +torchvision==0.15.2 +tensorrt==8.5.1.7 +mediapipe controlnet_aux==0.0.9 # The following are for SDXL optimum==1.20.0 diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/requirements.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/requirements.txt index e51ffb395c643..1938f59208ae7 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/test/requirements.txt +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/test/requirements.txt @@ -2,3 +2,4 @@ git+https://github.com/openai/CLIP.git open_clip_torch sentence_transformers pillow +numpy==1.22.2 diff --git a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml index 75e8b1d7fcbc6..aca06c320d1d3 100644 --- a/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/bigmodels-ci-pipeline.yml @@ -115,7 +115,7 @@ stages: set -ex; \ env; \ ccache -s; \ - /opt/python/cp38-cp38/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ + /opt/python/cp310-cp310/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build --cmake_generator Ninja \ --config Release --update --build \ --skip_submodule_sync \ @@ -180,6 +180,17 @@ stages: TargetPath: '$(Build.BinariesDirectory)/Release' SpecificArtifact: ${{ parameters.specificArtifact }} BuildId: ${{ parameters.BuildId }} + - template: templates/get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv + Context: tools/ci_build/github/linux/docker/ + ScriptName: tools/ci_build/get_docker_image.py + DockerBuildArgs: " + --build-arg BUILD_UID=$( id -u ) + " + Repository: onnxruntimeubuntupackagestest_cuda11 + UseImageCacheContainerRegistry: false + UpdateDepsTxt: false - task: Cache@2 inputs: @@ -196,18 +207,15 @@ stages: -v $(Build.BinariesDirectory)/Release:/Release \ -v $(STABLE_DIFFUSION_MODEL_CACHE):/model_cache:rw \ -v $(GenerateImage_DIR):/images:rw \ - nvcr.io/nvidia/pytorch:22.11-py3 \ + onnxruntimeubuntupackagestest_cuda11 \ bash -c ' \ set -ex; \ - pip uninstall -y $(pip list --format=freeze | grep opencv); \ - rm -rf /usr/local/lib/python3.8/dist-packages/cv2/; \ - apt-get update; \ - DEBIAN_FRONTEND="noninteractive" apt-get install --yes python3-opencv; \ python3 --version; \ python3 -m pip install --upgrade pip; \ python3 -m pip install /Release/*.whl; \ pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion; \ python3 -m pip install -r requirements/cuda11/requirements.txt; \ + python3 -m pip install numpy==1.22.2; \ python3 -m pip install --upgrade polygraphy onnx-graphsurgeon ; \ echo Generate an image guided by a text prompt; \ python3 demo_txt2img.py --framework-model-dir /model_cache --seed 1 --deterministic "astronaut riding a horse on mars" ; \ @@ -238,7 +246,7 @@ stages: - script: | docker run --rm --gpus all -v $PWD:/workspace \ -v $(CLIP_MODEL_CACHE):/model_cache:rw \ - nvcr.io/nvidia/pytorch:22.11-py3 \ + onnxruntimeubuntupackagestest_cuda11 \ bash -c ' set -x; \ python3 --version; \ @@ -265,7 +273,7 @@ stages: - script: | docker run --rm --gpus all -v $PWD:/workspace \ -v $(CLIP_MODEL_CACHE):/model_cache:rw \ - nvcr.io/nvidia/pytorch:22.11-py3 \ + onnxruntimeubuntupackagestest_cuda11 \ bash -c ' set -ex; \ python3 --version; \ @@ -273,6 +281,7 @@ stages: pushd /workspace/onnxruntime/python/tools/transformers/models/stable_diffusion/; \ image2=$(find $(pwd) -name "astronaut_riding_a_h*.png") ; \ pushd test; \ + python3 -m pip install numpy==1.22.2; \ python3 -m pip install -r requirements.txt; \ echo check demo_txt2image.py generate image; \ python3 -u check_image.py --image1 astronaut_riding_txt2image-DDIM-50.png --image2 $image2 --cache_dir /model_cache ; \ @@ -438,7 +447,7 @@ stages: - template: templates/get-docker-image-steps.yml parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg Context: tools/ci_build/github/linux/docker/ ScriptName: tools/ci_build/get_docker_image.py DockerBuildArgs: '--build-arg BUILD_UID=$( id -u )' diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch index e1203f55106ce..2ecc6d1918b1a 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubi8_cuda_tensorrt10_0_torch @@ -9,7 +9,7 @@ ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubi8 ARG TRT_VERSION=10.6.0.26-1.cuda11.8 FROM $BASEIMAGE AS base ARG TRT_VERSION -ENV PATH=/opt/python/cp38-cp38/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} +ENV PATH=/opt/python/cp310-cp310/bin:/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} RUN dnf install -y bash wget &&\ dnf clean dbcache diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg similarity index 97% rename from tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg rename to tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg index 6ce5a59802641..4298dd53e4c66 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2004_gpu_ffmpeg +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_ffmpeg @@ -5,7 +5,7 @@ # Dockerfile to run ONNXRuntime with TensorRT integration # Build base image with required system packages -ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu20.04 +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 ARG TRT_VERSION=10.6.0.26-1+cuda11.8 ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 FROM $BASEIMAGE AS base diff --git a/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv new file mode 100644 index 0000000000000..1312475ceca3a --- /dev/null +++ b/tools/ci_build/github/linux/docker/Dockerfile.package_ubuntu_2204_gpu_opencv @@ -0,0 +1,64 @@ +# -------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. +# -------------------------------------------------------------- +# Dockerfile to run ONNXRuntime with TensorRT integration + +# Build base image with required system packages +ARG BASEIMAGE=nvidia/cuda:11.8.0-cudnn8-devel-ubuntu22.04 +ARG TRT_VERSION=10.6.0.26-1+cuda11.8 +ARG LD_LIBRARY_PATH_ARG=/usr/local/lib64:/usr/local/cuda/lib64 +FROM $BASEIMAGE AS base +ARG TRT_VERSION +ENV PATH=/usr/local/nvidia/bin:/usr/local/cuda/bin:/usr/src/tensorrt/bin:${PATH} +ENV DEBIAN_FRONTEND=noninteractive + +ENV LD_LIBRARY_PATH=${LD_LIBRARY_PATH_ARG}:${LD_LIBRARY_PATH} + +RUN apt-get update &&\ + apt-get install -y git bash wget diffutils + +RUN DEBIAN_FRONTEND="noninteractive" apt-get install --yes python3-opencv + +# Install python3 +RUN apt-get install -y --no-install-recommends \ + python3 \ + python3-pip \ + python3-dev \ + python3-wheel + +RUN pip install --upgrade pip + +# Install TensorRT +RUN apt-key adv --fetch-keys https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64/7fa2af80.pub &&\ + apt-get update &&\ + apt-get install -y \ + libnvinfer-dev=${TRT_VERSION} \ + libnvinfer-dispatch-dev=${TRT_VERSION} \ + libnvinfer-dispatch10=${TRT_VERSION} \ + libnvinfer-headers-dev=${TRT_VERSION} \ + libnvinfer-headers-plugin-dev=${TRT_VERSION} \ + libnvinfer-lean-dev=${TRT_VERSION} \ + libnvinfer-lean10=${TRT_VERSION} \ + libnvinfer-plugin-dev=${TRT_VERSION} \ + libnvinfer-plugin10=${TRT_VERSION} \ + libnvinfer-vc-plugin-dev=${TRT_VERSION} \ + libnvinfer-vc-plugin10=${TRT_VERSION} \ + libnvinfer10=${TRT_VERSION} \ + libnvonnxparsers-dev=${TRT_VERSION} \ + libnvonnxparsers10=${TRT_VERSION} \ + tensorrt-dev=${TRT_VERSION} \ + libnvinfer-bin=${TRT_VERSION} &&\ + if [ $(echo $CUDA_VERSION | cut -d"." -f1) -ge 12 ]; then apt-get install -y cudnn9-cuda-12 ; fi +# ^^^^^^^^^^^If cuda version is 12 or higher, install cudnn 9 for cuda 12 + +ADD scripts /tmp/scripts +RUN cd /tmp/scripts && /tmp/scripts/install_dotnet.sh && rm -rf /tmp/scripts + +# Build final image from base. +FROM base as final +ARG BUILD_USER=onnxruntimedev +ARG BUILD_UID=1000 +RUN adduser --uid $BUILD_UID $BUILD_USER +WORKDIR /home/$BUILD_USER +USER $BUILD_USER From 55f0559e5d493a6ac8208b588c42aff583f7d714 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 21 Nov 2024 09:42:41 -0800 Subject: [PATCH 27/28] Update attention fusion to support SDPA pattern (#22629) ### Description Match new SDPA pattern for huggingface BERT model that exported from latest transformers package. Some changes of transformers tests in CI pipeline: (1) Enable tests for bert, distilbert and roberta models in CI. (2) Remove out-of-date tests for huggingface models that were marked as slow and not enabled in CI pipeline. (3) Upgrade transformers package version to the latest. ### Motivation and Context Recent huggingface transformers use torch SDPA in bert modeling. The graph pattern change causes attention fusion not working anymore. Update the fusion script to match the new pattern. --- .../tools/transformers/bert_test_data.py | 2 +- .../transformers/compare_bert_results.py | 2 + .../tools/transformers/fusion_attention.py | 270 +++++++++--------- .../transformers/fusion_attention_clip.py | 4 +- .../transformers/fusion_bart_attention.py | 43 +-- .../fusion_conformer_attention.py | 18 +- .../tools/transformers/onnx_exporter.py | 8 +- .../transformers/onnx_model_bert_keras.py | 23 +- .../tools/transformers/onnx_model_bert_tf.py | 23 +- .../python/transformers/test_optimizer.py | 247 +--------------- .../test_optimizer_huggingface_bert.py | 151 ++++++++++ .../python/transformers/test_parity_moe.py | 12 +- .../transformers-test/requirements.txt | 2 +- 13 files changed, 355 insertions(+), 450 deletions(-) create mode 100644 onnxruntime/test/python/transformers/test_optimizer_huggingface_bert.py diff --git a/onnxruntime/python/tools/transformers/bert_test_data.py b/onnxruntime/python/tools/transformers/bert_test_data.py index 167fc8697ce06..ccf2497d61342 100644 --- a/onnxruntime/python/tools/transformers/bert_test_data.py +++ b/onnxruntime/python/tools/transformers/bert_test_data.py @@ -250,6 +250,7 @@ def generate_test_data( average_sequence_length: int, random_sequence_length: bool, mask_type: int, + dictionary_size: int = 10000, ): """Create given number of input data for testing @@ -270,7 +271,6 @@ def generate_test_data( List[Dict[str,numpy.ndarray]]: list of test cases, where each test case is a dictionary with input name as key and a tensor as value """ - dictionary_size = 10000 all_inputs = fake_test_data( batch_size, sequence_length, diff --git a/onnxruntime/python/tools/transformers/compare_bert_results.py b/onnxruntime/python/tools/transformers/compare_bert_results.py index 0c5125e74c8a4..03bcc20d9a5de 100644 --- a/onnxruntime/python/tools/transformers/compare_bert_results.py +++ b/onnxruntime/python/tools/transformers/compare_bert_results.py @@ -85,6 +85,7 @@ def run_test( segment_ids_name, input_mask_name, mask_type, + dictionary_size: int = 1024, ): # Try deduce input names from optimized model. input_ids, segment_ids, input_mask = get_bert_inputs( @@ -105,6 +106,7 @@ def run_test( average_sequence_length, True, # random sequence length mask_type, + dictionary_size=dictionary_size, ) baseline_results, baseline_latency, output_names = run_model( diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index a9ff623fb6967..030708783bb61 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -42,26 +42,26 @@ def get_first_mask(self): assert len(self.mask_indice) > 0 return next(iter(self.mask_indice)) - def process_mask(self, input: str) -> str: + def process_mask(self, mask_2d: str) -> Optional[str]: if self.mask_format == AttentionMaskFormat.NoMask: return None - if input in self.mask_indice: - return self.mask_indice[input] + if mask_2d in self.mask_indice: + return self.mask_indice[mask_2d] # Add cast to convert int64 to int32 - if self.model.find_graph_input(input): - casted, input_name = self.utils.cast_graph_input_to_int32(input) + if self.model.find_graph_input(mask_2d): + casted, input_name = self.utils.cast_graph_input_to_int32(mask_2d) else: - input_name, cast_node = self.utils.cast_input_to_int32(input) + input_name, _cast_node = self.utils.cast_input_to_int32(mask_2d) casted = True if casted: - self.mask_casted[input] = input_name + self.mask_casted[mask_2d] = input_name # Attention supports int32 attention mask (2D) since 1.4.0 if self.mask_format == AttentionMaskFormat.AttentionMask: - self.mask_indice[input] = input_name + self.mask_indice[mask_2d] = input_name return input_name # Add a mask processing node to convert attention mask to mask index (1D) @@ -97,7 +97,7 @@ def process_mask(self, input: str) -> str: self.model.add_node(mask_index_node) - self.mask_indice[input] = output_name + self.mask_indice[mask_2d] = output_name return output_name @@ -173,17 +173,20 @@ def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int] Tuple[int, int]: num_heads and hidden_size """ # we assume that reshape fusion has done, so the shape is a tensor like [0, 0, num_heads, head_size] - q_shape = self.model.get_initializer(reshape_q.input[1]) - if q_shape is None: + q_shape_value = self.model.get_constant_value(reshape_q.input[1]) + if q_shape_value is None: concat = self.model.get_parent(reshape_q, 1) if concat is not None and concat.op_type == "Concat": return self.get_num_heads_and_hidden_size_from_concat(concat) - logger.debug(f"{reshape_q.input[1]} is not initializer.") + logger.debug("%s is not initializer.", reshape_q.input[1]) return self.num_heads, self.hidden_size # Fall back to user specified value - q_shape_value = NumpyHelper.to_array(q_shape) - if len(q_shape_value) != 4 or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0): - logger.debug(f"q_shape_value={q_shape_value}. Expected value are like [0, 0, num_heads, head_size].") + if ( + (not isinstance(q_shape_value, np.ndarray)) + or len(q_shape_value) != 4 + or (q_shape_value[2] <= 0 or q_shape_value[3] <= 0) + ): + logger.debug("q_shape_value=%s. Expected value are like [0, 0, num_heads, head_size].", q_shape_value) return self.num_heads, self.hidden_size # Fall back to user specified value num_heads = q_shape_value[2] @@ -192,13 +195,15 @@ def get_num_heads_and_hidden_size(self, reshape_q: NodeProto) -> Tuple[int, int] if self.num_heads > 0 and num_heads != self.num_heads: if self.num_heads_warning: - logger.warning(f"--num_heads is {self.num_heads}. Detected value is {num_heads}. Using detected value.") + logger.warning( + "--num_heads is %d. Detected value is %d. Using detected value.", self.num_heads, num_heads + ) self.num_heads_warning = False # Do not show the warning more than once if self.hidden_size > 0 and hidden_size != self.hidden_size: if self.hidden_size_warning: logger.warning( - f"--hidden_size is {self.hidden_size}. Detected value is {hidden_size}. Using detected value." + "--hidden_size is %d. Detected value is %d. Using detected value.", self.hidden_size, hidden_size ) self.hidden_size_warning = False # Do not show the warning more than once @@ -216,11 +221,11 @@ def get_add_qk_str(self, add_qk: NodeProto): input_1_shape = self.shape_infer.get_edge_shape(add_qk.input[1]) if input_0_shape is None or input_1_shape is None: - logger.debug(f"one of the inputs of {add_qk} is None") + logger.debug("one of the inputs of %s is None", add_qk) return None if input_0_shape != input_1_shape: - logger.debug(f"the shape of two inputs of {add_qk} is not same") + logger.debug("the shape of two inputs of %s is not same", add_qk) return None return add_qk.input[1] @@ -305,55 +310,6 @@ def concat_kv(self, past_k: str, past_v: str) -> str: return kv_output_name - def reshape_kv(self, past_k: str, past_v: str) -> (str, str): - """Reshape past_k and past_v from 4D to 3D to use as inputs for multihead attention node. - - Args: - past_k (str): name of past K value of shape 4D - past_v (str): name of past V value of shape 4D - - Returns: - k_3d (str): name of past K value of shape 3D - v_3d (str): name of past V value of shape 3D - """ - # Reshape past_k and past_v from (B,N,P,H) to (B,P,N*H) - # B = batch size, N = num heads, P = past seq len, H = head size - - # Create initializer for reshaping past_k and past_v - new_dims_name = "kv_4d_to_3d" - new_dims = self.model.get_initializer(new_dims_name) - if new_dims is None: - new_dims = numpy_helper.from_array( - np.array([0, -1, self.model.hidden_size], dtype="int64"), name=new_dims_name - ) - self.model.add_initializer(new_dims, self.this_graph_name) - - reshape_k_name = self.model.create_node_name("Reshape") - reshape_v_name = self.model.create_node_name("Reshape") - k_3d_name = (past_k + "_3d").replace(".", "_") - v_3d_name = (past_v + "_3d").replace(".", "_") - - k_3d = helper.make_node( - "Reshape", - inputs=[past_k, new_dims_name], - outputs=[k_3d_name], - name=reshape_k_name, - ) - v_3d = helper.make_node( - "Reshape", - inputs=[past_v, new_dims_name], - outputs=[v_3d_name], - name=reshape_v_name, - ) - - # Add reshape nodes to graph - self.nodes_to_add.append(k_3d) - self.nodes_to_add.append(v_3d) - self.node_name_to_graph_name[reshape_k_name] = self.this_graph_name - self.node_name_to_graph_name[reshape_v_name] = self.this_graph_name - - return k_3d_name, v_3d_name - def split_kv(self, present_k_name: str, present_v_name: str, kv_node: str): """Split kv_node containing present KV values into separate present K and present V values. @@ -476,8 +432,7 @@ def create_packed_qkv_matmul_node( q_add: NodeProto, k_add: Union[NodeProto, None], v_add: Union[NodeProto, None], - num_heads: int, - ) -> Union[NodeProto, None]: + ) -> Tuple[NodeProto, NodeProto, NodeProto]: """Create packed QKV MatMul node before MultiHeadAttention node. This is for the scenario where an Attention node should be created but cannot be created because past_key and past_value are separate inputs and not one concatenated input. @@ -489,10 +444,11 @@ def create_packed_qkv_matmul_node( q_add (NodeProto): name of Add from Q path k_add (NodeProto): name of Add from K path v_add (NodeProto): name of Add from V path - num_heads (int): number of heads Returns: - Union[NodeProto, None]: the node created or None if failed. + q_output (NodeProto): Slice node for Q + k_output (NodeProto): Slice node for K + v_output (NodeProto): Slice node for V """ matmul_node_name = self.model.create_node_name("MatMul") @@ -611,6 +567,7 @@ def create_packed_qkv_matmul_node( self.nodes_to_add.extend(qkv_nodes) return q_output, k_output, v_output + # This function is used in child classes for bart or conformer model. def create_multihead_attention_node( self, q_matmul: NodeProto, @@ -659,7 +616,7 @@ def create_multihead_attention_node( assert num_heads > 0 if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}") + logger.debug("input hidden size %d is not a multiple of num of heads %d", hidden_size, num_heads) return None graph_input_names = set([node.name for node in self.model.graph().input]) @@ -669,17 +626,22 @@ def create_multihead_attention_node( mha_inputs = [] if packed_qkv: q_slice, k_slice, v_slice = self.create_packed_qkv_matmul_node( - q_matmul, k_matmul, v_matmul, q_add, k_add, v_add, num_heads + q_matmul, + k_matmul, + v_matmul, + q_add, + k_add, + v_add, ) mha_inputs.extend([q_slice.output[0], k_slice.output[0], v_slice.output[0]]) - elif type(k_matmul) is NodeProto and type(v_matmul) is NodeProto: + elif isinstance(k_matmul, NodeProto) and isinstance(v_matmul, NodeProto): if self.disable_multi_head_attention_bias: mha_inputs.extend([q_add.output[0], k_matmul.output[0], v_add.output[0]]) else: mha_inputs.extend([q_matmul.output[0], k_matmul.output[0], v_matmul.output[0]]) elif ( - type(k_matmul) == str # noqa: E721 - and type(v_matmul) == str # noqa: E721 + isinstance(k_matmul, str) + and isinstance(v_matmul, str) and k_matmul in graph_input_names and v_matmul in graph_input_names ): @@ -724,7 +686,7 @@ def create_multihead_attention_node( def create_attention_node( self, - mask_index: str, + mask_index: Optional[str], q_matmul: NodeProto, k_matmul: NodeProto, v_matmul: NodeProto, @@ -733,7 +695,7 @@ def create_attention_node( v_add: NodeProto, num_heads: int, hidden_size: int, - input: str, + first_input: str, output: str, add_qk_str: str = "", past_k: str = "", @@ -746,7 +708,7 @@ def create_attention_node( """Create an Attention node. Args: - mask_index (str): mask input + mask_index (str | None): mask input q_matmul (NodeProto): MatMul node in fully connection for Q k_matmul (NodeProto): MatMul node in fully connection for K v_matmul (NodeProto): MatMul node in fully connection for V @@ -755,7 +717,7 @@ def create_attention_node( v_add (NodeProto): Add bias node in fully connection for V num_heads (int): number of attention heads. If a model is pruned, it is the number of heads after pruning. hidden_size (int): hidden dimension. If a model is pruned, it is the hidden dimension after pruning. - input (str): input name + first_input (str): first input name output (str): output name add_qk_str (str): name of Add node after Q x K' past_k (str): name of input for past K value @@ -771,7 +733,7 @@ def create_attention_node( assert num_heads > 0 if hidden_size > 0 and (hidden_size % num_heads) != 0: - logger.debug(f"input hidden size {hidden_size} is not a multiple of num of heads {num_heads}") + logger.debug("input hidden size %d is not a multiple of num of heads %d", hidden_size, num_heads) return None has_bias = True @@ -813,8 +775,10 @@ def create_attention_node( if hidden_size > 0 and hidden_size != qw_in_size: logger.warning( - f"Input hidden size ({hidden_size}) is not same as weight matrix dimension of q,k,v ({qw_in_size}). " - "Please provide a correct input hidden size or pass in 0" + "Input hidden size (%d) is not same as weight matrix dimension of q,k,v (%d). " + "Please provide a correct input hidden size or pass in 0", + hidden_size, + qw_in_size, ) is_qkv_diff_dims = False @@ -836,6 +800,8 @@ def create_attention_node( qkv_weight = np.stack((qw, kw, vw), axis=1) qkv_weight_dim = 3 * qw_out_size + qkv_bias_dim = 0 + qkv_bias: Optional[np.ndarray] = None if has_bias: qb = NumpyHelper.to_array(q_bias) kb = NumpyHelper.to_array(k_bias) @@ -861,7 +827,7 @@ def create_attention_node( self.add_initializer( name=attention_node_name + "_qkv_weight", data_type=q_weight.data_type, - dims=[qw_in_size, qkv_weight_dim], + dims=[qw_in_size, int(qkv_weight_dim)], vals=qkv_weight, ) @@ -869,7 +835,7 @@ def create_attention_node( self.add_initializer( name=attention_node_name + "_qkv_bias", data_type=q_bias.data_type, - dims=[qkv_bias_dim], + dims=[int(qkv_bias_dim)], vals=qkv_bias, ) @@ -897,7 +863,7 @@ def create_attention_node( ) else: attention_inputs = [ - input, + first_input, attention_node_name + "_qkv_weight", attention_node_name + "_qkv_bias" if has_bias else "", ] @@ -911,7 +877,7 @@ def create_attention_node( past_kv = self.concat_kv(past_k, past_v) attention_inputs.append(past_kv) - if add_qk_str is not None: + if add_qk_str: mask_output_name = self.reshape_add_qk(add_qk_str) # Add attention mask to attention node @@ -951,9 +917,10 @@ def create_attention_node( return attention_node - def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): + def fuse(self, node, input_name_to_nodes, output_name_to_node): # Sometimes we can not fuse skiplayernormalization since the add before layernorm has an output that used by nodes outside skiplayernorm # Conceptually we treat add before layernorm as skiplayernorm node since they share the same pattern + normalize_node = node start_node = normalize_node if normalize_node.op_type == "LayerNormalization": add_before_layernorm = self.model.match_parent(normalize_node, "Add", 0) @@ -982,25 +949,24 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): return other_inputs = [] - for _i, input in enumerate(start_node.input): - if input not in output_name_to_node: + for _i, node_input in enumerate(start_node.input): + if node_input not in output_name_to_node: continue - if input == qkv_nodes[0].output[0]: + if node_input == qkv_nodes[0].output[0]: continue - other_inputs.append(input) + other_inputs.append(node_input) if len(other_inputs) != 1: return root_input = other_inputs[0] - """ - Match flaubert Mask - | - Mul --> LayerNormalization --> Attention --> MatMul --> Add - | | - | | - +--------------------------------------------------------- - """ + + # Match flaubert Mask + # | + # Mul --> LayerNormalization --> Attention --> MatMul --> Add + # | | + # | | + # +--------------------------------------------------------- mul_before_layernorm = self.model.match_parent(start_node, "Mul", 0) if mul_before_layernorm is not None: mul_children = input_name_to_nodes[mul_before_layernorm.output[0]] @@ -1020,19 +986,15 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): if child.op_type == "LayerNormalization": root_input = child.output[0] - """ - When Add before the LayerNormalization produces an output - that is consumed by some other nodes other than the LayerNormalization itself, - fused SkipLayerNormalization will have several outputs. - In this case we need to pick the one used in Attention - - For example, this is the case for ViT - - SkipLayerNormalization --> Attention --> MatMul --> Add --> SkipLayerNormalization - | | - | | - +---------------------------------------------------------------------+ - """ + # When Add before the LayerNormalization produces an output + # that is consumed by some other nodes other than the LayerNormalization itself, + # fused SkipLayerNormalization will have several outputs. + # In this case we need to pick the one used in Attention + # For example, this is the case for ViT + # SkipLayerNormalization --> Attention --> MatMul --> Add --> SkipLayerNormalization + # | | + # | | + # +---------------------------------------------------------------------+ parent_node = output_name_to_node[root_input] if parent_node.op_type == "SkipLayerNormalization" and len(parent_node.output) == 4: root_input = parent_node.output[0] @@ -1051,12 +1013,14 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): is_distill = False is_distill_add = False is_no_mask_attention = False + is_sdpa = False qk_paths = { "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]), "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]), "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]), "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]), "path5": (["Softmax", "Div", "MatMul"], [0, 0, 0]), + "sdpa": (["Softmax", "Add", "MatMul", "Mul", "Sqrt"], [0, 0, None, 0, 1]), } qk_nodes = None @@ -1066,10 +1030,12 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): continue if k == "path3": is_distill = True - if k == "path4": + elif k == "path4": is_distill_add = True - if k == "path5": + elif k == "path5": is_no_mask_attention = True + elif k == "sdpa": + is_sdpa = True break if qk_nodes is None: @@ -1079,19 +1045,23 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): add_qk = None matmul_qk = None where_qk = None + after_q = None if is_distill: (_, where_qk, matmul_qk, _) = qk_nodes elif is_distill_add: (_, add_qk, where_qk, matmul_qk) = qk_nodes elif is_no_mask_attention: (_, _, matmul_qk) = qk_nodes + elif is_sdpa: + (_, add_qk, matmul_qk, after_q, _) = qk_nodes else: (_, add_qk, _, matmul_qk) = qk_nodes - q_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None]) + after_q = after_q or matmul_qk + q_nodes = self.model.match_parent_path(after_q, ["Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, None]) if q_nodes is None: q_nodes = self.model.match_parent_path( - matmul_qk, + after_q, ["Div", "Transpose", "Reshape", "Add", "MatMul"], [0, 0, 0, 0, None], ) @@ -1102,7 +1072,17 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): add_q = q_nodes[-2] matmul_q = q_nodes[-1] - k_nodes = self.model.match_parent_path(matmul_qk, ["Transpose", "Reshape", "Add", "MatMul"], [1, 0, 0, None]) + after_k = matmul_qk + if is_sdpa: + mul_k_nodes = self.model.match_parent_path(matmul_qk, ["Mul", "Sqrt"], [1, None]) + if mul_k_nodes is None: + logger.debug("fuse_attention: failed to match mul sqrt q path") + return + (after_k, _) = mul_k_nodes + + k_nodes = self.model.match_parent_path( + after_k, ["Transpose", "Reshape", "Add", "MatMul"], [0 if is_sdpa else 1, 0, 0, None] + ) if k_nodes is None: k_nodes = self.model.match_parent_path( matmul_qk, @@ -1117,7 +1097,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # Note that Cast might be removed by OnnxRuntime so we match two patterns here. mask_nodes = None - add_qk_str = None + add_qk_str = "" if is_distill: _, mask_nodes, _ = self.model.match_parent_paths( where_qk, @@ -1140,7 +1120,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): if add_qk is not None: add_qk_str = self.get_add_qk_str(add_qk) if add_qk_str is None: - logger.debug(f"fuse_attention: failed to verify shape inference of {add_qk}") + logger.debug("fuse_attention: failed to verify shape inference of %s", add_qk) return elif is_no_mask_attention: pass @@ -1148,11 +1128,11 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): _, mask_nodes, _ = self.model.match_parent_paths( add_qk, [ - ( - ["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], - [None, 0, 1, 0, 0], - ), + (["Mul", "Sub", "Cast", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0, 0]), (["Mul", "Sub", "Unsqueeze", "Unsqueeze"], [None, 0, 1, 0]), + # The following two patterns are for SDPA. + (["Where", "Cast", "Sub", "Expand", "Unsqueeze", "Unsqueeze"], [None, 0, 0, 1, 0, 0]), + (["Where", "Cast", "Sub", "Cast", "Expand", "Unsqueeze", "Unsqueeze"], [None, 0, 0, 1, 0, 0, 0]), ], output_name_to_node, ) @@ -1160,10 +1140,17 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): logger.debug("fuse_attention: failed to match mask path") return - if not is_no_mask_attention and len(mask_nodes) > 1 and mask_nodes[0].op_type == "Mul": + if not is_no_mask_attention and len(mask_nodes) > 1: _, mul_val = self.model.get_constant_input(mask_nodes[0]) - if mul_val != -10000: - self.mask_filter_value = mul_val + # The mask value shall be a float scalar (usually is the lowest float value). + if ( + (mul_val is None) + or not (isinstance(mul_val, np.ndarray) and mul_val.size == 1) + or (float(mul_val) >= 0) + ): + return + if float(mul_val) != -10000: + self.mask_filter_value = float(mul_val) if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input: mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) if not is_no_mask_attention else None @@ -1181,19 +1168,20 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # number of heads are same for all the paths, hence to create attention node, we pass the q_num_heads # the input_hidden_size represents the input hidden size, this is used as needed but hidden sizes for Q, K are extracted appropriately new_node = self.create_attention_node( - mask_index, - matmul_q, - matmul_k, - matmul_v, - add_q, - add_k, - add_v, - q_num_heads, - q_hidden_size, - root_input, - attention_last_node.output[0], - add_qk_str, + mask_index=mask_index, + q_matmul=matmul_q, + k_matmul=matmul_k, + v_matmul=matmul_v, + q_add=add_q, + k_add=add_k, + v_add=add_v, + num_heads=q_num_heads, + hidden_size=q_hidden_size, + first_input=root_input, + output=attention_last_node.output[0], + add_qk_str=add_qk_str, ) + if new_node is None: return @@ -1208,7 +1196,7 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): name="shape_modified_tensor" + unique_index, data_type=TensorProto.INT64, dims=[4], - vals=np.int64([0, 0, q_num_heads, int(q_hidden_size / q_num_heads)]), + vals=[0, 0, q_num_heads, int(q_hidden_size / q_num_heads)], raw=False, ) diff --git a/onnxruntime/python/tools/transformers/fusion_attention_clip.py b/onnxruntime/python/tools/transformers/fusion_attention_clip.py index b027957fcc725..16e2c36bfd092 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention_clip.py +++ b/onnxruntime/python/tools/transformers/fusion_attention_clip.py @@ -239,9 +239,9 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): v_add=add_v, num_heads=num_heads, hidden_size=hidden_size, - input=root_input, + first_input=root_input, output=attention_last_node.output[0], - add_qk_str=None, + add_qk_str="", scale=None, causal=(add_mask is not None), ) diff --git a/onnxruntime/python/tools/transformers/fusion_bart_attention.py b/onnxruntime/python/tools/transformers/fusion_bart_attention.py index ebecc1db24792..8c334b83abfeb 100644 --- a/onnxruntime/python/tools/transformers/fusion_bart_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_bart_attention.py @@ -564,15 +564,15 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # value whereas attention supports concatenated past key and past value. new_node = ( self.create_multihead_attention_node( - matmul_q, - matmul_k if decoder_cross_attention or decoder_attention_with_past else past_k, - matmul_v if decoder_cross_attention or decoder_attention_with_past else past_v, - add_q, - add_k if decoder_cross_attention or decoder_attention_with_past else None, - add_v if decoder_cross_attention or decoder_attention_with_past else None, - num_heads, - hidden_size, - attention_last_node.output[0], + q_matmul=matmul_q, + k_matmul=matmul_k if decoder_cross_attention or decoder_attention_with_past else past_k, + v_matmul=matmul_v if decoder_cross_attention or decoder_attention_with_past else past_v, + q_add=add_q, + k_add=add_k if decoder_cross_attention or decoder_attention_with_past else None, + v_add=add_v if decoder_cross_attention or decoder_attention_with_past else None, + num_heads=num_heads, + hidden_size=hidden_size, + output=attention_last_node.output[0], past_k=past_k if decoder_attention_with_past else "", past_v=past_v if decoder_attention_with_past else "", present_k=present_k, @@ -586,19 +586,20 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): # Temporarily set multihead attention flag to false use_multi_head_attention_ground_truth = self.use_multi_head_attention self.use_multi_head_attention = False + add_qk_str = mask_index if decoder_attention and mask_index else "" new_node = self.create_attention_node( - None, - matmul_q, - matmul_k, - matmul_v, - add_q, - add_k, - add_v, - num_heads, - hidden_size, - root_input, - attention_last_node.output[0], - add_qk_str=mask_index if decoder_attention else None, + mask_index=None, + q_matmul=matmul_q, + k_matmul=matmul_k, + v_matmul=matmul_v, + q_add=add_q, + k_add=add_k, + v_add=add_v, + num_heads=num_heads, + hidden_size=hidden_size, + first_input=root_input, + output=attention_last_node.output[0], + add_qk_str=add_qk_str, past_k=past_k, past_v=past_v, present_k=present_k, diff --git a/onnxruntime/python/tools/transformers/fusion_conformer_attention.py b/onnxruntime/python/tools/transformers/fusion_conformer_attention.py index 6bc681c57444e..f29d0a0ac9441 100644 --- a/onnxruntime/python/tools/transformers/fusion_conformer_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_conformer_attention.py @@ -102,15 +102,15 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): return new_node = self.create_multihead_attention_node( - matmul_q, - matmul_k, - matmul_v, - add_q, - add_k, - add_v, - num_heads, - hidden_size, - attention_last_node.output[0], + q_matmul=matmul_q, + k_matmul=matmul_k, + v_matmul=matmul_v, + q_add=add_q, + k_add=add_k, + v_add=add_v, + num_heads=num_heads, + hidden_size=hidden_size, + output=attention_last_node.output[0], add_qk=add_qk.input[1], past_k=past_k, past_v=past_v, diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 212a7c4871e6a..c3ccde50dac85 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -392,11 +392,13 @@ def validate_and_optimize_onnx( False, output_names, ) - if optimize_info == OptimizerInfo.NOOPT: + if optimize_info.name == OptimizerInfo.NOOPT.name: return onnx_model_path, is_valid_onnx_model, config.vocab_size if ( - optimize_info == OptimizerInfo.BYSCRIPT or precision == Precision.FLOAT16 or precision == Precision.INT8 + optimize_info.name == OptimizerInfo.BYSCRIPT.name + or precision == Precision.FLOAT16 + or precision == Precision.INT8 ): # Use script (optimizer.py) to optimize optimized_model_path = get_onnx_file_path( onnx_dir, @@ -439,7 +441,7 @@ def validate_and_optimize_onnx( QuantizeHelper.quantize_onnx_model(onnx_model_path, onnx_model_path, use_external_data_format) logger.info(f"Finished quantizing model: {onnx_model_path}") - if optimize_info == OptimizerInfo.BYORT: # Use OnnxRuntime to optimize + if optimize_info.name == OptimizerInfo.BYORT.name: # Use OnnxRuntime to optimize if is_valid_onnx_model: ort_model_path = add_filename_suffix(onnx_model_path, "_ort") optimize_onnx_model_by_ort( diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py index c781a91c9e493..efcd92129597a 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert_keras.py @@ -178,18 +178,17 @@ def fuse_attention(self): mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) logger.debug("Create an Attention node.") attention_node = self.attention_fusion.create_attention_node( - mask_index, - matmul_q, - matmul_k, - matmul_v, - add_q, - add_k, - add_v, - self.num_heads, - self.hidden_size, - parent.output[0], - reshape_qkv.output[0], - None, + mask_index=mask_index, + q_matmul=matmul_q, + k_matmul=matmul_k, + v_matmul=matmul_v, + q_add=add_q, + k_add=add_k, + v_add=add_v, + num_heads=self.num_heads, + hidden_size=self.hidden_size, + first_input=parent.output[0], + output=reshape_qkv.output[0], ) if attention_node is None: continue diff --git a/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py b/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py index b7891223e1dc2..a89b6c9e9395d 100644 --- a/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py +++ b/onnxruntime/python/tools/transformers/onnx_model_bert_tf.py @@ -480,18 +480,17 @@ def fuse_attention(self): # For tf models, q and v are flipped. attention_node = self.attention_fusion.create_attention_node( - mask_index, - matmul_k, - matmul_q, - matmul_v, - add_k, - add_q, - add_v, - self.num_heads, - self.hidden_size, - parent.output[0], - qkv_nodes[2].output[0], - None, + mask_index=mask_index, + q_matmul=matmul_k, + k_matmul=matmul_q, + v_matmul=matmul_v, + q_add=add_k, + k_add=add_q, + v_add=add_v, + num_heads=self.num_heads, + hidden_size=self.hidden_size, + first_input=parent.output[0], + output=qkv_nodes[2].output[0], ) if attention_node is None: continue diff --git a/onnxruntime/test/python/transformers/test_optimizer.py b/onnxruntime/test/python/transformers/test_optimizer.py index c7db636a2f11f..058b1d2c9e0fa 100644 --- a/onnxruntime/test/python/transformers/test_optimizer.py +++ b/onnxruntime/test/python/transformers/test_optimizer.py @@ -5,30 +5,21 @@ # license information. # -------------------------------------------------------------------------- -# For live logging, use the command: pytest -o log_cli=true --log-cli-level=DEBUG +# For live logging, use the following command: +# pytest -o log_cli=true --log-cli-level=DEBUG test_optimizer.py -import shutil import unittest -import pytest -import torch from model_loader import get_fusion_test_model, get_test_data_path from onnx import TensorProto, load_model from parity_utilities import find_transformers_source -from transformers import is_tf_available if find_transformers_source(): - from benchmark_helper import ConfigModifier, OptimizerInfo, Precision from fusion_options import FusionOptions - from huggingface_models import MODELS - from onnx_exporter import export_onnx_model_from_pt, export_onnx_model_from_tf from onnx_model import OnnxModel from optimizer import optimize_model else: - from onnxruntime.transformers.benchmark_helper import ConfigModifier, OptimizerInfo, Precision from onnxruntime.transformers.fusion_options import FusionOptions - from onnxruntime.transformers.huggingface_models import MODELS - from onnxruntime.transformers.onnx_exporter import export_onnx_model_from_pt, export_onnx_model_from_tf from onnxruntime.transformers.onnx_model import OnnxModel from onnxruntime.transformers.optimizer import optimize_model @@ -66,70 +57,6 @@ def verify_node_count(self, onnx_model, expected_node_count, test_name): self.assertEqual(len(onnx_model.get_nodes_by_op_type(op_type)), count) - # test huggingface pytorch model - def _test_optimizer_on_huggingface_model( - self, - model_name, - expected_fusion_result_list, - inputs_count=1, - validate_model=True, - ): - # Remove cached model so that CI machine has enough space. Do not remove cache models in dev machine. - if not find_transformers_source(): - shutil.rmtree("./cache_models", ignore_errors=True) - shutil.rmtree("./onnx_models", ignore_errors=True) - - # expect fusion result list have the following keys - # EmbedLayerNormalization, Attention, Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization - model_fusion_statistics = {} - - input_names = MODELS[model_name][0] - - config_modifier = ConfigModifier(None) - fusion_options = None - model_class = "AutoModel" - with torch.no_grad(): - _, is_valid_onnx_model, _, _ = export_onnx_model_from_pt( - model_name, - MODELS[model_name][1], # opset version - MODELS[model_name][2], # use_external_data_format - MODELS[model_name][3], # optimization model type - model_class, - config_modifier, - "./cache_models", - "./onnx_models", - input_names[:inputs_count], - False, - Precision.FLOAT32, - OptimizerInfo.BYSCRIPT, - True, - True, - True, - model_fusion_statistics, - fusion_options, - ) - - if validate_model: - self.assertEqual(is_valid_onnx_model, True) - - expected_node_count = { - "EmbedLayerNormalization": expected_fusion_result_list[0], - "Attention": expected_fusion_result_list[1], - "Gelu": expected_fusion_result_list[2], - "FastGelu": expected_fusion_result_list[3], - "BiasGelu": expected_fusion_result_list[4], - "LayerNormalization": expected_fusion_result_list[5], - "SkipLayerNormalization": expected_fusion_result_list[6], - } - - for value in model_fusion_statistics.values(): - actual_node_count = value - - for op_type, count in expected_node_count.items(): - if op_type not in actual_node_count or actual_node_count[op_type] != count: - print(f"expected: {expected_node_count} got {actual_node_count}") - self.assertTrue(False) - def test_gpt2_past(self): for enable_skip_layer_norm_fusion in [False, True]: input_path = _get_test_model_path("gpt2_past") @@ -227,176 +154,6 @@ def test_embed_layer_norm_fusion(self): } self.verify_node_count(model, expected_node_count, file) - @pytest.mark.slow - def test_huggingface_bert_fusion_1(self): - self._test_optimizer_on_huggingface_model("bert-base-uncased", [1, 12, 0, 0, 12, 0, 24], inputs_count=1) - - @pytest.mark.slow - def test_huggingface_bert_fusion_2(self): - self._test_optimizer_on_huggingface_model("bert-base-uncased", [1, 12, 0, 0, 12, 0, 24], inputs_count=2) - - @pytest.mark.slow - def test_huggingface_bert_fusion_3(self): - self._test_optimizer_on_huggingface_model("bert-base-uncased", [1, 12, 0, 0, 12, 0, 24], inputs_count=3) - - @pytest.mark.slow - def test_huggingface_openaigpt_fusion(self): - self._test_optimizer_on_huggingface_model("openai-gpt", [0, 12, 0, 12, 0, 0, 24]) - - @pytest.mark.slow - @unittest.skip("skip failed fusion test of gpt-2 on PyTorch 1.12 and transformers 4.18. TODO: fix it") - def test_huggingface_gpt2_fusion(self): - self._test_optimizer_on_huggingface_model("gpt2", [0, 12, 0, 12, 0, 25, 0]) - - @pytest.mark.slow - @unittest.skip("skip failed fusion test of xlm on PyTorch 1.12 and transformers 4.18. TODO: fix it") - def test_huggingface_xlm_fusion(self): - self._test_optimizer_on_huggingface_model("xlm-mlm-ende-1024", [0, 6, 0, 0, 6, 0, 13]) - - @pytest.mark.slow - def test_huggingface_roberta_fusion(self): - self._test_optimizer_on_huggingface_model("roberta-base", [0, 12, 0, 0, 12, 1, 24]) - - @pytest.mark.slow - def test_huggingface_distillbert_fusion(self): - self._test_optimizer_on_huggingface_model("distilbert-base-uncased", [1, 6, 0, 0, 6, 0, 12], inputs_count=1) - self._test_optimizer_on_huggingface_model("distilbert-base-uncased", [1, 6, 0, 0, 6, 0, 12], inputs_count=2) - - @pytest.mark.slow - @unittest.skip("skip failed fusion test of camembert on PyTorch 1.12 and transformers 4.18. TODO: fix it") - def test_huggingface_camembert_fusion(self): - self._test_optimizer_on_huggingface_model("camembert-base", [0, 12, 0, 0, 12, 1, 24], validate_model=False) - - @pytest.mark.slow - @unittest.skip("skip failed fusion test of albert on PyTorch 1.12 and transformers 4.18. TODO: fix it") - def test_huggingface_albert_fusion(self): - self._test_optimizer_on_huggingface_model("albert-base-v1", [0, 12, 0, 0, 12, 1, 24]) - - @pytest.mark.slow - @unittest.skip("skip fusion test of t5 since it is not implemented yet") - def test_huggingface_t5_fusion(self): - self._test_optimizer_on_huggingface_model("t5-small", [0, 0, 0, 0, 0, 0, 0]) - - @pytest.mark.slow - def test_huggingface_xlmroberta_fusion(self): - self._test_optimizer_on_huggingface_model("xlm-roberta-base", [0, 12, 0, 0, 12, 1, 24]) - - @pytest.mark.slow - @unittest.skip("skip failed fusion test of flaubert on PyTorch 1.12 and transformers 4.18. TODO: fix it") - def test_huggingface_flaubert_fusion(self): - self._test_optimizer_on_huggingface_model( - "flaubert/flaubert_base_cased", - [0, 12, 0, 0, 12, 0, 25], - validate_model=False, - ) - self._test_optimizer_on_huggingface_model( - "flaubert/flaubert_small_cased", - [0, 6, 0, 0, 6, 12, 1], - validate_model=False, - ) - - @pytest.mark.slow - @unittest.skip("skip failed fusion test of dialogpt on PyTorch 1.12 and transformers 4.18. TODO: fix it") - def test_huggingface_dialogpt_fusion(self): - self._test_optimizer_on_huggingface_model("microsoft/DialoGPT-small", [0, 12, 0, 12, 0, 25, 0]) - - @pytest.mark.slow - def test_huggingface_bart_fusion(self): - self._test_optimizer_on_huggingface_model("facebook/bart-base", [0, 0, 0, 0, 12, 2, 30]) - - @pytest.mark.slow - def test_huggingface_vit_fusion(self): - self._test_optimizer_on_huggingface_model("google/vit-base-patch16-224", [0, 11, 0, 0, 12, 1, 24]) - - -@unittest.skipUnless(is_tf_available(), "skip TestBertOptimizationTF since tensorflow is not available") -class TestTensorflowModelOptimization(unittest.TestCase): - def setUp(self): - try: - import tf2onnx # noqa: F401 - except ImportError: - self.skipTest("skip TestBertOptimizationTF since tf2onnx not installed") - - def _test_optimizer_on_tf_model(self, model_name, expected_fusion_result_list, inputs_count, validate_model=True): - # Remove cached model so that CI machine has enough space. Do not remove cache models in dev machine. - if not find_transformers_source(): - shutil.rmtree("./cache_models", ignore_errors=True) - shutil.rmtree("./onnx_models", ignore_errors=True) - - # expect fusion result list have the following keys - # EmbedLayerNormalization, Attention, Gelu, FastGelu, BiasGelu, LayerNormalization, SkipLayerNormalization - model_fusion_statistics = {} - print("testing mode ", model_name) - print("testing input number = ", inputs_count) - input_names = MODELS[model_name][0] - - config_modifier = ConfigModifier(None) - fusion_options = None - model_class = "AutoModel" - with torch.no_grad(): - _, is_valid_onnx_model, _, _ = export_onnx_model_from_tf( - model_name, - MODELS[model_name][1], # opset version - MODELS[model_name][2], # use_external_data_format - MODELS[model_name][3], # optimization model - model_class, - config_modifier, - "./cache_models", - "./onnx_models", - input_names[:inputs_count], - False, - Precision.FLOAT32, - True, - True, - True, - True, - model_fusion_statistics, - fusion_options, - ) - - onnx_model = next(iter(model_fusion_statistics.keys())) - fusion_result_list = list(model_fusion_statistics[onnx_model].values()) - - if validate_model: - self.assertEqual(is_valid_onnx_model, True) - self.assertEqual(fusion_result_list, expected_fusion_result_list) - - @pytest.mark.slow - def test_huggingface_bert_base_cased_from_tf2onnx_1(self): - self._test_optimizer_on_tf_model("bert-base-cased", [0, 12, 0, 0, 0, 0, 25], 1) - - @pytest.mark.slow - def test_huggingface_bert_base_cased_from_tf2onnx_2(self): - self._test_optimizer_on_tf_model("bert-base-cased", [0, 12, 0, 0, 0, 0, 25], 2) - - @pytest.mark.slow - def test_huggingface_bert_base_cased_from_tf2onnx_3(self): - self._test_optimizer_on_tf_model("bert-base-cased", [0, 12, 0, 0, 0, 0, 25], 3) - - @pytest.mark.slow - def test_huggingface_distilgpt2_from_tf2onnx(self): - self._test_optimizer_on_tf_model("distilgpt2", [0, 0, 0, 0, 0, 12, 1], 1) - - @pytest.mark.slow - def test_huggingface_albert_from_tf2onnx(self): - self._test_optimizer_on_tf_model("albert-base-v1", [0, 0, 0, 0, 0, 0, 25], 1) - - @pytest.mark.slow - def test_huggingface_gpt2_from_tf2onnx(self): - self._test_optimizer_on_tf_model("gpt2", [0, 0, 0, 0, 0, 24, 1], 1, validate_model=False) - - @pytest.mark.slow - def test_huggingface_roberta_from_tf2onnx(self): - self._test_optimizer_on_tf_model("roberta-base", [0, 12, 0, 0, 0, 0, 25], 1, validate_model=False) - - @pytest.mark.slow - def test_huggingface_distilbert_from_tf2onnx(self): - self._test_optimizer_on_tf_model("distilbert-base-uncased", [0, 0, 0, 0, 0, 0, 13], 1, validate_model=False) - - @pytest.mark.slow - def test_huggingface_xlm_from_tf2onnx(self): - self._test_optimizer_on_tf_model("xlm-mlm-ende-1024", [0, 0, 0, 0, 0, 1, 12], 1, validate_model=False) - if __name__ == "__main__": unittest.main() diff --git a/onnxruntime/test/python/transformers/test_optimizer_huggingface_bert.py b/onnxruntime/test/python/transformers/test_optimizer_huggingface_bert.py new file mode 100644 index 0000000000000..e4f883dc8b45c --- /dev/null +++ b/onnxruntime/test/python/transformers/test_optimizer_huggingface_bert.py @@ -0,0 +1,151 @@ +#!/usr/bin/env python +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- + +# For live logging, use the following command: +# pytest -o log_cli=true --log-cli-level=DEBUG test_optimizer_huggingface_bert.py + +import shutil +import unittest +from pathlib import Path + +import torch +from parity_utilities import find_transformers_source +from transformers.utils import default_cache_path + +if find_transformers_source(): + from benchmark_helper import ConfigModifier, OptimizerInfo, Precision + from compare_bert_results import run_test as bert_parity_test + from onnx_exporter import export_onnx_model_from_pt +else: + from onnxruntime.transformers.benchmark_helper import ConfigModifier, OptimizerInfo, Precision + from onnxruntime.transformers.compare_bert_results import run_test as bert_parity_test + from onnxruntime.transformers.onnx_exporter import export_onnx_model_from_pt + + +class TestHuggingfaceBertModelOptimization(unittest.TestCase): + def run_optimizer_on_model( + self, + model_name, + expected_fusion_result_list, + inputs_count=1, + validate_model=True, + opset_version=16, + use_external_data_format=False, + model_type="bert", + ): + onnx_dir = Path(".") / "onnx_models" / model_name + shutil.rmtree(onnx_dir, ignore_errors=True) + + Path(onnx_dir).mkdir(parents=True, exist_ok=True) + + model_fusion_statistics = {} + + input_names = ["input_ids", "attention_mask", "token_type_ids"] + + config_modifier = ConfigModifier(None) + fusion_options = None + model_class = "AutoModel" + with torch.no_grad(): + optimized_model_path, is_valid_onnx_model, _, _ = export_onnx_model_from_pt( + model_name=model_name, + opset_version=opset_version, + use_external_data_format=use_external_data_format, + model_type=model_type, + model_class=model_class, + config_modifier=config_modifier, + cache_dir=default_cache_path, + onnx_dir=str(onnx_dir), + input_names=input_names[:inputs_count], + use_gpu=False, + precision=Precision.FLOAT32, + optimizer_info=OptimizerInfo.BYSCRIPT, + validate_onnx=True, + use_raw_attention_mask=True, + overwrite=True, + model_fusion_statistics=model_fusion_statistics, + fusion_options=fusion_options, + ) + + if validate_model: + self.assertEqual(is_valid_onnx_model, True) + + expected_node_count = { + "EmbedLayerNormalization": expected_fusion_result_list[0], + "Attention": expected_fusion_result_list[1], + "Gelu": expected_fusion_result_list[2], + "FastGelu": expected_fusion_result_list[3], + "BiasGelu": expected_fusion_result_list[4], + "LayerNormalization": expected_fusion_result_list[5], + "SkipLayerNormalization": expected_fusion_result_list[6], + } + + node_count = None + for value in model_fusion_statistics.values(): + node_count = value + self.assertIsNotNone(node_count) + + actual_node_count = {} + for op_type in expected_node_count: + actual_node_count[op_type] = node_count.get(op_type, 0) + + expected = ", ".join(f"{key}: {value}" for key, value in sorted(expected_node_count.items())) + actual = ", ".join(f"{key}: {value}" for key, value in sorted(actual_node_count.items())) + self.assertEqual(expected, actual) + + suffix = "_fp32_cpu.onnx" + assert optimized_model_path.endswith(suffix) + baseline_model_path = optimized_model_path[: -len(suffix)] + ".onnx" + for batch_size in [1, 2]: + for sequence_length in [1, 8]: + max_abs_diff, case_passed = bert_parity_test( + baseline_model_path, + optimized_model_path, + output_dir=None, + batch_size=batch_size, + sequence_length=sequence_length, + use_gpu=False, + test_cases=1, + seed=123, + verbose=False, + rtol=1e-4, + atol=1e-4, + input_ids_name=input_names[0], + segment_ids_name=input_names[2] if inputs_count > 2 else None, + input_mask_name=input_names[1] if inputs_count > 1 else None, + mask_type=2, + dictionary_size=1024, + ) + self.assertTrue( + case_passed, f"bert parity test failed: {batch_size=} {sequence_length=} {max_abs_diff=}" + ) + + def test_bert(self): + model_name = "hf-internal-testing/tiny-random-bert" + self.run_optimizer_on_model(model_name, [1, 5, 0, 0, 5, 0, 10], inputs_count=1) + self.run_optimizer_on_model(model_name, [1, 5, 0, 0, 5, 0, 10], inputs_count=2) + self.run_optimizer_on_model(model_name, [1, 5, 0, 0, 5, 0, 10], inputs_count=3) + + def test_roberta(self): + model_name = "hf-internal-testing/tiny-random-roberta" + # TODO: EmbedLayerNormalization fusion. + self.run_optimizer_on_model(model_name, [0, 5, 0, 0, 5, 1, 10], inputs_count=1) + self.run_optimizer_on_model(model_name, [0, 5, 0, 0, 5, 1, 10], inputs_count=2) + + def test_distillbert(self): + model_name = "hf-internal-testing/tiny-random-distilbert" + self.run_optimizer_on_model(model_name, [1, 5, 0, 0, 5, 0, 10], inputs_count=1) + self.run_optimizer_on_model(model_name, [1, 5, 0, 0, 5, 0, 10], inputs_count=2) + + def test_xlm_roberta(self): + model_name = "hf-internal-testing/tiny-xlm-roberta" + # TODO: EmbedLayerNormalization fusion. + self.run_optimizer_on_model(model_name, [0, 2, 0, 0, 2, 1, 4], inputs_count=1) + self.run_optimizer_on_model(model_name, [0, 2, 0, 0, 2, 1, 4], inputs_count=2) + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/transformers/test_parity_moe.py b/onnxruntime/test/python/transformers/test_parity_moe.py index 1e7940e38335f..baaaeaa766db9 100644 --- a/onnxruntime/test/python/transformers/test_parity_moe.py +++ b/onnxruntime/test/python/transformers/test_parity_moe.py @@ -651,7 +651,6 @@ def parity_check(self): torch_output = self.forward(hidden_state) ort_output = self.ort_forward(hidden_state) if ort_output is not None: - assert torch.allclose(torch_output, ort_output.to(torch.float32), rtol=THRESHOLD, atol=THRESHOLD) print( "name:", self.__class__.__name__, @@ -661,8 +660,8 @@ def parity_check(self): self.sequence_length, " max_diff:", (torch_output - ort_output).abs().max(), - " parity: OK", ) + torch.testing.assert_close(ort_output.to(torch.float32), torch_output, rtol=THRESHOLD, atol=THRESHOLD) def benchmark_ort(self): hidden_state = torch.randn(self.batch_size, self.sequence_length, self.hidden_dim) @@ -996,6 +995,13 @@ def small_test_cases(): yield batch_size, sequence_length +def phi3_test_cases(): + # TODO: phi3 moe failed in long sequence lengths (max diff 0.22 > threshold 0.01), need investigation. + for batch_size in [1, 4, 16]: + for sequence_length in [128]: + yield batch_size, sequence_length + + class TestSwitchMoE(unittest.TestCase): @parameterized.expand(small_test_cases()) def test_switch_moe_parity(self, batch_size, sequence_length): @@ -1023,7 +1029,7 @@ def test_mixtral_moe_parity(self, batch_size, sequence_length): class TestPhiMoE(unittest.TestCase): - @parameterized.expand(small_test_cases()) + @parameterized.expand(phi3_test_cases()) def test_phi3_moe_parity(self, batch_size, sequence_length): config = PhiMoEConfig(hidden_size=256, intermediate_size=1024) phi3_moe = PhiMoESparseMoeBlock(config, batch_size, sequence_length) diff --git a/tools/ci_build/requirements/transformers-test/requirements.txt b/tools/ci_build/requirements/transformers-test/requirements.txt index 32c5ce7dd08d1..cb93043e09b63 100644 --- a/tools/ci_build/requirements/transformers-test/requirements.txt +++ b/tools/ci_build/requirements/transformers-test/requirements.txt @@ -5,7 +5,7 @@ numpy==1.24.0 ; python_version < '3.12' numpy==1.26.0 ; python_version >= '3.12' torch coloredlogs==15.0 -transformers==4.38.0 +transformers==4.46.3 parameterized>=0.8.1 psutil einops From 8d99b1a8dc5318bde4463817c02552ebca0cf547 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 21 Nov 2024 12:26:46 -0800 Subject: [PATCH 28/28] reduce GQA test combinations (#22918) ### Description * Reduce GQA test combinations to save about 35 minutes test time in CI pipelines. * Show latency of transformers tests * Use seed in DMMHA test to avoid random failure. * For test_flash_attn_rocm.py, test skipping condition from "has cuda ep" to "not has rocm ep", so that it does not run in cpu build. * For test_flash_attn_cuda.py, move flash attention and memory efficient attention tests to different classes, so that we can skip a test suite instead of checking in each test. ### Motivation and Context It takes too long to run GQA tests in CI pipelines since there are too many combinations. ###### Linux GPU CI Pipeline Before: 5097 passed, 68 skipped, 8 warnings in 1954.64s (0:32:34) After: 150 passed, 176 skipped, 8 warnings in 530.38s (0:08:50) Time Saved: **1424** seconds (0:23:44) ###### Windows GPU CUDA CI Pipeline Before: 1781 passed, 72 skipped, 6 warnings in 605.48s (0:10:05) After: 116 passed, 118 skipped, 6 warnings in 275.48s (0:04:35) Time Saved: **330** seconds (0:05:30) ###### Linux CPU CI Pipeline Before: 5093 passed, 72 skipped, 4 warnings in 467.04s (0:07:47) - 212.96s transformers/test_gqa_cpu.py::TestGQA::test_gqa_past - 154.12s transformers/test_gqa_cpu.py::TestGQA::test_gqa_no_past - 26.45s transformers/test_gqa_cpu.py::TestGQA::test_gqa_interactive_one_batch After: 116 passed, 210 skipped, 4 warnings in 93.41s (0:01:33) - 0.97s transformers/test_gqa_cpu.py::TestGQA::test_gqa_past - 19.23s transformers/test_gqa_cpu.py::TestGQA::test_gqa_no_past - 2.41s transformers/test_gqa_cpu.py::TestGQA::test_gqa_interactive_one_batch Time Saved: **374** seconds (0:06:14). --- ...oder_masked_multihead_attention_op_test.cc | 2 +- .../transformers/test_flash_attn_cuda.py | 170 ++++++++---------- .../transformers/test_flash_attn_rocm.py | 20 +-- .../test/python/transformers/test_gqa_cpu.py | 20 +-- tools/ci_build/build.py | 2 +- 5 files changed, 98 insertions(+), 116 deletions(-) diff --git a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc index 17685ab82f0ef..208545eacf224 100644 --- a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc @@ -757,7 +757,7 @@ static void TestDecoderMaskedMultiHeadAttention(bool is_cross_attn = true, bool OpTester tester("DecoderMaskedMultiHeadAttention", 1, onnxruntime::kMSDomain); FixedPatternValueGenerator generator{}; - RandomValueGenerator random{}; + RandomValueGenerator random{123}; // Attributes tester.AddAttribute("num_heads", static_cast(num_heads)); diff --git a/onnxruntime/test/python/transformers/test_flash_attn_cuda.py b/onnxruntime/test/python/transformers/test_flash_attn_cuda.py index 46ab905977f48..a74d5389e9047 100644 --- a/onnxruntime/test/python/transformers/test_flash_attn_cuda.py +++ b/onnxruntime/test/python/transformers/test_flash_attn_cuda.py @@ -24,7 +24,7 @@ from parameterized import parameterized from test_gqa_cpu import smooth_softmax_ref -from onnxruntime import InferenceSession, OrtValue, SessionOptions +from onnxruntime import InferenceSession, OrtValue, SessionOptions, get_available_providers torch.manual_seed(0) @@ -1999,6 +1999,8 @@ def parity_check_gqa_past_no_buff( def has_flash_attention(): if not torch.cuda.is_available(): return False + if "CUDAExecutionProvider" not in get_available_providers(): + return False major, _ = torch.cuda.get_device_capability() return major >= 8 and ( platform.system() == "Linux" @@ -2009,6 +2011,8 @@ def has_flash_attention(): def has_memory_efficient(): if not torch.cuda.is_available(): return False + if "CUDAExecutionProvider" not in get_available_providers(): + return False major, minor = torch.cuda.get_device_capability() if major < 5 or (major == 5 and minor < 3): return False @@ -2047,8 +2051,8 @@ def mha_test_cases(): (2048, 2048), ] ) - num_h = [1, 3] if pipeline_mode else [1, 6, 16] - h_sizes = [16, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [3] if pipeline_mode else [1, 6, 16] + h_sizes = [64] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] for b in batches: for s, s2 in seqs: @@ -2080,11 +2084,7 @@ def gqa_no_past_memory_efficient_test_cases(): batches = [3] if pipeline_mode else [1, 3, 5] seqs = ( [ - (127, 127), - (35, 35), (2000, 2000), - (200, 200), - (240, 240), ] if pipeline_mode else [ @@ -2095,8 +2095,8 @@ def gqa_no_past_memory_efficient_test_cases(): (240, 240), ] ) - num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] - h_sizes = [16, 128, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [(9, 3)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] + h_sizes = [128] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] torch.manual_seed(69) for b in batches: @@ -2121,10 +2121,6 @@ def gqa_no_past_flash_attention_test_cases(): batches = [3] if pipeline_mode else [1, 3, 5] seqs = ( [ - (127, 127), - (35, 35), - (2000, 2000), - (200, 200), (240, 240), ] if pipeline_mode @@ -2136,8 +2132,8 @@ def gqa_no_past_flash_attention_test_cases(): (240, 240), ] ) - num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] - h_sizes = [16, 128, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [(32, 8)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] + h_sizes = [128] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] torch.manual_seed(69) for b in batches: @@ -2163,7 +2159,7 @@ def gqa_no_past_flash_attention_test_cases(): def gqa_past_memory_efficient_test_cases(): batches = [5] if pipeline_mode else [1, 3, 5] seqs = ( - [(1, 128), (1, 1024), (1, 2048)] + [(1, 1024)] if pipeline_mode else [ (1, 128), @@ -2179,8 +2175,8 @@ def gqa_past_memory_efficient_test_cases(): # (128, 128), ] ) - num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] - h_sizes = [16, 128, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [(32, 8)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] + h_sizes = [256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] random.seed(69) for b in batches: @@ -2205,7 +2201,7 @@ def gqa_past_memory_efficient_test_cases(): def gqa_past_flash_attention_test_cases(): batches = [5] if pipeline_mode else [1, 3, 5] seqs = ( - [(1, 128), (1, 1024), (1, 2048)] + [(1, 2048)] if pipeline_mode else [ (1, 128), @@ -2221,8 +2217,8 @@ def gqa_past_flash_attention_test_cases(): # (128, 128), ] ) - num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] - h_sizes = [16, 128, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [(32, 8)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] + h_sizes = [256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] random.seed(69) for b in batches: @@ -2249,7 +2245,7 @@ def gqa_past_flash_attention_test_cases(): def gqa_interactive_one_batch_flash_attention_test_cases(): batches = [1] seqs = ( - [(2, 128), (128, 129), (32, 128), (256, 2048)] + [(128, 2048)] if pipeline_mode else [ (1, 128), @@ -2265,8 +2261,8 @@ def gqa_interactive_one_batch_flash_attention_test_cases(): # (128, 128), ] ) - num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] - h_sizes = [16, 128, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [(9, 3)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] + h_sizes = [64] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] random.seed(69) for b in batches: @@ -2290,7 +2286,7 @@ def gqa_interactive_one_batch_flash_attention_test_cases(): def gqa_interactive_one_batch_memory_efficient_attention_test_cases(): batches = [1] seqs = ( - [(2, 128), (128, 129), (32, 128), (256, 2048)] + [(32, 128)] if pipeline_mode else [ (1, 128), @@ -2306,8 +2302,8 @@ def gqa_interactive_one_batch_memory_efficient_attention_test_cases(): # (128, 128), ] ) - num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] - h_sizes = [16, 128, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [(9, 3)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] + h_sizes = [64] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] random.seed(69) for b in batches: @@ -2326,120 +2322,114 @@ def gqa_interactive_one_batch_memory_efficient_attention_test_cases(): ) -class TestGQA(unittest.TestCase): - @parameterized.expand(gqa_no_past_memory_efficient_test_cases()) - def test_gqa_no_past_memory_efficient(self, _, config, rotary, rotary_interleaved, packed, softcap): - if not has_memory_efficient(): - return - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" - print("------- MEMORY EFFICIENT ATTENTION (PROMPT CASE) ---------") +@unittest.skipIf(not has_flash_attention(), reason="Flash Attention is not available, skipping tests.") +class TestFlashGQA(unittest.TestCase): + @parameterized.expand(gqa_no_past_flash_attention_test_cases()) + def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_interleaved, packed, softcap): + print("------- FLASH ATTENTION (PROMPT CASE) --------") + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" parity_check_gqa_prompt( config, - rtol=5e-3, - atol=5e-3, + local=local, past_format=Formats.BNSH, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, softcap=softcap, - use_smooth_softmax=False, + use_smooth_softmax=True, ) parity_check_gqa_prompt_no_buff( config, - rtol=5e-3, - atol=5e-3, + local=local, past_format=Formats.BNSH, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, softcap=softcap, - use_smooth_softmax=True, + use_smooth_softmax=False, ) - @parameterized.expand(gqa_no_past_flash_attention_test_cases()) - def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_interleaved, packed, softcap): - if not has_flash_attention(): - return - print("------- FLASH ATTENTION (PROMPT CASE) --------") + @parameterized.expand(gqa_past_flash_attention_test_cases()) + def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interleaved, packed, softcap): + print("------- FLASH ATTENTION (TOKEN GEN) -------") os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" - parity_check_gqa_prompt( + parity_check_gqa_past( config, local=local, past_format=Formats.BNSH, + rtol=1e-3, + atol=1e-3, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, softcap=softcap, - use_smooth_softmax=True, + use_smooth_softmax=False, ) - parity_check_gqa_prompt_no_buff( + parity_check_gqa_past_no_buff( config, local=local, past_format=Formats.BNSH, + rtol=1e-3, + atol=1e-3, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, softcap=softcap, - use_smooth_softmax=False, + use_smooth_softmax=True, ) - @parameterized.expand(gqa_past_memory_efficient_test_cases()) - def test_gqa_past_memory_efficient(self, _, config, rotary, rotary_interleaved, packed, softcap): - if not has_memory_efficient(): - return - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" - print("-------- MEMORY EFFICIENT (TOKEN GEN) --------") + @parameterized.expand(gqa_interactive_one_batch_flash_attention_test_cases()) + def test_gqa_interactive_one_batch_flash_attention(self, _, config, local, rotary, rotary_interleaved, packed): + print("------- FLASH ATTENTION (INTERACTIVE) -------") + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" parity_check_gqa_past( config, + local=local, past_format=Formats.BNSH, - rtol=1e-3, - atol=1e-3, + rtol=5e-3, + atol=5e-3, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - softcap=softcap, - use_smooth_softmax=True, ) parity_check_gqa_past_no_buff( config, + local=local, past_format=Formats.BNSH, - rtol=1e-3, - atol=1e-3, + rtol=5e-3, + atol=5e-3, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, - softcap=softcap, - use_smooth_softmax=False, ) - @parameterized.expand(gqa_past_flash_attention_test_cases()) - def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interleaved, packed, softcap): - if not has_flash_attention(): - return - print("------- FLASH ATTENTION (TOKEN GEN) -------") - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" - parity_check_gqa_past( +@unittest.skipIf(not has_memory_efficient(), reason="Memory efficient FMHA is not available, skipping tests.") +class TestMemoryEfficientGQA(unittest.TestCase): + @parameterized.expand(gqa_no_past_memory_efficient_test_cases()) + def test_gqa_no_past_memory_efficient(self, _, config, rotary, rotary_interleaved, packed, softcap): + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" + print("------- MEMORY EFFICIENT ATTENTION (PROMPT CASE) ---------") + + parity_check_gqa_prompt( config, - local=local, + rtol=5e-3, + atol=5e-3, past_format=Formats.BNSH, - rtol=1e-3, - atol=1e-3, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, softcap=softcap, use_smooth_softmax=False, ) - parity_check_gqa_past_no_buff( + parity_check_gqa_prompt_no_buff( config, - local=local, + rtol=5e-3, + atol=5e-3, past_format=Formats.BNSH, - rtol=1e-3, - atol=1e-3, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, @@ -2447,38 +2437,36 @@ def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interle use_smooth_softmax=True, ) - @parameterized.expand(gqa_interactive_one_batch_flash_attention_test_cases()) - def test_gqa_interactive_one_batch_flash_attention(self, _, config, local, rotary, rotary_interleaved, packed): - if not has_flash_attention(): - return - print("------- FLASH ATTENTION (INTERACTIVE) -------") - os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "0" + @parameterized.expand(gqa_past_memory_efficient_test_cases()) + def test_gqa_past_memory_efficient(self, _, config, rotary, rotary_interleaved, packed, softcap): + os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" + print("-------- MEMORY EFFICIENT (TOKEN GEN) --------") parity_check_gqa_past( config, - local=local, past_format=Formats.BNSH, - rtol=5e-3, - atol=5e-3, + rtol=1e-3, + atol=1e-3, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, + softcap=softcap, + use_smooth_softmax=True, ) parity_check_gqa_past_no_buff( config, - local=local, past_format=Formats.BNSH, - rtol=5e-3, - atol=5e-3, + rtol=1e-3, + atol=1e-3, rotary=rotary, rotary_interleaved=rotary_interleaved, packed=packed, + softcap=softcap, + use_smooth_softmax=False, ) @parameterized.expand(gqa_interactive_one_batch_memory_efficient_attention_test_cases()) def test_gqa_interactive_one_batch_memory_efficient_attention(self, _, config, rotary, rotary_interleaved, packed): - if not has_memory_efficient(): - return os.environ["ORT_DISABLE_FLASH_ATTENTION"] = "1" print("-------- MEMORY EFFICIENT (INTERACTIVE) --------") diff --git a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py index 99460722c2469..a5910c28c2975 100644 --- a/onnxruntime/test/python/transformers/test_flash_attn_rocm.py +++ b/onnxruntime/test/python/transformers/test_flash_attn_rocm.py @@ -16,16 +16,16 @@ import onnxruntime -class TestGQA(unittest.TestCase): +@unittest.skipIf( + (not torch.cuda.is_available()) + or (platform.system() != "Linux") + or ("ROCMExecutionProvider" not in onnxruntime.get_available_providers()), + reason="ROCm is not available, skipping tests.", +) +class TestRocmGQA(unittest.TestCase): @parameterized.expand(gqa_no_past_flash_attention_test_cases()) def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_interleaved, packed, softcap): config.ep = "ROCMExecutionProvider" - if not torch.cuda.is_available(): - return - if platform.system() != "Linux": - return - if "CUDAExecutionProvider" in onnxruntime.get_available_providers(): - return print("------- FLASH ATTENTION (PROMPT CASE) --------") parity_check_gqa_prompt( @@ -52,12 +52,6 @@ def test_gqa_no_past_flash_attention(self, _, config, local, rotary, rotary_inte @parameterized.expand(gqa_past_flash_attention_test_cases()) def test_gqa_past_flash_attention(self, _, config, local, rotary, rotary_interleaved, packed, softcap): config.ep = "ROCMExecutionProvider" - if not torch.cuda.is_available(): - return - if platform.system() != "Linux": - return - if "CUDAExecutionProvider" in onnxruntime.get_available_providers(): - return print("------- FLASH ATTENTION (TOKEN GEN) -------") parity_check_gqa_past( diff --git a/onnxruntime/test/python/transformers/test_gqa_cpu.py b/onnxruntime/test/python/transformers/test_gqa_cpu.py index 08ec5de328b9d..77b4b326bf645 100644 --- a/onnxruntime/test/python/transformers/test_gqa_cpu.py +++ b/onnxruntime/test/python/transformers/test_gqa_cpu.py @@ -1900,7 +1900,7 @@ class TestGQA(unittest.TestCase): def test_gqa_no_past(self): torch.manual_seed(69) print("-------- TEST GQA NO PAST (PROMPT CASE) ---------") - batches = [1, 3] if pipeline_mode else [1, 3, 5] + batches = [3] if pipeline_mode else [1, 3, 5] seqs = ( [ (127, 127), @@ -1916,8 +1916,8 @@ def test_gqa_no_past(self): (8000, 8000), ] ) - num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] - h_sizes = [16, 128, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [(32, 8)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] + h_sizes = [128] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] for b in batches: for sq, skv in seqs: for n, n2 in num_h: @@ -1954,9 +1954,9 @@ def test_gqa_no_past(self): def test_gqa_past(self): print("-------- TEST GQA PAST (TOKEN GEN) ---------") - batches = [1, 3] if pipeline_mode else [1, 3, 5] + batches = [1] if pipeline_mode else [1, 3, 5] seqs = ( - [(1, 128), (1, 1024), (1, 2048)] + [(1, 128)] if pipeline_mode else [ (1, 128), @@ -1972,8 +1972,8 @@ def test_gqa_past(self): # (128, 128), ] ) - num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] - h_sizes = [16, 64, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [(9, 3)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] + h_sizes = [64] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] random.seed(69) for b in batches: for s, s2 in seqs: @@ -2018,7 +2018,7 @@ def test_gqa_interactive_one_batch(self): print("-------- TEST GQA INTERACTIVE ---------") batches = [1] seqs = ( - [(2, 128), (128, 129), (32, 128), (256, 2048)] + [(256, 2048)] if pipeline_mode else [ (1, 128), @@ -2034,8 +2034,8 @@ def test_gqa_interactive_one_batch(self): # (128, 128), ] ) - num_h = [(32, 8), (9, 3), (4, 4)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] - h_sizes = [16, 64, 256] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] + num_h = [(32, 8)] if pipeline_mode else [(6, 6), (6, 3), (9, 9), (9, 3)] + h_sizes = [32] if pipeline_mode else [32, 40, 64, 80, 96, 128, 160, 192, 224, 256] random.seed(69) for b in batches: for s, s2 in seqs: diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index aa1198102f978..3bfbc01086cd3 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -2149,7 +2149,7 @@ def run_onnxruntime_tests(args, source_dir, ctest_path, build_dir, configs): ], cwd=SCRIPT_DIR, ) - run_subprocess([sys.executable, "-m", "pytest", "transformers"], cwd=cwd) + run_subprocess([sys.executable, "-m", "pytest", "--durations=0", "transformers"], cwd=cwd) # Restore initial numpy/protobuf version in case other tests use it run_subprocess([sys.executable, "-m", "pip", "install", "numpy==" + numpy_init_version]) run_subprocess([sys.executable, "-m", "pip", "install", "protobuf==" + pb_init_version])