From 1fb6cbddee6dc84f3ed720425e42cb789c361696 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 14 Mar 2024 08:50:42 -0700 Subject: [PATCH 01/55] Add a build patch for Windows ARM64EC (#19898) ### Description Add a patch for Windows ARM64EC ### Motivation and Context Will need more changes in onnxruntime/core/common/cpuid_arch_definition.h and onnxruntime/core/common/cpuid_info.cc --- .../external/onnxruntime_external_deps.cmake | 24 +++++++++++++------ ...2d342fd9479679d505d93a478a6f9cd50a47.patch | 22 +++++++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) create mode 100644 cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index e4fefdbf86369..75ccc2dfd83a0 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -305,13 +305,23 @@ if (CPUINFO_SUPPORTED) set(CPUINFO_BUILD_UNIT_TESTS OFF CACHE INTERNAL "") set(CPUINFO_BUILD_MOCK_TESTS OFF CACHE INTERNAL "") set(CPUINFO_BUILD_BENCHMARKS OFF CACHE INTERNAL "") - - FetchContent_Declare( - pytorch_cpuinfo - URL ${DEP_URL_pytorch_cpuinfo} - URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} - FIND_PACKAGE_ARGS NAMES cpuinfo - ) + if(onnxruntime_target_platform STREQUAL "ARM64EC") + message("Applying a patch for Windows ARM64EC in cpuinfo") + FetchContent_Declare( + pytorch_cpuinfo + URL ${DEP_URL_pytorch_cpuinfo} + URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} + PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch + FIND_PACKAGE_ARGS NAMES cpuinfo + ) + else() + FetchContent_Declare( + pytorch_cpuinfo + URL ${DEP_URL_pytorch_cpuinfo} + URL_HASH SHA1=${DEP_SHA1_pytorch_cpuinfo} + FIND_PACKAGE_ARGS NAMES cpuinfo + ) + endif() set(ONNXRUNTIME_CPUINFO_PROJ pytorch_cpuinfo) endif() diff --git a/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch 
b/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch new file mode 100644 index 0000000000000..afb19a45ce0f4 --- /dev/null +++ b/cmake/patches/cpuinfo/9bb12d342fd9479679d505d93a478a6f9cd50a47.patch @@ -0,0 +1,22 @@ +diff --git a/include/cpuinfo.h b/include/cpuinfo.h +index c46b65e..8b83a64 100644 +--- a/include/cpuinfo.h ++++ b/include/cpuinfo.h +@@ -18,7 +18,7 @@ + #define CPUINFO_ARCH_X86 1 + #endif + +-#if defined(__x86_64__) || defined(__x86_64) || defined(_M_X64) || defined(_M_AMD64) ++#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) || (defined(_M_AMD64) && !defined(_M_ARM64EC)) + #define CPUINFO_ARCH_X86_64 1 + #endif + +@@ -26,7 +26,7 @@ + #define CPUINFO_ARCH_ARM 1 + #endif + +-#if defined(__aarch64__) || defined(_M_ARM64) ++#if defined(__aarch64__) || defined(_M_ARM64) || defined(_M_ARM64EC) + #define CPUINFO_ARCH_ARM64 1 + #endif + From 226f60f2f1de34db1df2f3ed5065e456557c78f4 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Thu, 14 Mar 2024 11:31:20 -0700 Subject: [PATCH 02/55] Add support for SGD optimizer in minimal build (#19901) --- .../orttraining/python/training/artifacts.py | 27 +++---- .../orttraining_test_ort_apis_onnxblock.py | 27 +++++++ .../orttraining/training_api/optimizer.cc | 71 +++---------------- .../orttraining/training_api/optimizer.h | 5 +- 4 files changed, 53 insertions(+), 77 deletions(-) diff --git a/orttraining/orttraining/python/training/artifacts.py b/orttraining/orttraining/python/training/artifacts.py index 4e76174d8255e..624b30ffdab3b 100644 --- a/orttraining/orttraining/python/training/artifacts.py +++ b/orttraining/orttraining/python/training/artifacts.py @@ -41,7 +41,7 @@ def generate_artifacts( requires_grad: Optional[List[str]] = None, frozen_params: Optional[List[str]] = None, loss: Optional[Union[LossType, onnxblock.Block]] = None, - optimizer: Optional[OptimType] = None, + optimizer: Optional[Union[OptimType, onnxblock.Block]] = None, artifact_directory: Optional[Union[str, 
bytes, os.PathLike]] = None, prefix: str = "", ort_format: bool = False, @@ -64,8 +64,8 @@ def generate_artifacts( model: The base model to be used for gradient graph generation. requires_grad: List of names of model parameters that require gradient computation frozen_params: List of names of model parameters that should be frozen. - loss: The loss function enum to be used for training. If None, no loss node is added to the graph. - optimizer: The optimizer enum to be used for training. If None, no optimizer model is generated. + loss: The loss function enum or onnxblock to be used for training. If None, no loss node is added to the graph. + optimizer: The optimizer enum or onnxblock to be used for training. If None, no optimizer model is generated. artifact_directory: The directory to save the generated artifacts. If None, the current working directory is used. prefix: The prefix to be used for the generated artifacts. If not specified, no prefix is used. @@ -219,14 +219,6 @@ def _export_to_ort_format(model_path, output_dir, ort_format, custom_op_library_ logging.info("No optimizer enum provided. Skipping optimizer model generation.") return - if not isinstance(optimizer, OptimType): - raise RuntimeError( - f"Unknown optimizer provided {type(optimizer)}. Expected optimizer to be of type " - "onnxruntime.training.artifacts.OptimType." 
- ) - - logging.info("Optimizer enum provided: %s", optimizer.name) - opset_version = None for domain in model.opset_import: if domain.domain == "" or domain.domain == "ai.onnx": @@ -235,8 +227,19 @@ def _export_to_ort_format(model_path, output_dir, ort_format, custom_op_library_ optim_model = None optim_blocks = {OptimType.AdamW: onnxblock.optim.AdamW, OptimType.SGD: onnxblock.optim.SGD} + optim_block = None + if isinstance(optimizer, OptimType): + logging.info("Optimizer enum provided: %s", optimizer.name) + optim_block = optim_blocks[optimizer]() + elif isinstance(optimizer, onnxblock.Block): + logging.info("Optimizer block provided: %s", optimizer.__class__.__name__) + optim_block = optimizer + else: + raise TypeError( + f"Unknown optimizer provided {type(optimizer)}. Expected optimizer to be either one of " + "onnxruntime.training.artifacts.OptimType or onnxruntime.training.onnxblock.Block." + ) - optim_block = optim_blocks[optimizer]() with onnxblock.empty_base(opset_version=opset_version): _ = optim_block(model_params) optim_model = optim_block.to_model_proto() diff --git a/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py b/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py index 11df3fa347ff8..ac49c1c2834c7 100644 --- a/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py +++ b/orttraining/orttraining/test/python/orttraining_test_ort_apis_onnxblock.py @@ -1072,3 +1072,30 @@ def test_save_nominal_checkpoint(): os.stat(os.path.join(temp_dir, "checkpoint")).st_size > os.stat(os.path.join(temp_dir, "nominal_checkpoint")).st_size ) + + +def test_custom_optimizer_block(): + device = "cpu" + batch_size, input_size, hidden_size, output_size = 64, 784, 500, 10 + _, base_model = _get_models(device, batch_size, input_size, hidden_size, output_size) + weight_decay = 123 + optimizer = onnxblock.optim.AdamW(weight_decay=weight_decay) + + with tempfile.TemporaryDirectory() as temp_dir: + 
artifacts.generate_artifacts( + base_model, + requires_grad=["fc1.weight", "fc1.bias", "fc2.weight", "fc2.bias"], + loss=artifacts.LossType.CrossEntropyLoss, + optimizer=optimizer, + artifact_directory=temp_dir, + ) + + assert os.path.exists(os.path.join(temp_dir, "checkpoint")) + assert os.path.exists(os.path.join(temp_dir, "optimizer_model.onnx")) + + optimizer_model = onnx.load(os.path.join(temp_dir, "optimizer_model.onnx")) + for node in optimizer_model.graph.node: + if node.op_type == "AdamWOptimizer": + for attr in node.attribute: + if attr.name == "weight_decay": + assert attr.f == weight_decay diff --git a/orttraining/orttraining/training_api/optimizer.cc b/orttraining/orttraining/training_api/optimizer.cc index 84c35e6100385..4647f890729f4 100644 --- a/orttraining/orttraining/training_api/optimizer.cc +++ b/orttraining/orttraining/training_api/optimizer.cc @@ -61,32 +61,19 @@ Status GraphInputsAreExpected(gsl::span actual_graph_inputs, } // namespace std::unique_ptr OptimizerAlorithmFactory::CreateInstance( - std::shared_ptr model, int32_t& group_count) { + const GraphViewer& graph_viewer, int32_t& group_count) { std::map, int32_t> opt_type_to_freq_map; -#if !defined(ORT_MINIMAL_BUILD) - if (model != nullptr) { - Graph& graph = model->MainGraph(); - for (auto& node : graph.Nodes()) { - if (node.Domain() == kMSDomain && (node.OpType() == "AdamWOptimizer" || node.OpType() == "SGDOptimizerV2")) { - auto domain_type_pair = std::make_pair(node.Domain(), node.OpType()); - if (opt_type_to_freq_map.find(domain_type_pair) == opt_type_to_freq_map.end()) { - opt_type_to_freq_map[domain_type_pair] = 0; - } - opt_type_to_freq_map[domain_type_pair] += 1; + for (const auto& node : graph_viewer.Nodes()) { + if (node.Domain() == kMSDomain && (node.OpType() == "AdamWOptimizer" || node.OpType() == "SGDOptimizerV2")) { + auto domain_type_pair = std::make_pair(node.Domain(), node.OpType()); + if (opt_type_to_freq_map.find(domain_type_pair) == opt_type_to_freq_map.end()) { + 
opt_type_to_freq_map[domain_type_pair] = 0; } + + opt_type_to_freq_map[domain_type_pair] += 1; } - } else { -#else - ORT_UNUSED_PARAMETER(model); -#endif - // TODO(baijumeswani): Figure out the best way to extract the optimizer type - // from the model (either onnx model or ort format model) or from the checkpoint. - // For now, assume that the optimizer type is AdamWOptimizer when using ort format models. - opt_type_to_freq_map[std::make_pair(kMSDomain, "AdamWOptimizer")] = 1; -#if !defined(ORT_MINIMAL_BUILD) } -#endif ORT_ENFORCE(opt_type_to_freq_map.size() == 1U, "Only support one type of optimizer algorithm, but got: " + std::to_string(opt_type_to_freq_map.size())); @@ -105,42 +92,6 @@ std::unique_ptr OptimizerAlorithmFactory::CreateInstance } } -std::unique_ptr OptimizerAlorithmFactory::CreateInstance( - const PathString& optim_path, int32_t& group_count) { - std::shared_ptr model = nullptr; -#if !defined(ORT_MINIMAL_BUILD) - if (!fbs::utils::IsOrtFormatModel(optim_path)) { - ORT_ENFORCE(Model::Load(optim_path, model, nullptr, - logging::LoggingManager::DefaultLogger()) - .IsOK()); - } -#else - ORT_UNUSED_PARAMETER(optim_path); -#endif - return CreateInstance(model, group_count); -} - -std::unique_ptr OptimizerAlorithmFactory::CreateInstance( - const uint8_t* optim_model_data, size_t optim_model_data_len, int32_t& group_count) { - std::shared_ptr model = nullptr; -#if !defined(ORT_MINIMAL_BUILD) - if (!fbs::utils::IsOrtFormatModelBytes(optim_model_data, static_cast(optim_model_data_len))) { - ONNX_NAMESPACE::ModelProto model_proto; - ORT_ENFORCE(model_proto.ParseFromArray(optim_model_data, static_cast(optim_model_data_len)) == true, - "Failed to load model because protobuf parsing failed."); - - ORT_ENFORCE(Model::Load(std::move(model_proto), model, nullptr, - logging::LoggingManager::DefaultLogger(), ModelOptions(true, true)) - .IsOK()); - } -#else - ORT_UNUSED_PARAMETER(optim_model_data); - ORT_UNUSED_PARAMETER(optim_model_data_len); -#endif - - return 
CreateInstance(model, group_count); -} - Status Optimizer::GenerateMomentumNamedStates(OptimizerCheckpointState& optimizer_checkpoint_states) { auto group_optimizer_state_it = optimizer_checkpoint_states.group_named_optimizer_states.find(GROUP_ZERO_NAME); @@ -280,17 +231,15 @@ void Optimizer::Initialize(const ModelIdentifiers& model_identifiers, auto optimizer_model = std::get>(model_identifiers.optim_model); // The above call to IsOptimizerModelAvailable() ensures that optimizer_model is not nullopt ORT_THROW_IF_ERROR(optim_sess_->Load(optimizer_model.value())); - optimizer_algo_ptr_ = OptimizerAlorithmFactory::CreateInstance(ToWideString(optimizer_model.value()), group_count_); } else { auto optimizer_model = std::get>(model_identifiers.optim_model); ORT_THROW_IF_ERROR(optim_sess_->Load(optimizer_model.data(), static_cast(optimizer_model.size()))); - optimizer_algo_ptr_ = OptimizerAlorithmFactory::CreateInstance(optimizer_model.data(), - optimizer_model.size(), - group_count_); } ORT_THROW_IF_ERROR(optim_sess_->Initialize()); + optimizer_algo_ptr_ = OptimizerAlorithmFactory::CreateInstance(optim_sess_->GetSessionState().GetGraphViewer(), + group_count_); // Make sure that the checkpoint state can copy tensors state_->optimizer_checkpoint_state.optimizer_session_data_transfer_mgr = &optim_sess_->GetDataTransferManager(); diff --git a/orttraining/orttraining/training_api/optimizer.h b/orttraining/orttraining/training_api/optimizer.h index 031b11426539b..5b908acf7c9e3 100644 --- a/orttraining/orttraining/training_api/optimizer.h +++ b/orttraining/orttraining/training_api/optimizer.h @@ -64,11 +64,8 @@ struct SGDOptimizerV2Algorithm : public OptimizerAlgorithmBase { }; struct OptimizerAlorithmFactory { - static std::unique_ptr CreateInstance(const PathString& optim_path, + static std::unique_ptr CreateInstance(const GraphViewer& graph_viewer, int32_t& group_count); - static std::unique_ptr CreateInstance(const uint8_t* optim_model_data, - size_t optim_model_data_len, 
int32_t& group_count); - static std::unique_ptr CreateInstance(std::shared_ptr model, int32_t& group_count); }; struct CheckpointState; From 0b90363acb8d8e9662621c811457393bbf11f309 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Thu, 14 Mar 2024 13:05:42 -0700 Subject: [PATCH 03/55] [MLAS][AArch64] SQ4BitGemm CompInt8 multi-block implementation (#19826) Update SQ4BitGemm CompInt8 implementation to process multiple blocks along a single column instead of processing single blocks from multiple columns. --- .../core/mlas/lib/sqnbitgemm_kernel_neon.cpp | 529 ++++++++++++------ .../test/mlas/bench/bench_sqnbitgemm.cpp | 91 ++- onnxruntime/test/mlas/bench/bench_util.cpp | 27 - onnxruntime/test/mlas/bench/bench_util.h | 6 - 4 files changed, 426 insertions(+), 227 deletions(-) diff --git a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp index c4c54a9be34d8..9d7b0ae06e220 100644 --- a/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp +++ b/onnxruntime/core/mlas/lib/sqnbitgemm_kernel_neon.cpp @@ -687,171 +687,314 @@ QuantizeARow_CompInt8( } } -template -MLAS_FORCEINLINE void -ComputeDotProducts_BlkBitWidth4_CompInt8( - size_t BlkLen, - const std::byte* QuantARowPtr, - const std::byte* QuantBDataColPtr, - const float* QuantBScaleColPtr, - const std::byte* QuantBZeroPointColPtr, - float* SumPtr, - size_t CountK, - size_t StrideQuantBData, - size_t StrideQuantBScale, - size_t StrideQuantBZeroPoint, - const float* BiasPtr +template +void +SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLen16( + const std::byte* QuantA, + const std::byte* QuantBData, + const float* QuantBScale, + const std::byte* QuantBZeroPoint, + float* C, + size_t CountN, + size_t BlockCountK, + const float* Bias ) { constexpr size_t BlkBitWidth = 4; + constexpr size_t BlkLen = 16; - static_assert(NCols == 1 || NCols == 4, "NCols must be 1 or 4"); - static_assert(SubBlkLen == 16 || SubBlkLen == 32, "SubBlkLen must 
be 16 or 32"); + float* CRowPtr = C; - assert(BlkLen >= SubBlkLen && BlkLen % SubBlkLen == 0); + const size_t StrideQuantBData = BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); + const size_t StrideQuantBScale = BlockCountK; + const size_t StrideQuantBZeroPoint = MlasQNBitZeroPointsForBlksSizeInBytes(BlockCountK); - [[maybe_unused]] const uint8x8_t LowMaskU8x8 = vdup_n_u8(0x0F); // only used if SubBlkLen == 16 - [[maybe_unused]] const uint8x16_t LowMaskU8x16 = vdupq_n_u8(0x0F); // only used if SubBlkLen == 32 + const float* BiasPtr = Bias; - const std::byte* QuantA = QuantARowPtr; + const std::byte* QuantBDataColPtr = QuantBData; + const float* QuantBScaleColPtr = QuantBScale; + const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; - const std::byte* QuantBData = QuantBDataColPtr; - const float* QuantBScale = QuantBScaleColPtr; - [[maybe_unused]] size_t QuantBZeroPointIdx = 0; // track half byte increments with this index instead of a pointer - // only used if HasZeroPoint == true + float* SumPtr = CRowPtr; - float32x4_t acc[NCols]{}; + const uint8x16_t LowMaskU8x16 = vdupq_n_u8(0x0F); + const uint8x8_t LowMaskU8x8 = vdup_n_u8(0x0F); - for (size_t k = 0; k < CountK; k += BlkLen) { - const size_t k_blk_len = std::min(CountK - k, BlkLen); + for (size_t n = 0; n < CountN; ++n) { + const std::byte* QuantAPtr = QuantA; + const std::byte* QuantBDataPtr = QuantBDataColPtr; + const float* QuantBScalePtr = QuantBScaleColPtr; + const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; - const float a_scale = Q8BlkScale(QuantA); - const int8_t* a_data = Q8BlkData(QuantA); + float32x4_t acc0{}, acc1{}; - float b_scale[NCols]; - UnrolledLoop([&](size_t i) { b_scale[i] = QuantBScale[i * StrideQuantBScale]; }); + size_t k_blks_remaining = BlockCountK; + for (; k_blks_remaining > 1; k_blks_remaining -= 2) { + const std::byte* QuantABlk0 = QuantAPtr; + const std::byte* QuantABlk1 = QuantABlk0 + Q8BlkSize(BlkLen); - [[maybe_unused]] int8_t b_zp[NCols]; // only 
used if HasZeroPoint == true - if constexpr (HasZeroPoint) { - UnrolledLoop([&](size_t i) { - const std::byte zp_packed = - QuantBZeroPointColPtr[i * StrideQuantBZeroPoint + QuantBZeroPointIdx / 2]; - b_zp[i] = ((QuantBZeroPointIdx & 1) == 1) - ? std::to_integer(zp_packed >> 4) - : std::to_integer(zp_packed & std::byte{0x0F}); - }); - } + // compute combined scale + const float32x4_t scale0 = vdupq_n_f32(Q8BlkScale(QuantABlk0) * QuantBScalePtr[0]); + const float32x4_t scale1 = vdupq_n_f32(Q8BlkScale(QuantABlk1) * QuantBScalePtr[1]); - for (size_t k_idx_in_blk = 0; k_idx_in_blk < k_blk_len; k_idx_in_blk += SubBlkLen) { - // load A row vector - int8x16_t av[SubBlkLen / 16]; - UnrolledLoop([&](size_t i) { - av[i] = vld1q_s8(a_data + k_idx_in_blk + i * 16); - }); + // load B zero point + const int8x16_t bzp0 = vdupq_n_s8( + HasZeroPoint ? std::to_integer(QuantBZeroPointPtr[0] & std::byte{0x0F}) : 8 + ); + const int8x16_t bzp1 = vdupq_n_s8( + HasZeroPoint ? std::to_integer(QuantBZeroPointPtr[0] >> 4) : 8 + ); - // load B column vectors - int8x16_t bv[NCols][SubBlkLen / 16]; + // load A + const int8x16_t av0 = vld1q_s8(Q8BlkData(QuantABlk0)); + const int8x16_t av1 = vld1q_s8(Q8BlkData(QuantABlk1)); - const size_t b_data_block_offset = k_idx_in_blk * BlkBitWidth / 8; + // load B + const uint8x16_t bv_packed01 = vld1q_u8(reinterpret_cast(QuantBDataPtr)); - if constexpr (SubBlkLen == 16) { - uint8x8_t bv_packed[NCols]; - UnrolledLoop([&](size_t i) { - bv_packed[i] = vld1_u8( - reinterpret_cast(QuantBData) + i * StrideQuantBData + b_data_block_offset - ); - }); + const uint8x16_t bv_lo01 = vandq_u8(bv_packed01, LowMaskU8x16); + const uint8x16_t bv_hi01 = vshrq_n_u8(bv_packed01, 4); - UnrolledLoop([&](size_t i) { - const int8x8_t lo = vreinterpret_s8_u8(vand_u8(bv_packed[i], LowMaskU8x8)); - const int8x8_t hi = vreinterpret_s8_u8(vshr_n_u8(bv_packed[i], 4)); - bv[i][0] = vcombine_s8(lo, hi); - }); - } else { - static_assert(SubBlkLen == 32); + int8x16_t bv0 = 
vreinterpretq_s8_u8(vcombine_u8(vget_low_u8(bv_lo01), vget_low_u8(bv_hi01))); + int8x16_t bv1 = vreinterpretq_s8_u8(vcombine_u8(vget_high_u8(bv_lo01), vget_high_u8(bv_hi01))); - uint8x16_t bv_packed[NCols]; - UnrolledLoop([&](size_t i) { - bv_packed[i] = vld1q_u8( - reinterpret_cast(QuantBData) + i * StrideQuantBData + b_data_block_offset - ); - }); + // subtract B zero point + bv0 = vsubq_s8(bv0, bzp0); + bv1 = vsubq_s8(bv1, bzp1); - UnrolledLoop([&](size_t i) { - bv[i][0] = vreinterpretq_s8_u8(vandq_u8(bv_packed[i], LowMaskU8x16)); - bv[i][1] = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed[i], 4)); - }); + // quantized dot product + const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0); + const int32x4_t dot1 = vdotq_s32(vdupq_n_s32(0), av1, bv1); + + // convert to float + const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); + const float32x4_t dot_f32_1 = vcvtq_f32_s32(dot1); + + // multiply by scale and update accumulator + acc0 = vfmaq_f32(acc0, dot_f32_0, scale0); + acc1 = vfmaq_f32(acc1, dot_f32_1, scale1); + + // increment block pointers + + QuantAPtr += Q8BlkSize(BlkLen) * 2; + QuantBDataPtr += 8 * 2; + QuantBScalePtr += 2; + if constexpr (HasZeroPoint) { + QuantBZeroPointPtr += 1; } + } + + if (k_blks_remaining > 0) { + const std::byte* QuantABlk0 = QuantAPtr; + + // compute combined scale + const float32x4_t scale0 = vdupq_n_f32(Q8BlkScale(QuantABlk0) * (*QuantBScalePtr)); + + // load B zero point + const int8x16_t bzp0 = vdupq_n_s8( + HasZeroPoint ? 
std::to_integer(QuantBZeroPointPtr[0] & std::byte{0x0F}) : 8 + ); + + // load A + const int8x16_t av0 = vld1q_s8(Q8BlkData(QuantABlk0)); + + // load B + const uint8x8_t bv_packed0 = vld1_u8(reinterpret_cast(QuantBDataPtr)); + + const uint8x8_t bv_lo0 = vand_u8(bv_packed0, LowMaskU8x8); + const uint8x8_t bv_hi0 = vshr_n_u8(bv_packed0, 4); + + int8x16_t bv0 = vreinterpretq_s8_u8(vcombine_u8(bv_lo0, bv_hi0)); // subtract B zero point - if constexpr (HasZeroPoint) { - UnrolledLoop([&](size_t i) { - const int8x16_t zp_v = vdupq_n_s8(b_zp[i]); - UnrolledLoop([&](size_t j) { - bv[i][j] = vsubq_s8(bv[i][j], zp_v); - }); - }); - } else { - const int8x16_t zp_v = vdupq_n_s8(8); + bv0 = vsubq_s8(bv0, bzp0); - UnrolledLoop([&](size_t i) { - UnrolledLoop([&](size_t j) { - bv[i][j] = vsubq_s8(bv[i][j], zp_v); - }); - }); - } + // quantized dot product + const int32x4_t dot0 = vdotq_s32(vdupq_n_s32(0), av0, bv0); - // compute quantized dot product - int32x4_t dot[NCols]{}; - UnrolledLoop([&](size_t i) { - UnrolledLoop([&](size_t j) { - dot[i] = vdotq_s32(dot[i], av[j], bv[i][j]); - }); - }); + // convert to float + const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); - // convert dot product result to float - float32x4_t dot_f32[NCols]; - UnrolledLoop([&](size_t i) { - dot_f32[i] = vcvtq_f32_s32(dot[i]); - }); + // multiply by scale and update accumulator + acc0 = vfmaq_f32(acc0, dot_f32_0, scale0); + } - // multiply dot product result by scale and update accumulator - UnrolledLoop([&](size_t i) { - const float32x4_t scale_v = vdupq_n_f32(a_scale * b_scale[i]); - acc[i] = vfmaq_f32(acc[i], dot_f32[i], scale_v); - }); + *SumPtr = vaddvq_f32(acc0) + vaddvq_f32(acc1); + if (BiasPtr) { + *SumPtr += *BiasPtr; } - // increment pointers to next block - QuantA += Q8BlkSize(BlkLen); - QuantBData += MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); - QuantBScale += 1; + // move to next column + + QuantBDataColPtr += StrideQuantBData; + QuantBScaleColPtr += StrideQuantBScale; if constexpr 
(HasZeroPoint) { - QuantBZeroPointIdx += 1; + QuantBZeroPointColPtr += StrideQuantBZeroPoint; } + + BiasPtr += BiasPtr != nullptr ? 1 : 0; + SumPtr += 1; } +} - if constexpr (NCols == 4) { - float32x4_t sum = FoldAccumulators(acc[0], acc[1], acc[2], acc[3]); +template +void +SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLen32( + const std::byte* QuantA, + const std::byte* QuantBData, + const float* QuantBScale, + const std::byte* QuantBZeroPoint, + float* C, + size_t CountN, + size_t BlockCountK, + const float* Bias +) +{ + constexpr size_t BlkBitWidth = 4; + constexpr size_t BlkLen = 32; - if (BiasPtr != nullptr) { - sum = vaddq_f32(sum, vld1q_f32(BiasPtr)); - } + float* CRowPtr = C; - vst1q_f32(SumPtr, sum); - } else { - for (size_t i = 0; i < NCols; ++i) { - SumPtr[i] = vaddvq_f32(acc[i]); - if (BiasPtr != nullptr) { - SumPtr[i] += BiasPtr[i]; + const size_t StrideQuantBData = BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); + const size_t StrideQuantBScale = BlockCountK; + const size_t StrideQuantBZeroPoint = MlasQNBitZeroPointsForBlksSizeInBytes(BlockCountK); + + const float* BiasPtr = Bias; + + const std::byte* QuantBDataColPtr = QuantBData; + const float* QuantBScaleColPtr = QuantBScale; + const std::byte* QuantBZeroPointColPtr = QuantBZeroPoint; + + float* SumPtr = CRowPtr; + + const uint8x16_t LowMaskU8x16 = vdupq_n_u8(0x0F); + + for (size_t n = 0; n < CountN; ++n) { + const std::byte* QuantAPtr = QuantA; + const std::byte* QuantBDataPtr = QuantBDataColPtr; + const float* QuantBScalePtr = QuantBScaleColPtr; + const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; + + float32x4_t acc0{}, acc1{}; + + size_t k_blks_remaining = BlockCountK; + for (; k_blks_remaining > 1; k_blks_remaining -= 2) { + const std::byte* QuantABlk0 = QuantAPtr; + const std::byte* QuantABlk1 = QuantABlk0 + Q8BlkSize(BlkLen); + + // compute combined scale + const float32x4_t scale0 = vdupq_n_f32(Q8BlkScale(QuantABlk0) * QuantBScalePtr[0]); + const float32x4_t scale1 = 
vdupq_n_f32(Q8BlkScale(QuantABlk1) * QuantBScalePtr[1]); + + // load B zero point + const int8x16_t bzp0 = vdupq_n_s8( + HasZeroPoint ? std::to_integer((*QuantBZeroPointPtr) & std::byte{0x0F}) : 8 + ); + const int8x16_t bzp1 = vdupq_n_s8( + HasZeroPoint ? std::to_integer((*QuantBZeroPointPtr) >> 4) : 8 + ); + + // load A + const int8x16_t av_lo0 = vld1q_s8(Q8BlkData(QuantABlk0)); + const int8x16_t av_hi0 = vld1q_s8(Q8BlkData(QuantABlk0) + 16); + const int8x16_t av_lo1 = vld1q_s8(Q8BlkData(QuantABlk1)); + const int8x16_t av_hi1 = vld1q_s8(Q8BlkData(QuantABlk1) + 16); + + // load B + const uint8x16_t bv_packed0 = vld1q_u8(reinterpret_cast(QuantBDataPtr)); + const uint8x16_t bv_packed1 = vld1q_u8(reinterpret_cast(QuantBDataPtr) + 16); + + int8x16_t bv_lo0 = vreinterpretq_s8_u8(vandq_u8(bv_packed0, LowMaskU8x16)); + int8x16_t bv_hi0 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed0, 4)); + int8x16_t bv_lo1 = vreinterpretq_s8_u8(vandq_u8(bv_packed1, LowMaskU8x16)); + int8x16_t bv_hi1 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed1, 4)); + + // subtract B zero point + bv_lo0 = vsubq_s8(bv_lo0, bzp0); + bv_hi0 = vsubq_s8(bv_hi0, bzp0); + bv_lo1 = vsubq_s8(bv_lo1, bzp1); + bv_hi1 = vsubq_s8(bv_hi1, bzp1); + + // quantized dot product + int32x4_t dot0{}, dot1{}; + dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0); + dot1 = vdotq_s32(vdotq_s32(dot1, av_lo1, bv_lo1), av_hi1, bv_hi1); + + // convert to float + const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); + const float32x4_t dot_f32_1 = vcvtq_f32_s32(dot1); + + // multiply by scale and update accumulator + acc0 = vfmaq_f32(acc0, dot_f32_0, scale0); + acc1 = vfmaq_f32(acc1, dot_f32_1, scale1); + + // increment block pointers + + QuantAPtr += Q8BlkSize(BlkLen) * 2; + QuantBDataPtr += 16 * 2; + QuantBScalePtr += 2; + if constexpr (HasZeroPoint) { + QuantBZeroPointPtr += 1; } } + + if (k_blks_remaining > 0) { + const std::byte* QuantABlk0 = QuantAPtr; + + // compute combined scale + const float32x4_t scale0 = 
vdupq_n_f32(Q8BlkScale(QuantABlk0) * (*QuantBScalePtr)); + + // load B zero point + const int8x16_t bzp0 = vdupq_n_s8( + HasZeroPoint ? std::to_integer((*QuantBZeroPointPtr) & std::byte{0x0F}) : 8 + ); + + // load A + const int8x16_t av_lo0 = vld1q_s8(Q8BlkData(QuantABlk0)); + const int8x16_t av_hi0 = vld1q_s8(Q8BlkData(QuantABlk0) + 16); + + // load B + const uint8x16_t bv_packed0 = vld1q_u8(reinterpret_cast(QuantBDataPtr)); + + int8x16_t bv_lo0 = vreinterpretq_s8_u8(vandq_u8(bv_packed0, LowMaskU8x16)); + int8x16_t bv_hi0 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed0, 4)); + + // subtract B zero point + bv_lo0 = vsubq_s8(bv_lo0, bzp0); + bv_hi0 = vsubq_s8(bv_hi0, bzp0); + + // quantized dot product + int32x4_t dot0{}; + dot0 = vdotq_s32(vdotq_s32(dot0, av_lo0, bv_lo0), av_hi0, bv_hi0); + + // convert to float + const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); + + // multiply by scale and update accumulator + acc0 = vfmaq_f32(acc0, dot_f32_0, scale0); + } + + *SumPtr = vaddvq_f32(acc0) + vaddvq_f32(acc1); + if (BiasPtr) { + *SumPtr += *BiasPtr; + } + + // move to next column + + QuantBDataColPtr += StrideQuantBData; + QuantBScaleColPtr += StrideQuantBScale; + if constexpr (HasZeroPoint) { + QuantBZeroPointColPtr += StrideQuantBZeroPoint; + } + + BiasPtr += BiasPtr != nullptr ? 
1 : 0; + SumPtr += 1; } } -template +template void -SQ4BitGemmM1Kernel_CompInt8_Impl( +SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLenGreaterThan32( size_t BlkLen, const std::byte* QuantA, const std::byte* QuantBData, @@ -859,17 +1002,16 @@ SQ4BitGemmM1Kernel_CompInt8_Impl( const std::byte* QuantBZeroPoint, float* C, size_t CountN, - size_t CountK, - size_t BlockStrideQuantB, + size_t BlockCountK, const float* Bias ) { constexpr size_t BlkBitWidth = 4; - const std::byte* QuantARowPtr = QuantA; - float* CRowPtr = C; + assert(BlkLen > 32); + assert(BlkLen % 32 == 0); - const size_t BlockCountK = BlockStrideQuantB; + float* CRowPtr = C; const size_t StrideQuantBData = BlockCountK * MlasQNBitBlkDataSizeInBytes(BlkBitWidth, BlkLen); const size_t StrideQuantBScale = BlockCountK; @@ -883,39 +1025,91 @@ SQ4BitGemmM1Kernel_CompInt8_Impl( float* SumPtr = CRowPtr; - int64_t nblk = static_cast(CountN) - NCols; + const uint8x16_t LowMaskU8x16 = vdupq_n_u8(0x0F); - while (nblk >= 0) { - ComputeDotProducts_BlkBitWidth4_CompInt8( - BlkLen, - QuantARowPtr, QuantBDataColPtr, QuantBScaleColPtr, QuantBZeroPointColPtr, SumPtr, CountK, - StrideQuantBData, StrideQuantBScale, StrideQuantBZeroPoint, - BiasPtr - ); + // process blocks in 32-element sub-blocks + const size_t SubBlksPerBlk = BlkLen / 32; - // move to next `NCols` columns + for (size_t n = 0; n < CountN; ++n) { + const std::byte* QuantAPtr = QuantA; + const std::byte* QuantBDataPtr = QuantBDataColPtr; + const float* QuantBScalePtr = QuantBScaleColPtr; + const std::byte* QuantBZeroPointPtr = QuantBZeroPointColPtr; - QuantBDataColPtr += NCols * StrideQuantBData; - QuantBScaleColPtr += NCols * StrideQuantBScale; - if constexpr (HasZeroPoint) { - QuantBZeroPointColPtr += NCols * StrideQuantBZeroPoint; - } + float32x4_t acc0{}, acc1{}; - BiasPtr += BiasPtr != nullptr ? 
NCols : 0; - SumPtr += NCols; + for (size_t k_blk_idx = 0; k_blk_idx < BlockCountK; ++k_blk_idx) { + // compute combined scale + const float32x4_t scale = vdupq_n_f32(Q8BlkScale(QuantAPtr) * (*QuantBScalePtr)); - nblk -= NCols; - } + // load B zero point + const int8x16_t bzp = [&]() -> int8x16_t { + if constexpr (HasZeroPoint) { + return vdupq_n_s8( + ((k_blk_idx & 1) == 0) ? std::to_integer((*QuantBZeroPointPtr) & std::byte{0x0F}) + : std::to_integer((*QuantBZeroPointPtr) >> 4) + ); + } else { + return vdupq_n_s8(8); + } + }(); + + const int8_t* QuantADataPtr = Q8BlkData(QuantAPtr); + + for (size_t sub_blk_idx = 0; sub_blk_idx < SubBlksPerBlk; sub_blk_idx += 2) { + // load A + const int8x16_t av0 = vld1q_s8(QuantADataPtr + 0); + const int8x16_t av1 = vld1q_s8(QuantADataPtr + 16); + const int8x16_t av2 = vld1q_s8(QuantADataPtr + 32); + const int8x16_t av3 = vld1q_s8(QuantADataPtr + 48); + + // load B + const uint8x16_t bv_packed0 = vld1q_u8(reinterpret_cast(QuantBDataPtr)); + const uint8x16_t bv_packed1 = vld1q_u8(reinterpret_cast(QuantBDataPtr) + 16); + + int8x16_t bv0 = vreinterpretq_s8_u8(vandq_u8(bv_packed0, LowMaskU8x16)); + int8x16_t bv1 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed0, 4)); + int8x16_t bv2 = vreinterpretq_s8_u8(vandq_u8(bv_packed1, LowMaskU8x16)); + int8x16_t bv3 = vreinterpretq_s8_u8(vshrq_n_u8(bv_packed1, 4)); + + // subtract B zero point + bv0 = vsubq_s8(bv0, bzp); + bv1 = vsubq_s8(bv1, bzp); + bv2 = vsubq_s8(bv2, bzp); + bv3 = vsubq_s8(bv3, bzp); + + // quantized dot product + int32x4_t dot0{}, dot1{}; + dot0 = vdotq_s32(vdotq_s32(dot0, av0, bv0), av1, bv1); + dot1 = vdotq_s32(vdotq_s32(dot1, av2, bv2), av3, bv3); + + // convert to float + const float32x4_t dot_f32_0 = vcvtq_f32_s32(dot0); + const float32x4_t dot_f32_1 = vcvtq_f32_s32(dot1); + + // multiply by scale and update accumulator + acc0 = vfmaq_f32(acc0, dot_f32_0, scale); + acc1 = vfmaq_f32(acc1, dot_f32_1, scale); + + // increment block data pointers to next sub-block + 
QuantADataPtr += 16 * 4; + QuantBDataPtr += 16 * 2; + } - // left over columns less than `NCols`? - nblk += NCols; - for (int64_t n = 0; n < nblk; ++n) { - ComputeDotProducts_BlkBitWidth4_CompInt8<1, SubBlkLen, HasZeroPoint>( - BlkLen, - QuantARowPtr, QuantBDataColPtr, QuantBScaleColPtr, QuantBZeroPointColPtr, SumPtr, CountK, - StrideQuantBData, StrideQuantBScale, StrideQuantBZeroPoint, - BiasPtr - ); + // increment other block pointers + + QuantAPtr += Q8BlkSize(BlkLen); + QuantBScalePtr += 1; + + if constexpr (HasZeroPoint) { + QuantBZeroPointPtr += ((k_blk_idx & 1) == 0) ? 0 : 1; + } + } + + *SumPtr = vaddvq_f32(acc0) + vaddvq_f32(acc1); + if (BiasPtr) { + *SumPtr += *BiasPtr; + } // move to next column @@ -940,26 +1134,34 @@ SQ4BitGemmM1Kernel_CompInt8_DispatchOnBlkLen( const std::byte* QuantBZeroPoint, float* C, size_t CountN, - size_t CountK, size_t BlockStrideQuantB, const float* Bias ) { if (BlkLen == 16) { - SQ4BitGemmM1Kernel_CompInt8_Impl<4, 16, HasZeroPoint>( - BlkLen, + SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLen16( + QuantA, + QuantBData, + QuantBScale, + QuantBZeroPoint, + C, + CountN, + BlockStrideQuantB, + Bias + ); + } else if (BlkLen == 32) { + SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLen32( QuantA, QuantBData, QuantBScale, QuantBZeroPoint, C, CountN, - CountK, BlockStrideQuantB, Bias ); } else { - SQ4BitGemmM1Kernel_CompInt8_Impl<4, 32, HasZeroPoint>( + SQ4BitGemmM1Kernel_CompInt8_Impl_BlkLenGreaterThan32( BlkLen, QuantA, QuantBData, @@ -967,7 +1169,6 @@ SQ4BitGemmM1Kernel_CompInt8_DispatchOnBlkLen( QuantBZeroPoint, C, CountN, - CountK, BlockStrideQuantB, Bias ); @@ -984,7 +1185,7 @@ SQ4BitGemmM1Kernel_CompInt8( const std::byte* QuantBZeroPoint, float* C, size_t CountN, - size_t CountK, + size_t /*CountK*/, size_t BlockStrideQuantB, const float* Bias ) @@ -998,7 +1199,6 @@ SQ4BitGemmM1Kernel_CompInt8( QuantBZeroPoint, C, CountN, - CountK, BlockStrideQuantB, Bias ); @@ -1011,7 +1211,6 @@ SQ4BitGemmM1Kernel_CompInt8( QuantBZeroPoint, C, CountN, - CountK, 
BlockStrideQuantB, Bias ); diff --git a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp b/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp index b7b453415838a..04f5947e1371c 100644 --- a/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp +++ b/onnxruntime/test/mlas/bench/bench_sqnbitgemm.cpp @@ -5,26 +5,30 @@ #include "mlas_qnbit.h" #include +#include #include #include #include "benchmark/benchmark.h" #include "bench_util.h" -#include "core/util/thread_utils.h" #include "core/common/narrow.h" +#include "core/util/thread_utils.h" +#include "core/platform/env_var_utils.h" using onnxruntime::narrow; template -void SQNBITGEMM(benchmark::State& state) { - const auto BlkLen = narrow(state.range(0)); - const auto M = narrow(state.range(1)); - const auto N = narrow(state.range(2)); - const auto K = narrow(state.range(3)); - const auto Threads = narrow(state.range(4)); - const auto Symmetric = narrow(state.range(5)); - const auto ComputeType = static_cast(state.range(6)); +void RunSQNBitGemmBenchmark(size_t BlkLen, + size_t M, size_t N, size_t K, + size_t Threads, + bool Symmetric, + MLAS_SQNBIT_GEMM_COMPUTE_TYPE ComputeType, + benchmark::State& state) { + if (!MlasIsSQNBitGemmAvailable(BlkBitWidth, BlkLen, ComputeType)) { + state.SkipWithMessage("SQNBitGemm is not available with the given configuration on the current machine."); + return; + } size_t QuantBDataSizeInBytes, QuantBScaleSize, QuantBZeroPointSizeInBytes; MlasBlockwiseQuantizedBufferSizes( @@ -88,28 +92,57 @@ void SQNBITGEMM(benchmark::State& state) { } } -static void SQ4BitGemmArgs(benchmark::internal::Benchmark* b) { - constexpr size_t BlkBitWidth = 4; +template +void SQNBITGEMM(benchmark::State& state) { + const auto BlkLen = narrow(state.range(0)); + const auto M = narrow(state.range(1)); + const auto N = narrow(state.range(2)); + const auto K = narrow(state.range(3)); + const auto Threads = narrow(state.range(4)); + const auto Symmetric = narrow(state.range(5)); + const auto ComputeType = 
static_cast(state.range(6)); + + RunSQNBitGemmBenchmark(BlkLen, M, N, K, Threads, Symmetric, ComputeType, state); +} + +// This test gets benchmark arguments from environment variables. +template +void SQNBITGEMM_ENV(benchmark::State& state) { + using onnxruntime::ParseEnvironmentVariableWithDefault; + + const auto BlkLen = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_BLKLEN", 32); + const auto M = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_M", 1); + const auto N = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_N", 4096); + const auto K = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_K", 4096); + const auto Threads = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_THREADS", 1); + const auto Symmetric = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_SYMMETRIC", true); + const auto ComputeType = ParseEnvironmentVariableWithDefault("ORT_SQNBITGEMM_COMPUTE_TYPE", + static_cast(CompFp32)); + + RunSQNBitGemmBenchmark(BlkLen, M, N, K, Threads, Symmetric, + static_cast(ComputeType), + state); + + std::ostringstream s; + s << "BlkBitWidth:" << BlkBitWidth << "/BlkLen:" << BlkLen + << "/M:" << M << "/N:" << N << "/K:" << K + << "/Threads:" << Threads << "/Symmetric:" << Symmetric << "/ComputeType:" << ComputeType; + state.SetLabel(s.str()); +} +static void SQNBitGemmArgs(benchmark::internal::Benchmark* b) { b->ArgNames({"BlkLen", "M", "N", "K", "Threads", "Symmetric", "ComputeType"}); - ArgsProductWithFilter(b, - - {{16, 32, 64, 128, 256}, // BlkLen - {1, 1024, 2048}, // M - {4096, 11008}, // N - {4096, 11008}, // K - {1, 8}, // Threads - {int64_t{false}, int64_t{true}}, // Symmetric - {int64_t{CompFp32}, int64_t{CompInt8}}}, // ComputeType - - [&](const std::vector& args) { - return MlasIsSQNBitGemmAvailable( - // BlkBitWidth, BlkLen - BlkBitWidth, narrow(args[0]), - // ComputeType - static_cast(args[6])); - }); + b->ArgsProduct({ + {16, 32, 64, 128, 256}, // BlkLen + {1, 1024, 2048}, // M + {4096, 11008}, // N + {4096, 11008}, // K + {1, 
8}, // Threads + {int64_t{false}, int64_t{true}}, // Symmetric + {int64_t{CompFp32}, int64_t{CompInt8}}, // ComputeType + }); } -BENCHMARK(SQNBITGEMM<4>)->Apply(SQ4BitGemmArgs)->UseRealTime(); +BENCHMARK(SQNBITGEMM<4>)->Apply(SQNBitGemmArgs)->UseRealTime(); +BENCHMARK(SQNBITGEMM_ENV<4>)->UseRealTime(); diff --git a/onnxruntime/test/mlas/bench/bench_util.cpp b/onnxruntime/test/mlas/bench/bench_util.cpp index d57564615b04e..6b59b7e01b46f 100644 --- a/onnxruntime/test/mlas/bench/bench_util.cpp +++ b/onnxruntime/test/mlas/bench/bench_util.cpp @@ -22,30 +22,3 @@ std::vector RandomVectorUniform(std::vector shape, float min_val } return RandomVectorUniform(static_cast(sz), min_value, max_value); } - -void ArgsProductWithFilter(benchmark::internal::Benchmark* bench, - const std::vector>& arglists, - std::function& args)> include_filter) { - std::vector indices(arglists.size(), 0); - const std::size_t total = std::accumulate( - std::begin(arglists), std::end(arglists), std::size_t{1}, - [](const std::size_t res, const std::vector& arglist) { - return res * arglist.size(); - }); - std::vector args; - args.reserve(arglists.size()); - for (std::size_t i = 0; i < total; i++) { - for (std::size_t arg = 0; arg < arglists.size(); arg++) { - args.push_back(arglists[arg][indices[arg]]); - } - if (include_filter(args)) { - bench->Args(args); - } - args.clear(); - - std::size_t arg = 0; - do { - indices[arg] = (indices[arg] + 1) % arglists[arg].size(); - } while (indices[arg++] == 0 && arg < arglists.size()); - } -} diff --git a/onnxruntime/test/mlas/bench/bench_util.h b/onnxruntime/test/mlas/bench/bench_util.h index ee2ec42d0f755..f96dd5c673b3d 100644 --- a/onnxruntime/test/mlas/bench/bench_util.h +++ b/onnxruntime/test/mlas/bench/bench_util.h @@ -8,12 +8,6 @@ #include #include -// Specifies benchmark arguments from the cartesian product of `arglists`, like Benchmark::ArgsProduct(). -// `include_filter` is called to determine whether a given set of arguments should be included. 
-void ArgsProductWithFilter(benchmark::internal::Benchmark* bench, - const std::vector>& arglists, - std::function& args)> include_filter); - template std::vector RandomVectorUniform( size_t N, From a2ffc3740b4b3810418770e3275cea487f4c3b7e Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Thu, 14 Mar 2024 13:48:37 -0700 Subject: [PATCH 04/55] [Cuda] Demo multiple cuda graphs and user compute stream (#19883) Update stable diffusion demo to add options `--max-cuda-graphs` and `--user-compute-stream`. * Add python class GpuBindingManager to manage IO Binding based on input shape and max number of cuda graphs setting. The benefit is that one inference session could enable or disable cuda graph in different runs. * When `--user-compute-stream`, the demo will use custom compute stream. --- .../tools/transformers/io_binding_helper.py | 124 +++++++++++++++--- .../models/stable_diffusion/demo_txt2img.py | 24 +++- .../stable_diffusion/demo_txt2img_xl.py | 32 +++-- .../models/stable_diffusion/demo_utils.py | 21 +-- .../engine_builder_ort_cuda.py | 41 ++++-- .../pipeline_stable_diffusion.py | 8 +- 6 files changed, 201 insertions(+), 49 deletions(-) diff --git a/onnxruntime/python/tools/transformers/io_binding_helper.py b/onnxruntime/python/tools/transformers/io_binding_helper.py index 50703b9c17e03..58a49525b9199 100644 --- a/onnxruntime/python/tools/transformers/io_binding_helper.py +++ b/onnxruntime/python/tools/transformers/io_binding_helper.py @@ -1,3 +1,4 @@ +import copy import logging from collections import OrderedDict from typing import Any, Dict, List, Tuple, Union @@ -5,7 +6,7 @@ import numpy import torch -from onnxruntime import InferenceSession +from onnxruntime import InferenceSession, RunOptions logger = logging.getLogger(__name__) @@ -227,7 +228,6 @@ def __del__(self): del self.input_tensors del self.output_tensors del self.io_binding - del self.ort_session def allocate_buffers(self, shape_dict: Dict[str, Union[Tuple[int], List[int]]]): """Allocate tensors for I/O 
Binding""" @@ -276,7 +276,7 @@ def allocate_buffers(self, shape_dict: Dict[str, Union[Tuple[int], List[int]]]): tensor.data_ptr(), ) - def infer(self, feed_dict: Dict[str, torch.Tensor]): + def infer(self, feed_dict: Dict[str, torch.Tensor], run_options: RunOptions = None, synchronize: bool = False): """Bind input tensors and run inference""" for name, tensor in feed_dict.items(): assert isinstance(tensor, torch.Tensor) and tensor.is_contiguous() @@ -285,16 +285,7 @@ def infer(self, feed_dict: Dict[str, torch.Tensor]): assert self.input_tensors[name].nelement() == tensor.nelement() assert self.input_tensors[name].dtype == tensor.dtype assert tensor.device.type == "cuda" - # Please install cuda-python package with a version corresponding to CUDA in your machine. - from cuda import cudart - - # Update input tensor inplace since cuda graph requires input and output has fixed memory address. - cudart.cudaMemcpy( - self.input_tensors[name].data_ptr(), - tensor.data_ptr(), - tensor.element_size() * tensor.nelement(), - cudart.cudaMemcpyKind.cudaMemcpyDeviceToDevice, - ) + self.input_tensors[name].copy_(tensor) else: self.io_binding.bind_input( name, @@ -305,14 +296,115 @@ def infer(self, feed_dict: Dict[str, torch.Tensor]): tensor.data_ptr(), ) - self.ort_session.run_with_iobinding(self.io_binding) + # Synchronization are not needed in most cases unless different streams are used or inputs/outputs are in CPU. 
+ if synchronize: + self.io_binding.synchronize_inputs() + self.ort_session.run_with_iobinding(self.io_binding, run_options) + self.io_binding.synchronize_outputs() + else: + self.ort_session.run_with_iobinding(self.io_binding, run_options) return self.output_tensors @staticmethod - def get_cuda_provider_options(device_id: int, enable_cuda_graph: bool) -> Dict[str, Any]: - return { + def get_cuda_provider_options(device_id: int, enable_cuda_graph: bool, stream: int = 0) -> Dict[str, Any]: + options = { "device_id": device_id, "arena_extend_strategy": "kSameAsRequested", "enable_cuda_graph": enable_cuda_graph, } + + # Stream is address of a CUDA stream. 0 means the default stream. + if stream != 0: + options["user_compute_stream"] = str(stream) + + return options + + +class GpuBinding(CudaSession): + def __init__( + self, + ort_session: InferenceSession, + device: torch.device, + shape_dict: Dict[str, Union[Tuple[int], List[int]]], + enable_gpu_graph: bool = False, + gpu_graph_id: int = -1, + stream: int = 0, + ): + super().__init__(ort_session, device, enable_gpu_graph) + self.allocate_buffers(shape_dict) + self.gpu_graph_id = gpu_graph_id + # For cuda graph, we need to keep a copy of shape_dict to check if the shape is same in inference later. + self.shape_dict = copy.deepcopy(shape_dict) if enable_gpu_graph else None + self.stream = stream + # The gpu graph id of last run. It will be saved to image metadata. 
+ self.last_run_gpu_graph_id = None + + def get_run_options(self, disable_cuda_graph_in_run: bool = False) -> RunOptions: + options = RunOptions() + + gpu_graph_id = -1 if disable_cuda_graph_in_run else self.gpu_graph_id + + options.add_run_config_entry("gpu_graph_id", str(gpu_graph_id)) + + self.last_run_gpu_graph_id = gpu_graph_id + + return options + + def infer(self, feed_dict: Dict[str, torch.Tensor], disable_cuda_graph_in_run: bool = False): + run_options = self.get_run_options(disable_cuda_graph_in_run) + + if self.stream: + run_options.add_run_config_entry("disable_synchronize_execution_providers", "1") + + return super().infer(feed_dict, run_options) + + +class GpuBindingManager: + """A manager for I/O bindings that support multiple CUDA Graphs. + One cuda graph is reused for same input shape. Automatically add a new cuda graph for new input shape. + """ + + def __init__(self, ort_session: InferenceSession, device: torch.device, stream: int = 0, max_cuda_graphs: int = 1): + self.ort_session = ort_session + self.device = device + + # Binding supports cuda graphs. For a binding, it is able to disable cuda graph for a specific run. + self.graph_bindings = [] + + # Binding for not using cuda graph. + self.no_graph_binding = None + + self.stream = stream + + self.max_cuda_graphs = max_cuda_graphs + + def get_binding( + self, + shape_dict: Dict[str, Union[Tuple[int], List[int]]], + use_cuda_graph: bool = False, + ) -> GpuBinding: + for gpu_graph_binding in self.graph_bindings: + # Found a cuda graph that captured with the same shape + if gpu_graph_binding.shape_dict == shape_dict: + return gpu_graph_binding + + # Reached the maximum number of cuda graphs. Return a binding without cuda graph. 
+ if len(self.graph_bindings) >= self.max_cuda_graphs or (not use_cuda_graph): + if self.no_graph_binding is None: + self.no_graph_binding = GpuBinding(self.ort_session, self.device, shape_dict, stream=self.stream) + else: + self.no_graph_binding.allocate_buffers(shape_dict) + return self.no_graph_binding + + # This is a new input shape, create a new cuda graph + gpu_graph_binding = GpuBinding( + self.ort_session, + self.device, + shape_dict, + enable_gpu_graph=True, + gpu_graph_id=len(self.graph_bindings), + stream=self.stream, + ) + self.graph_bindings.append(gpu_graph_binding) + return gpu_graph_binding diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py index 2cd64e8784c6b..a3caba138f44a 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img.py @@ -32,13 +32,8 @@ repeat_prompt, ) -if __name__ == "__main__": - coloredlogs.install(fmt="%(funcName)20s: %(message)s") - - parser = arg_parser("Options for Stable Diffusion Demo") - add_controlnet_arguments(parser) - args = parse_arguments(is_xl=False, parser=parser) +def main(args): controlnet_images, controlnet_scale = process_controlnet_arguments(args) pipeline, refiner = load_pipelines(args) @@ -88,3 +83,20 @@ def run_inference(warmup=False): pipeline.save_images(images, prompt, negative_prompt, metadata) pipeline.teardown() + + +if __name__ == "__main__": + coloredlogs.install(fmt="%(funcName)20s: %(message)s") + + parser = arg_parser("Options for Stable Diffusion Demo") + add_controlnet_arguments(parser) + args = parse_arguments(is_xl=False, parser=parser) + + if args.user_compute_stream: + import torch + + s = torch.cuda.Stream() + with torch.cuda.stream(s): + main(args) + else: + main(args) diff --git 
a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py index 19bbb45d77c93..24fa6a2c51343 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_txt2img_xl.py @@ -132,9 +132,11 @@ def run_demo(args): def run_dynamic_shape_demo(args): - """Run demo of generating images with different settings with ORT CUDA provider.""" + """ + Run demo of generating images with different settings with ORT CUDA provider. + Try "python demo_txt2img_xl.py --max-cuda-graphs 3 --user-compute-stream" to see the effect of multiple CUDA graphs. + """ args.engine = "ORT_CUDA" - args.disable_cuda_graph = True base, refiner = load_pipelines(args, 1) prompts = [ @@ -216,7 +218,6 @@ def run_dynamic_shape_demo(args): def run_turbo_demo(args): """Run demo of generating images with test prompts with ORT CUDA provider.""" args.engine = "ORT_CUDA" - args.disable_cuda_graph = True base, refiner = load_pipelines(args, 1) from datasets import load_dataset @@ -239,13 +240,7 @@ def run_turbo_demo(args): refiner.teardown() -if __name__ == "__main__": - coloredlogs.install(fmt="%(funcName)20s: %(message)s") - - parser = arg_parser("Options for Stable Diffusion XL Demo") - add_controlnet_arguments(parser) - args = parse_arguments(is_xl=True, parser=parser) - +def main(args): no_prompt = isinstance(args.prompt, list) and len(args.prompt) == 1 and not args.prompt[0] if no_prompt: if args.version == "xl-turbo": @@ -254,3 +249,20 @@ def run_turbo_demo(args): run_dynamic_shape_demo(args) else: run_demo(args) + + +if __name__ == "__main__": + coloredlogs.install(fmt="%(funcName)20s: %(message)s") + + parser = arg_parser("Options for Stable Diffusion XL Demo") + add_controlnet_arguments(parser) + args = parse_arguments(is_xl=True, parser=parser) + + if args.user_compute_stream: + import torch + + 
s = torch.cuda.Stream() + with torch.cuda.stream(s): + main(args) + else: + main(args) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py index 369f31511faca..a50940933eb82 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/demo_utils.py @@ -23,7 +23,7 @@ import os import sys from importlib.metadata import PackageNotFoundError, version -from typing import Any, Dict, List +from typing import Any, Dict, List, Optional import controlnet_aux import cv2 @@ -246,6 +246,8 @@ def parse_arguments(is_xl: bool, parser): group = parser.add_argument_group("Options for ORT_CUDA engine only") group.add_argument("--enable-vae-slicing", action="store_true", help="True will feed only one image to VAE once.") + group.add_argument("--max-cuda-graphs", type=int, default=1, help="Max number of cuda graphs to use. 
Default 1.") + group.add_argument("--user-compute-stream", action="store_true", help="Use user compute stream.") # TensorRT only options group = parser.add_argument_group("Options for TensorRT (--engine=TRT) only") @@ -400,15 +402,16 @@ def initialize_pipeline( max_image_size: int = 1024, max_batch_size: int = 16, opt_batch_size: int = 1, - build_all_tactics=False, - do_classifier_free_guidance=False, - lcm=False, + build_all_tactics: bool = False, + do_classifier_free_guidance: bool = False, + lcm: bool = False, controlnet=None, lora_weights=None, - lora_scale=1.0, - use_fp16_vae=True, - use_vae=True, - framework_model_dir=None, + lora_scale: float = 1.0, + use_fp16_vae: bool = True, + use_vae: bool = True, + framework_model_dir: Optional[str] = None, + max_cuda_graphs: int = 1, ): pipeline_info = PipelineInfo( version, @@ -465,6 +468,7 @@ def initialize_pipeline( tmp_dir=os.path.join(work_dir or ".", engine_type.name, pipeline_info.short_name(), "tmp"), device_id=torch.cuda.current_device(), import_engine_dir=import_engine_dir, + max_cuda_graphs=max_cuda_graphs, ) elif engine_type == EngineType.ORT_TRT: pipeline.backend.build_engines( @@ -562,6 +566,7 @@ def load_pipelines(args, batch_size=None): "use_fp16_vae": "xl" in args.version, "use_vae": True, "framework_model_dir": args.framework_model_dir, + "max_cuda_graphs": args.max_cuda_graphs, } if "xl" in args.version: diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py index 6ab4858f11f23..56012e223b18c 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/engine_builder_ort_cuda.py @@ -6,7 +6,7 @@ import gc import logging import os -from typing import List, Optional +from typing import Dict, List, Optional import onnx import torch @@ -15,25 +15,25 @@ from 
packaging import version import onnxruntime as ort -from onnxruntime.transformers.io_binding_helper import CudaSession +from onnxruntime.transformers.io_binding_helper import CudaSession, GpuBindingManager from onnxruntime.transformers.onnx_model import OnnxModel logger = logging.getLogger(__name__) -class OrtCudaEngine(CudaSession): +class OrtCudaEngine: def __init__( self, onnx_path, device_id: int = 0, enable_cuda_graph: bool = False, disable_optimization: bool = False, + max_cuda_graphs: int = 1, ): self.onnx_path = onnx_path self.provider = "CUDAExecutionProvider" - self.provider_options = CudaSession.get_cuda_provider_options(device_id, enable_cuda_graph) - # self.provider_options["enable_skip_layer_norm_strict_mode"] = True - + self.stream = torch.cuda.current_stream().cuda_stream + self.provider_options = CudaSession.get_cuda_provider_options(device_id, enable_cuda_graph, self.stream) session_options = ort.SessionOptions() # When the model has been optimized by onnxruntime, we can disable optimization to save session creation time. @@ -52,10 +52,33 @@ def __init__( logger.info("created CUDA EP session for %s", onnx_path) device = torch.device("cuda", device_id) - super().__init__(ort_session, device, enable_cuda_graph) + self.enable_cuda_graph = enable_cuda_graph + + # Support multiple CUDA graphs for different input shapes. + # For clip2 model that disabled cuda graph, max_cuda_graphs is updated to 0 here. 
+ self.gpu_binding_manager = GpuBindingManager( + ort_session=ort_session, + device=device, + stream=self.stream, + max_cuda_graphs=max_cuda_graphs if enable_cuda_graph else 0, + ) + + self.current_gpu_binding = None + + def metadata(self, name: str): + data = {} + if self.current_gpu_binding is not None: + if self.current_gpu_binding.last_run_gpu_graph_id >= 0: + data[f"{name}.gpu_graph_id"] = self.current_gpu_binding.last_run_gpu_graph_id + return data + + def infer(self, feed_dict: Dict[str, torch.Tensor]): + return self.current_gpu_binding.infer(feed_dict=feed_dict, disable_cuda_graph_in_run=not self.enable_cuda_graph) def allocate_buffers(self, shape_dict, device): - super().allocate_buffers(shape_dict) + self.current_gpu_binding = self.gpu_binding_manager.get_binding( + shape_dict=shape_dict, use_cuda_graph=self.enable_cuda_graph + ) class _ModelConfig: @@ -220,6 +243,7 @@ def build_engines( device_id: int = 0, save_fp32_intermediate_model: bool = False, import_engine_dir: Optional[str] = None, + max_cuda_graphs: int = 1, ): self.torch_device = torch.device("cuda", device_id) self.load_models(framework_model_dir) @@ -352,6 +376,7 @@ def build_engines( device_id=device_id, enable_cuda_graph=use_cuda_graph, disable_optimization=False, + max_cuda_graphs=max_cuda_graphs, ) logger.info("%s options for %s: %s", engine.provider, model_name, engine.provider_options) diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py index 0ad8b13b6091c..1629537dc294f 100644 --- a/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py +++ b/onnxruntime/python/tools/transformers/models/stable_diffusion/pipeline_stable_diffusion.py @@ -547,7 +547,7 @@ def pt_to_numpy(images: torch.FloatTensor): return ((images + 1) / 2).clamp(0, 1).detach().permute(0, 2, 3, 1).float().cpu().numpy() def metadata(self) -> 
Dict[str, Any]: - return { + data = { "actual_steps": self.actual_steps, "seed": self.get_current_seed(), "name": self.pipeline_info.name(), @@ -555,6 +555,12 @@ def metadata(self) -> Dict[str, Any]: "custom_unet": self.pipeline_info.custom_unet(), } + if self.engine_type == EngineType.ORT_CUDA: + for engine_name, engine in self.backend.engines.items(): + data.update(engine.metadata(engine_name)) + + return data + def save_images(self, images: List, prompt: List[str], negative_prompt: List[str], metadata: Dict[str, Any]): session_id = str(random.randint(1000, 9999)) for i, image in enumerate(images): From 8b766bd24e3637e228c336ddacd3808eaf75857a Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 14 Mar 2024 15:07:56 -0700 Subject: [PATCH 05/55] Change nuget pipeline's "Windows_Packaging_combined_GPU" job to download TRT binaries in every build (#19919) ### Description Change nuget pipeline's "Final_Jar_Testing_Windows_GPU" job to download TRT binaries in every build. Now all the other build jobs are already doing this. This is the only one left. 
Similar to #19909 ### Motivation and Context As a follow up of #19118 --- .../c-api-noopenmp-packaging-pipelines.yml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml index 01f316dbbaaef..a63f1b74b7633 100644 --- a/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml +++ b/tools/ci_build/github/azure-pipelines/c-api-noopenmp-packaging-pipelines.yml @@ -506,12 +506,11 @@ stages: condition: always() - script: dir $(Build.SourcesDirectory) - - task: BatchScript@1 - displayName: 'setup env' - inputs: - filename: '$(Build.SourcesDirectory)\onnxruntime\tools\ci_build\github\windows\setup_env_gpu.bat' - modifyEnvironment: true - workingFolder: '$(Build.BinariesDirectory)' + - template: templates/jobs/download_win_gpu_library.yml + parameters: + CudaVersion: ${{ parameters.CudaVersion }} + DownloadCUDA: true + DownloadTRT: true - template: templates/set-version-number-variables-step.yml parameters: versionFileDirectory: '$(Build.SourcesDirectory)\onnxruntime' From 87a9f77c56412f73da61699888033c2a6523f31b Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 15 Mar 2024 06:47:41 +0800 Subject: [PATCH 06/55] Refactor Python Packaing Pipeline (Training Cuda 11.8) (#19910) ### Description 1. Use stage to organize the pipeline and split building and testing 2. Move compilation on CPU machine 3. test stage can leverage existing artifacts 4. check wheel size, it gives warning if the size above 300M 5. docker image name wasn't change even the argument changed, which caused the docker image was always rebuilt. So update the docker image name according to the argument can save the docker build time. 
Pipeline duration reduced by 60% (2 hours -> 50 minutes) Compilation time reduced by 75% (1.5hours -> 20 minutes) GPU time reduced by 87% ( 8 hours to 1 hours) for debugging, the GPU time could be reduced by above 95%, because we can choose run only one test stage and skip building. ### Motivation and Context Make the pipeline efficient. Optimized https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=424177&view=results Curent https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=422393&view=results --------- --- ...orttraining-py-packaging-pipeline-cuda.yml | 13 + ...py-packaging-training-cuda-stage-steps.yml | 229 ++++++++++++++++++ .../py-packaging-training-cuda-stage.yml | 215 +++------------- 3 files changed, 279 insertions(+), 178 deletions(-) create mode 100644 tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml index 47b1e0933417e..539a61c021cfb 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cuda.yml @@ -8,6 +8,17 @@ resources: name: pypa/manylinux ref: 5eda9aded5462201e6310105728d33016e637ea7 +parameters: + - name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false + + - name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' + stages: - template: templates/py-packaging-training-cuda-stage.yml parameters: @@ -20,3 +31,5 @@ stages: agent_pool: Onnxruntime-Linux-GPU upload_wheel: 'yes' debug_build: false + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml 
b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml new file mode 100644 index 0000000000000..91d7b9f219f76 --- /dev/null +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml @@ -0,0 +1,229 @@ +parameters: + build_py_parameters: '' + torch_version: '' + opset_version: '' + cuda_version: '' + cmake_cuda_architectures: '' + docker_file: '' + upload_wheel: '' + debug_build: '' + python_version: '' + stage_name: '' + SpecificArtifact: false + BuildId: '0' + +stages: + - stage: Build_${{ parameters.stage_name }} + variables: + - name: isMain + value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }} + - name: finalStorage + ${{ if eq(variables['isMain'], 'true') }}: + value: '--final_storage' + ${{ else }}: + value: '' + - name: buildConfig + ${{ if eq(parameters['debug_build'], 'true') }}: + value: 'Debug' + ${{ else }}: + value: 'Release' + - name: PythonVersion + value: ${{ parameters.python_version }} + - name: Repository + value: onnxruntimetraininggpubuild_${{ parameters.python_version }} + dependsOn: [] + + jobs: + - job: Build + pool: onnxruntime-Ubuntu2204-AMD-CPU + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - task: CmdLine@2 + displayName: 'check variables' + inputs: + script: | + echo "Branch is "${{ variables['Build.SourceBranch'] }} && \ + echo "isMain is "${{ variables['isMain'] }} && \ + echo "final_storage is "${{ variables['finalStorage'] }} + + - checkout: self + clean: true + submodules: recursive + + - template: set-python-manylinux-variables-step.yml + + - template: get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }} + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: >- + --build-arg 
TORCH_VERSION=${{ parameters.torch_version }} + --build-arg OPSET_VERSION=${{ parameters.opset_version }} + --build-arg PYTHON_VERSION=${{ parameters.python_version }} + --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu + --build-arg BUILD_UID=$(id -u) + --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 + --build-arg DEVTOOLSET_ROOTPATH=/usr + --build-arg PREPEND_PATH=/usr/local/cuda/bin: + --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 + Repository: $(Repository) + + - task: CmdLine@2 + displayName: 'build onnxruntime' + inputs: + script: | + set -e -x + mkdir -p $HOME/.onnx + docker run --rm -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ + --volume /data/onnx:/data/onnx:ro \ + --volume $(Build.SourcesDirectory):/onnxruntime_src \ + --volume $(Build.BinariesDirectory):/build \ + --volume /data/models:/build/models:ro \ + --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ + -e NVIDIA_VISIBLE_DEVICES=all \ + -e NIGHTLY_BUILD \ + -e DEFAULT_TRAINING_PACKAGE_DEVICE \ + -e BUILD_BUILDNUMBER \ + -e ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION \ + $(Repository) \ + $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ + --build_dir /build \ + --config ${{ variables['buildConfig'] }} \ + --skip_submodule_sync \ + --parallel --use_binskim_compliant_compile_flags \ + --build_wheel \ + --enable_onnx_tests \ + ${{ parameters.build_py_parameters }} \ + --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' onnxruntime_BUILD_UNIT_TESTS=OFF \ + --use_cuda --cuda_version=${{ parameters.cuda_version }} --cuda_home=/usr/local/cuda-${{ parameters.cuda_version }} --cudnn_home=/usr/local/cuda-${{ parameters.cuda_version }}; + workingDirectory: 
$(Build.SourcesDirectory) + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + SourceFolder: '$(Build.BinariesDirectory)' + Contents: "${{ variables['buildConfig'] }}/dist/*.whl" + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation' + inputs: + ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}" + + - template: component-governance-component-detection-steps.yml + parameters: + condition: 'succeeded' + + - template: clean-agent-build-directory-step.yml + + - stage: Test_${{ parameters.stage_name }} + variables: + - name: isMain + value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }} + - name: finalStorage + ${{ if eq(variables['isMain'], 'true') }}: + value: '--final_storage' + ${{ else }}: + value: '' + - name: buildConfig + ${{ if eq(parameters['debug_build'], 'true') }}: + value: 'Debug' + ${{ else }}: + value: 'Release' + - name: PythonVersion + value: ${{ parameters.python_version }} + - name: Repository + value: onnxruntimetraininggpubuild_${{ parameters.python_version }} + dependsOn: Build_${{ parameters.stage_name }} + jobs: + - job: Test_GPU + pool: Onnxruntime-Linux-GPU + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - template: set-python-manylinux-variables-step.yml + + - template: flex-downloadPipelineArtifact.yml + parameters: + ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}_${{ parameters.python_version }}" + StepName: 'Download Pipeline Artifact - Linux Training Build' + TargetPath: '$(Build.ArtifactStagingDirectory)' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ 
parameters.BuildId }} + + - script: | + set -e -x + whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \ + echo $whlfilename ; du -sh $whlfilename ; \ + (( $(wc -c < "$whlfilename") - 300*1024*1024 < 0 )) || ( echo 'Wheel size bigger than 300M'; exit 1) + displayName: 'Check wheel size' + continueOnError: true + + - template: get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }} + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: >- + --build-arg TORCH_VERSION=${{ parameters.torch_version }} + --build-arg OPSET_VERSION=${{ parameters.opset_version }} + --build-arg PYTHON_VERSION=${{ parameters.python_version }} + --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu + --build-arg BUILD_UID=$(id -u) + --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 + --build-arg DEVTOOLSET_ROOTPATH=/usr + --build-arg PREPEND_PATH=/usr/local/cuda/bin: + --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 + Repository: $(Repository) + + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" + displayName: 'Mount MNIST' + condition: succeededOrFailed() + workingDirectory: $(Build.SourcesDirectory) + + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data" + displayName: 'Mount bert-data' + condition: succeededOrFailed() + workingDirectory: $(Build.SourcesDirectory) + + - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" + displayName: 'Mount hf-models-cache' + condition: 
succeededOrFailed() + workingDirectory: $(Build.SourcesDirectory) + + - task: CmdLine@2 + displayName: 'test ortmodule' + inputs: + script: | + set -ex ; \ + whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \ + echo $whlfilename ; \ + basefilename=$(basename $whlfilename) ; \ + docker run --rm \ + --gpus all \ + -e NVIDIA_VISIBLE_DEVICES=all \ + --volume $(Build.ArtifactStagingDirectory):/build \ + --volume /mnist:/mnist \ + --volume /bert_data:/bert_data \ + --volume /hf_models_cache:/hf_models_cache \ + $(Repository) \ + bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/Release/dist/$basefilename && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install " ; + workingDirectory: $(Build.SourcesDirectory) + + - task: CmdLine@2 + displayName: 'Upload wheel' + condition: and(succeeded(), and(eq(variables['UploadWheel'], 'yes'), ne(variables['ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION'], 'true'))) + inputs: + script: | + set -e -x + whlfilename=$(ls $(Build.ArtifactStagingDirectory)/Release/dist/*.whl | head -n 1) ; \ + python3 tools/ci_build/upload_python_package_to_azure_storage.py \ + --python_wheel_path $whlfilename ${{ variables['finalStorage'] }} diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml index c6921e151a029..f7ecc3cf84e48 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage.yml @@ -47,183 +47,42 @@ parameters: type: boolean default: false -stages: -- stage: "Cuda_Python_Packaging_debug_${{ parameters.debug_build }}" - - variables: - - name: isMain - value: ${{ or(eq(variables['Build.SourceBranch'], 'refs/heads/main'), startsWith(variables['Build.SourceBranch'], 'refs/heads/rel-')) }} - - name: 
finalStorage - ${{ if eq(variables['isMain'], 'true') }}: - value: '--final_storage' - ${{ else }}: - value: '' - - name: buildConfig - ${{ if eq(parameters['debug_build'], 'true') }}: - value: 'Debug' - ${{ else }}: - value: 'Release' - - dependsOn: [] - - jobs: - - job: Linux_py_Training_Cuda_Wheels - timeoutInMinutes: 180 - workspace: - clean: all - pool: ${{ parameters.agent_pool }} - strategy: - matrix: - Python38: - PythonVersion: '3.8' - TorchVersion: ${{ parameters.torch_version }} - OpsetVersion: ${{ parameters.opset_version }} - CudaVersion: ${{ parameters.cuda_version }} - UploadWheel: ${{ parameters.upload_wheel }} - Python39: - PythonVersion: '3.9' - TorchVersion: ${{ parameters.torch_version }} - OpsetVersion: ${{ parameters.opset_version }} - CudaVersion: ${{ parameters.cuda_version }} - UploadWheel: ${{ parameters.upload_wheel }} - Python310: - PythonVersion: '3.10' - TorchVersion: ${{ parameters.torch_version }} - OpsetVersion: ${{ parameters.opset_version }} - CudaVersion: ${{ parameters.cuda_version }} - UploadWheel: ${{ parameters.upload_wheel }} - Python311: - PythonVersion: '3.11' - TorchVersion: ${{ parameters.torch_version }} - OpsetVersion: ${{ parameters.opset_version }} - CudaVersion: ${{ parameters.cuda_version }} - UploadWheel: ${{ parameters.upload_wheel }} -# TODO: enable this when we have torch support pyton 3.12 -# Python312: -# PythonVersion: '3.12' -# TorchVersion: ${{ parameters.torch_version }} -# OpsetVersion: ${{ parameters.opset_version }} -# CudaVersion: ${{ parameters.cuda_version }} -# UploadWheel: ${{ parameters.upload_wheel }} - - steps: - - task: CmdLine@2 - displayName: 'check variables' - inputs: - script: | - echo "Branch is "${{ variables['Build.SourceBranch'] }} && \ - echo "isMain is "${{ variables['isMain'] }} && \ - echo "final_storage is "${{ variables['finalStorage'] }} - - - checkout: self - clean: true - submodules: recursive - - - template: set-python-manylinux-variables-step.yml - - - template: 
get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }} - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: >- - --build-arg TORCH_VERSION=$(TorchVersion) - --build-arg OPSET_VERSION=$(OpsetVersion) - --build-arg PYTHON_VERSION=$(PythonVersion) - --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu - --build-arg BUILD_UID=$(id -u) - --network=host --build-arg POLICY=manylinux_2_28 --build-arg PLATFORM=x86_64 - --build-arg DEVTOOLSET_ROOTPATH=/usr - --build-arg PREPEND_PATH=/usr/local/cuda/bin: - --build-arg LD_LIBRARY_PATH_ARG=/usr/local/lib64 - Repository: onnxruntimetraininggpubuild - - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/mnist" -d "/mnist" - displayName: 'Mount MNIST' - condition: succeededOrFailed() - - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/bert-data" -d "/bert_data" - displayName: 'Mount bert-data' - condition: succeededOrFailed() - - - bash: tools/ci_build/github/linux/docker/scripts/training/azure_scale_set_vm_mount_test_data.sh -p $(orttrainingtestdatascus-storage-key) -s "//orttrainingtestdatascus.file.core.windows.net/hf-models-cache" -d "/hf_models_cache" - displayName: 'Mount hf-models-cache' - condition: succeededOrFailed() - - - task: CmdLine@2 - displayName: 'build onnxruntime' - inputs: - script: | - set -e -x - mkdir -p $HOME/.onnx - docker run --rm -e CFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" -e CXXFLAGS="-Wp,-D_FORTIFY_SOURCE=2 -Wp,-D_GLIBCXX_ASSERTIONS -fstack-protector-strong -fstack-clash-protection -fcf-protection -O3 -Wl,--strip-all" \ - --volume /data/onnx:/data/onnx:ro \ 
- --volume $(Build.SourcesDirectory):/onnxruntime_src \ - --volume $(Build.BinariesDirectory):/build \ - --volume /data/models:/build/models:ro \ - --volume $HOME/.onnx:/home/onnxruntimedev/.onnx \ - -e NVIDIA_VISIBLE_DEVICES=all \ - -e NIGHTLY_BUILD \ - -e DEFAULT_TRAINING_PACKAGE_DEVICE \ - -e BUILD_BUILDNUMBER \ - -e ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION \ - onnxruntimetraininggpubuild \ - $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ - --build_dir /build \ - --config ${{ variables['buildConfig'] }} \ - --skip_submodule_sync \ - --parallel --use_binskim_compliant_compile_flags \ - --build_wheel \ - --enable_onnx_tests \ - ${{ parameters.build_py_parameters }} \ - --cmake_extra_defines 'CMAKE_CUDA_ARCHITECTURES=${{ parameters.cmake_cuda_architectures }}' onnxruntime_BUILD_UNIT_TESTS=OFF \ - --use_cuda --cuda_version=$(CudaVersion) --cuda_home=/usr/local/cuda-$(CudaVersion) --cudnn_home=/usr/local/cuda-$(CudaVersion) ; - workingDirectory: $(Build.SourcesDirectory) - - - task: CmdLine@2 - displayName: 'test ortmodule' - inputs: - script: | - rm -rf $(Build.BinariesDirectory)/${{ variables['buildConfig'] }}/onnxruntime/ && \ - files=($(Build.BinariesDirectory)/${{ variables['buildConfig'] }}/dist/*.whl) && \ - echo ${files[0]} && \ - whlfilename=$(basename ${files[0]}) && \ - echo $whlfilename && \ - docker run --rm \ - --gpus all \ - -e NVIDIA_VISIBLE_DEVICES=all \ - --volume $(Build.BinariesDirectory):/build \ - --volume /mnist:/mnist \ - --volume /bert_data:/bert_data \ - --volume /hf_models_cache:/hf_models_cache \ - onnxruntimetraininggpubuild \ - bash -c " $(PythonManylinuxDir)/bin/python3 -m pip install /build/${{ variables['buildConfig'] }}/dist/$whlfilename && $(PythonManylinuxDir)/bin/python3 -m onnxruntime.training.ortmodule.torch_cpp_extensions.install " ; - workingDirectory: $(Build.SourcesDirectory) - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - 
SourceFolder: '$(Build.BinariesDirectory)' - Contents: "${{ variables['buildConfig'] }}/dist/*.whl" - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel and documentation' - inputs: - ArtifactName: "onnxruntime_gpu_${{ variables['buildConfig'] }}" - - - task: CmdLine@2 - displayName: 'Upload wheel' - condition: and(succeeded(), and(eq(variables['UploadWheel'], 'yes'), ne(variables['ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION'], 'true'))) - inputs: - script: | - set -e -x - files=($(Build.ArtifactStagingDirectory)/${{ variables['buildConfig'] }}/dist/*.whl) && \ - echo ${files[0]} && \ - python3 tools/ci_build/upload_python_package_to_azure_storage.py \ - --python_wheel_path ${files[0]} ${{ variables['finalStorage'] }} +- name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false - - template: component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' +- name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' + +- name: PythonVersionList + displayName: Python Version List + type: object + default: + - name: '38' + version: '3.8' + - name: '39' + version: '3.9' + - name: '310' + version: '3.10' + - name: '311' + version: '3.11' - - template: clean-agent-build-directory-step.yml +stages: +- ${{ each python_version in parameters.PythonVersionList }}: + - template: py-packaging-training-cuda-stage-steps.yml + parameters: + build_py_parameters: ${{ parameters.build_py_parameters }} + torch_version: ${{ parameters.torch_version }} + opset_version: ${{ parameters.opset_version }} + cuda_version: ${{ parameters.cuda_version }} + cmake_cuda_architectures: ${{ parameters.cmake_cuda_architectures }} + docker_file: ${{ parameters.docker_file }} + upload_wheel: ${{ parameters.upload_wheel }} + debug_build: ${{ parameters.debug_build }} + stage_name: 'Linux_py_Training_Cuda_Wheels_${{ 
python_version.name }}' + python_version: ${{ python_version.version }} + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} From 32558134a9047f8babcac174cdc83d5366397b54 Mon Sep 17 00:00:00 2001 From: Adam Louly Date: Thu, 14 Mar 2024 16:36:24 -0700 Subject: [PATCH 07/55] [On-Device-Training] Upgrade Flatbuffers to Support 2GB+ Checkpoints. (#19770) ### Description Modifications to support 2GB+ checkpoint & Upgrading Flatbuffers ### Motivation and Context This PR includes changes that will make ort handle 2GB+ checkpoints. To do that we need to upgrade flatbuffers to 23.5.9 - https://github.com/google/flatbuffers/pull/7945 - Modified the commitHash and the hash for the new version - Removed the patch for rust generator's unused variable warning as it is no longer producing this - [Check it out here](https://github.com/CasperN/flatbuffers/blob/d121e09d89726256ddbecd6318bcc85ce080d686/src/idl_gen_rust.cpp) - Updated the VerifyField calls with alignment values that were introduced in the new version. 
--------- Co-authored-by: Sumit Agarwal --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/deps.txt | 2 +- .../external/onnxruntime_external_deps.cmake | 2 +- cmake/patches/flatbuffers/flatbuffers.patch | 40 +- include/onnxruntime/core/graph/graph.h | 2 +- onnxruntime/core/common/flatbuffers.h | 18 + .../core/flatbuffers/flatbuffers_utils.h | 2 +- onnxruntime/core/flatbuffers/schema/README.md | 2 +- onnxruntime/core/flatbuffers/schema/ort.fbs.h | 56 +- .../schema/ort_training_checkpoint.fbs.h | 14 +- .../core/framework/kernel_type_str_resolver.h | 2 +- .../kernel_type_str_resolver_utils.cc | 2 +- onnxruntime/core/framework/session_state.h | 2 +- .../core/graph/graph_flatbuffers_utils.cc | 2 +- .../core/graph/graph_flatbuffers_utils.h | 2 +- onnxruntime/core/graph/model.h | 2 +- onnxruntime/core/graph/op_identifier_utils.h | 2 +- .../runtime_optimization_record_container.h | 2 +- .../DirectMLHelpers/DmlGraphDesc_generated.h | 440 +++++++++------- .../OperatorFieldTypes_generated.h | 495 +++++++++--------- .../dml/DmlExecutionProvider/src/precomp.h | 2 +- .../migraphx/ort_trt_int8_cal_table.fbs.h | 2 +- .../tensorrt/ort_trt_int8_cal_table.fbs.h | 2 +- .../templates/download-deps.yml | 4 +- .../templates/jobs/win-ci-vs-2022-job.yml | 19 +- 25 files changed, 585 insertions(+), 535 deletions(-) create mode 100644 onnxruntime/core/common/flatbuffers.h diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index cfad59be6b4c0..dc7e9c3fddb2f 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -86,7 +86,7 @@ "component": { "type": "git", "git": { - "commitHash": "6df40a2471737b27271bdd9b900ab5f3aec746c7", + "commitHash": "0100f6a5779831fa7a651e4b67ef389a8752bd9b", "repositoryUrl": "https://github.com/google/flatbuffers.git" }, "comments": "flatbuffers" diff --git a/cmake/deps.txt b/cmake/deps.txt index 9630b6185fcf6..4111689c5def9 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -23,7 
+23,7 @@ dlpack;https://github.com/dmlc/dlpack/archive/refs/tags/v0.6.zip;4d565dd2e5b3132 # Until the 3.4.1 release this is the best option we have. # Issue link: https://gitlab.com/libeigen/eigen/-/issues/2744 eigen;https://gitlab.com/libeigen/eigen/-/archive/e7248b26a1ed53fa030c5c459f7ea095dfd276ac/eigen-e7248b26a1ed53fa030c5c459f7ea095dfd276ac.zip;be8be39fdbc6e60e94fa7870b280707069b5b81a -flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v1.12.0.zip;ba0a75fd12dbef8f6557a74e611b7a3d0c5fe7bf +flatbuffers;https://github.com/google/flatbuffers/archive/refs/tags/v23.5.26.zip;59422c3b5e573dd192fead2834d25951f1c1670c fp16;https://github.com/Maratyszcza/FP16/archive/0a92994d729ff76a58f692d3028ca1b64b145d91.zip;b985f6985a05a1c03ff1bb71190f66d8f98a1494 fxdiv;https://github.com/Maratyszcza/FXdiv/archive/63058eff77e11aa15bf531df5dd34395ec3017c8.zip;a5658f4036402dbca7cebee32be57fb8149811e1 google_benchmark;https://github.com/google/benchmark/archive/refs/tags/v1.8.3.zip;bf9870756ee3f8d2d3b346b24ee3600a41c74d3d diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index 75ccc2dfd83a0..ac1e187f357aa 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -109,7 +109,7 @@ FetchContent_Declare( URL ${DEP_URL_flatbuffers} URL_HASH SHA1=${DEP_SHA1_flatbuffers} PATCH_COMMAND ${ONNXRUNTIME_FLATBUFFERS_PATCH_COMMAND} - FIND_PACKAGE_ARGS 1.12.0...<2.0.0 NAMES Flatbuffers + FIND_PACKAGE_ARGS 23.5.9 NAMES Flatbuffers ) # Download a protoc binary from Internet if needed diff --git a/cmake/patches/flatbuffers/flatbuffers.patch b/cmake/patches/flatbuffers/flatbuffers.patch index f141d358c54b6..fbe8db37ecb0e 100644 --- a/cmake/patches/flatbuffers/flatbuffers.patch +++ b/cmake/patches/flatbuffers/flatbuffers.patch @@ -2,35 +2,11 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt index 3987eac9..5e5462f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -223,7 
+223,7 @@ elseif(CMAKE_COMPILER_IS_GNUCXX) - "${CMAKE_CXX_FLAGS} -std=c++0x") - endif(CYGWIN) - set(CMAKE_CXX_FLAGS -- "${CMAKE_CXX_FLAGS} -Wall -pedantic -Werror -Wextra -Werror=shadow") -+ "${CMAKE_CXX_FLAGS} -Wall -pedantic -Wextra -Werror=shadow -Wno-error=stringop-overflow") - set(FLATBUFFERS_PRIVATE_CXX_FLAGS "-Wold-style-cast") - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.4) - if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0) -diff --git a/src/idl_gen_rust.cpp b/src/idl_gen_rust.cpp -index 55b8439b..dc03e8a8 100644 ---- a/src/idl_gen_rust.cpp -+++ b/src/idl_gen_rust.cpp -@@ -406,7 +406,8 @@ class RustGenerator : public BaseGenerator { - // example: f(A, D::E) -> super::D::E - // does not include leaf object (typically a struct type). - -- size_t i = 0; -+ // fix unused but set variable warning -+ //size_t i = 0; - std::stringstream stream; - - auto s = src->components.begin(); -@@ -417,7 +418,7 @@ class RustGenerator : public BaseGenerator { - if (*s != *d) { break; } - ++s; - ++d; -- ++i; -+ //++i; - } - - for (; s != src->components.end(); ++s) { stream << "super::"; } +@@ -279,5 +279,5 @@ + # Append FLATBUFFERS_CXX_FLAGS to CMAKE_CXX_FLAGS. 
+ if(DEFINED FLATBUFFERS_CXX_FLAGS) + message(STATUS "extend CXX_FLAGS with ${FLATBUFFERS_CXX_FLAGS}") +- set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS}") ++ set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${FLATBUFFERS_CXX_FLAGS} -Wno-error=stringop-overflow") + endif() + message(STATUS "CMAKE_CXX_FLAGS: ${CMAKE_CXX_FLAGS}") diff --git a/include/onnxruntime/core/graph/graph.h b/include/onnxruntime/core/graph/graph.h index b9b8a25286b7b..b16d52dbdab68 100644 --- a/include/onnxruntime/core/graph/graph.h +++ b/include/onnxruntime/core/graph/graph.h @@ -21,7 +21,7 @@ #pragma warning(pop) #endif -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/gsl.h" diff --git a/onnxruntime/core/common/flatbuffers.h b/onnxruntime/core/common/flatbuffers.h new file mode 100644 index 0000000000000..0d61e1038a82c --- /dev/null +++ b/onnxruntime/core/common/flatbuffers.h @@ -0,0 +1,18 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once +#if defined(__GNUC__) +#include "onnxruntime_config.h" +#pragma GCC diagnostic push + +#ifdef HAS_SHORTEN_64_TO_32 +#pragma GCC diagnostic ignored "-Wshorten-64-to-32" +#endif +#endif + +#include "flatbuffers/flatbuffers.h" + +#if defined(__GNUC__) +#pragma GCC diagnostic pop +#endif \ No newline at end of file diff --git a/onnxruntime/core/flatbuffers/flatbuffers_utils.h b/onnxruntime/core/flatbuffers/flatbuffers_utils.h index 55bde0b2df806..76860d6ab1db8 100644 --- a/onnxruntime/core/flatbuffers/flatbuffers_utils.h +++ b/onnxruntime/core/flatbuffers/flatbuffers_utils.h @@ -5,7 +5,7 @@ #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/common.h" #include "core/common/path_string.h" diff --git a/onnxruntime/core/flatbuffers/schema/README.md b/onnxruntime/core/flatbuffers/schema/README.md index 932478111ee68..96a2936c196ae 100644 --- a/onnxruntime/core/flatbuffers/schema/README.md +++ b/onnxruntime/core/flatbuffers/schema/README.md @@ -21,7 +21,7 @@ e.g. - /build/Linux/Debug/_deps/flatbuffers-build/flatc It is possible to use another flatc as well, e.g., from a separate installation. Note that ONNX Runtime uses -FlatBuffers 1.12. +FlatBuffers 23.5.26. To update the flatbuffers schemas and generated files: 1. Modify [the ORT file format schema](ort.fbs) or [training checkpoint schema](ort_training_checkpoint.fbs). 
diff --git a/onnxruntime/core/flatbuffers/schema/ort.fbs.h b/onnxruntime/core/flatbuffers/schema/ort.fbs.h index e0f5342c29621..dc8a471f2d81f 100644 --- a/onnxruntime/core/flatbuffers/schema/ort.fbs.h +++ b/onnxruntime/core/flatbuffers/schema/ort.fbs.h @@ -4,7 +4,7 @@ #ifndef FLATBUFFERS_GENERATED_ORT_ONNXRUNTIME_FBS_H_ #define FLATBUFFERS_GENERATED_ORT_ONNXRUNTIME_FBS_H_ -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" namespace onnxruntime { namespace fbs { @@ -562,8 +562,8 @@ struct DimensionValue FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_DIM_TYPE) && - VerifyField(verifier, VT_DIM_VALUE) && + VerifyField(verifier, VT_DIM_TYPE, 1) && + VerifyField(verifier, VT_DIM_VALUE, 8) && VerifyOffset(verifier, VT_DIM_PARAM) && verifier.VerifyString(dim_param()) && verifier.EndTable(); @@ -634,7 +634,7 @@ struct TensorTypeAndShape FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_ELEM_TYPE) && + VerifyField(verifier, VT_ELEM_TYPE, 4) && VerifyOffset(verifier, VT_SHAPE) && verifier.VerifyTable(shape()) && verifier.EndTable(); @@ -687,7 +687,7 @@ struct MapType FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_KEY_TYPE) && + VerifyField(verifier, VT_KEY_TYPE, 4) && VerifyOffset(verifier, VT_VALUE_TYPE) && verifier.VerifyTable(value_type()) && verifier.EndTable(); @@ -787,7 +787,7 @@ struct NodeEdge FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_NODE_INDEX) && + VerifyField(verifier, VT_NODE_INDEX, 4) && VerifyOffset(verifier, VT_INPUT_EDGES) && 
verifier.VerifyVector(input_edges()) && VerifyOffset(verifier, VT_OUTPUT_EDGES) && @@ -911,11 +911,11 @@ struct Node FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { verifier.VerifyString(doc_string()) && VerifyOffset(verifier, VT_DOMAIN) && verifier.VerifyString(domain()) && - VerifyField(verifier, VT_SINCE_VERSION) && - VerifyField(verifier, VT_INDEX) && + VerifyField(verifier, VT_SINCE_VERSION, 4) && + VerifyField(verifier, VT_INDEX, 4) && VerifyOffset(verifier, VT_OP_TYPE) && verifier.VerifyString(op_type()) && - VerifyField(verifier, VT_TYPE) && + VerifyField(verifier, VT_TYPE, 4) && VerifyOffset(verifier, VT_EXECUTION_PROVIDER_TYPE) && verifier.VerifyString(execution_provider_type()) && VerifyOffset(verifier, VT_INPUTS) && @@ -1174,7 +1174,7 @@ struct TypeInfo FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DENOTATION) && verifier.VerifyString(denotation()) && - VerifyField(verifier, VT_VALUE_TYPE) && + VerifyField(verifier, VT_VALUE_TYPE, 1) && VerifyOffset(verifier, VT_VALUE) && VerifyTypeInfoValue(verifier, value(), value_type()) && verifier.EndTable(); @@ -1259,7 +1259,7 @@ struct OperatorSetId FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DOMAIN) && verifier.VerifyString(domain()) && - VerifyField(verifier, VT_VERSION) && + VerifyField(verifier, VT_VERSION, 8) && verifier.EndTable(); } }; @@ -1343,7 +1343,7 @@ struct Tensor FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { verifier.VerifyString(doc_string()) && VerifyOffset(verifier, VT_DIMS) && verifier.VerifyVector(dims()) && - VerifyField(verifier, VT_DATA_TYPE) && + VerifyField(verifier, VT_DATA_TYPE, 4) && VerifyOffset(verifier, VT_RAW_DATA) && verifier.VerifyVector(raw_data()) && VerifyOffset(verifier, VT_STRING_DATA) && @@ -1568,9 +1568,9 @@ struct Attribute FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { verifier.VerifyString(name()) && 
VerifyOffset(verifier, VT_DOC_STRING) && verifier.VerifyString(doc_string()) && - VerifyField(verifier, VT_TYPE) && - VerifyField(verifier, VT_F) && - VerifyField(verifier, VT_I) && + VerifyField(verifier, VT_TYPE, 4) && + VerifyField(verifier, VT_F, 4) && + VerifyField(verifier, VT_I, 8) && VerifyOffset(verifier, VT_S) && verifier.VerifyString(s()) && VerifyOffset(verifier, VT_T) && @@ -1759,12 +1759,12 @@ struct NodesToOptimizeIndices FLATBUFFERS_FINAL_CLASS : private flatbuffers::Tab return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NODE_INDICES) && verifier.VerifyVector(node_indices()) && - VerifyField(verifier, VT_NUM_INPUTS) && - VerifyField(verifier, VT_NUM_OUTPUTS) && - VerifyField(verifier, VT_HAS_VARIADIC_INPUT) && - VerifyField(verifier, VT_HAS_VARIADIC_OUTPUT) && - VerifyField(verifier, VT_NUM_VARIADIC_INPUTS) && - VerifyField(verifier, VT_NUM_VARIADIC_OUTPUTS) && + VerifyField(verifier, VT_NUM_INPUTS, 4) && + VerifyField(verifier, VT_NUM_OUTPUTS, 4) && + VerifyField(verifier, VT_HAS_VARIADIC_INPUT, 1) && + VerifyField(verifier, VT_HAS_VARIADIC_OUTPUT, 1) && + VerifyField(verifier, VT_NUM_VARIADIC_INPUTS, 4) && + VerifyField(verifier, VT_NUM_VARIADIC_OUTPUTS, 4) && verifier.EndTable(); } }; @@ -1862,8 +1862,8 @@ struct DeprecatedNodeIndexAndKernelDefHash FLATBUFFERS_FINAL_CLASS : private fla } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_NODE_INDEX) && - VerifyField(verifier, VT_KERNEL_DEF_HASH) && + VerifyField(verifier, VT_NODE_INDEX, 4) && + VerifyField(verifier, VT_KERNEL_DEF_HASH, 8) && verifier.EndTable(); } }; @@ -2161,7 +2161,7 @@ struct Graph FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, VT_NODES) && verifier.VerifyVector(nodes()) && verifier.VerifyVectorOfTables(nodes()) && - VerifyField(verifier, VT_MAX_NODE_INDEX) && + VerifyField(verifier, VT_MAX_NODE_INDEX, 4) && VerifyOffset(verifier, VT_NODE_EDGES) && 
verifier.VerifyVector(node_edges()) && verifier.VerifyVectorOfTables(node_edges()) && @@ -2390,7 +2390,7 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_IR_VERSION) && + VerifyField(verifier, VT_IR_VERSION, 8) && VerifyOffset(verifier, VT_OPSET_IMPORT) && verifier.VerifyVector(opset_import()) && verifier.VerifyVectorOfTables(opset_import()) && @@ -2400,7 +2400,7 @@ struct Model FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { verifier.VerifyString(producer_version()) && VerifyOffset(verifier, VT_DOMAIN) && verifier.VerifyString(domain()) && - VerifyField(verifier, VT_MODEL_VERSION) && + VerifyField(verifier, VT_MODEL_VERSION, 8) && VerifyOffset(verifier, VT_DOC_STRING) && verifier.VerifyString(doc_string()) && VerifyOffset(verifier, VT_GRAPH) && @@ -2740,8 +2740,8 @@ struct ArgTypeAndIndex FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_ARG_TYPE) && - VerifyField(verifier, VT_INDEX) && + VerifyField(verifier, VT_ARG_TYPE, 1) && + VerifyField(verifier, VT_INDEX, 4) && verifier.EndTable(); } }; diff --git a/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs.h b/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs.h index d205c5eb8f409..62e6cf74394e5 100644 --- a/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs.h +++ b/onnxruntime/core/flatbuffers/schema/ort_training_checkpoint.fbs.h @@ -4,7 +4,7 @@ #ifndef FLATBUFFERS_GENERATED_ORTTRAININGCHECKPOINT_ONNXRUNTIME_FBS_H_ #define FLATBUFFERS_GENERATED_ORTTRAININGCHECKPOINT_ONNXRUNTIME_FBS_H_ -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "ort.fbs.h" @@ -59,7 +59,7 @@ struct ModuleState FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VerifyOffset(verifier, 
VT_FROZEN_PARAMS) && verifier.VerifyVector(frozen_params()) && verifier.VerifyVectorOfTables(frozen_params()) && - VerifyField(verifier, VT_IS_NOMINAL_STATE) && + VerifyField(verifier, VT_IS_NOMINAL_STATE, 1) && verifier.EndTable(); } }; @@ -206,8 +206,8 @@ struct OptimizerGroup FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_GROUP_NAME) && verifier.VerifyString(group_name()) && - VerifyField(verifier, VT_STEP) && - VerifyField(verifier, VT_INITIAL_LEARNING_RATE) && + VerifyField(verifier, VT_STEP, 8) && + VerifyField(verifier, VT_INITIAL_LEARNING_RATE, 4) && VerifyOffset(verifier, VT_OPTIMIZER_STATES) && verifier.VerifyVector(optimizer_states()) && verifier.VerifyVectorOfTables(optimizer_states()) && @@ -289,7 +289,7 @@ struct IntProperty FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) && verifier.VerifyString(name()) && - VerifyField(verifier, VT_VALUE) && + VerifyField(verifier, VT_VALUE, 8) && verifier.EndTable(); } }; @@ -353,7 +353,7 @@ struct FloatProperty FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) && verifier.VerifyString(name()) && - VerifyField(verifier, VT_VALUE) && + VerifyField(verifier, VT_VALUE, 4) && verifier.EndTable(); } }; @@ -572,7 +572,7 @@ struct Checkpoint FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { } bool Verify(flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_VERSION) && + VerifyField(verifier, VT_VERSION, 4) && VerifyOffset(verifier, VT_MODULE_STATE) && verifier.VerifyTable(module_state()) && VerifyOffset(verifier, VT_OPTIMIZER_GROUPS) && diff --git a/onnxruntime/core/framework/kernel_type_str_resolver.h b/onnxruntime/core/framework/kernel_type_str_resolver.h index 31a806dd52291..fea2a6ef3a439 100644 --- 
a/onnxruntime/core/framework/kernel_type_str_resolver.h +++ b/onnxruntime/core/framework/kernel_type_str_resolver.h @@ -7,7 +7,7 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #if !defined(ORT_MINIMAL_BUILD) #include "core/graph/onnx_protobuf.h" diff --git a/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc b/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc index 4f5fa9910b5df..473e78c3f5e25 100644 --- a/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc +++ b/onnxruntime/core/framework/kernel_type_str_resolver_utils.cc @@ -5,7 +5,7 @@ #include "core/framework/kernel_type_str_resolver_utils.h" -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/common.h" #include "core/flatbuffers/schema/ort.fbs.h" diff --git a/onnxruntime/core/framework/session_state.h b/onnxruntime/core/framework/session_state.h index 51bb02918d82f..a2ee1601d386b 100644 --- a/onnxruntime/core/framework/session_state.h +++ b/onnxruntime/core/framework/session_state.h @@ -8,7 +8,7 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/gsl.h" diff --git a/onnxruntime/core/graph/graph_flatbuffers_utils.cc b/onnxruntime/core/graph/graph_flatbuffers_utils.cc index 6d7ed94b2956d..2314a5228f83c 100644 --- a/onnxruntime/core/graph/graph_flatbuffers_utils.cc +++ b/onnxruntime/core/graph/graph_flatbuffers_utils.cc @@ -3,7 +3,7 @@ #include "graph_flatbuffers_utils.h" -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/narrow.h" #include "core/flatbuffers/flatbuffers_utils.h" diff --git a/onnxruntime/core/graph/graph_flatbuffers_utils.h b/onnxruntime/core/graph/graph_flatbuffers_utils.h index b625cbf3ca492..9c55dad3c41ef 100644 --- a/onnxruntime/core/graph/graph_flatbuffers_utils.h +++ b/onnxruntime/core/graph/graph_flatbuffers_utils.h @@ -5,7 +5,7 @@ #include -#include 
"flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/status.h" #include "core/graph/ort_format_load_options.h" diff --git a/onnxruntime/core/graph/model.h b/onnxruntime/core/graph/model.h index 4ce6660b794bc..a774d5fe34461 100644 --- a/onnxruntime/core/graph/model.h +++ b/onnxruntime/core/graph/model.h @@ -8,7 +8,7 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/path.h" #include "core/graph/graph_viewer.h" diff --git a/onnxruntime/core/graph/op_identifier_utils.h b/onnxruntime/core/graph/op_identifier_utils.h index 8a9351a2d0ddc..f7b1198c31972 100644 --- a/onnxruntime/core/graph/op_identifier_utils.h +++ b/onnxruntime/core/graph/op_identifier_utils.h @@ -3,7 +3,7 @@ #pragma once -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/graph/op_identifier.h" diff --git a/onnxruntime/core/graph/runtime_optimization_record_container.h b/onnxruntime/core/graph/runtime_optimization_record_container.h index a28b19e786de0..75750c2b96987 100644 --- a/onnxruntime/core/graph/runtime_optimization_record_container.h +++ b/onnxruntime/core/graph/runtime_optimization_record_container.h @@ -9,7 +9,7 @@ #include #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "core/common/common.h" #include "core/graph/runtime_optimization_record.h" diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h index 72059b9a3f911..df485396f1e47 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/DmlGraphDesc_generated.h @@ -4,7 +4,14 @@ #ifndef FLATBUFFERS_GENERATED_DMLGRAPHDESC_DML_IR_H_ #define 
FLATBUFFERS_GENERATED_DMLGRAPHDESC_DML_IR_H_ -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" + +// Ensure the included flatbuffers.h is the same version as when this file was +// generated, otherwise it may not be compatible. +static_assert(FLATBUFFERS_VERSION_MAJOR == 23 && + FLATBUFFERS_VERSION_MINOR == 5 && + FLATBUFFERS_VERSION_REVISION == 26, + "Non-compatible flatbuffers version included"); #include "OperatorFieldTypes_generated.h" @@ -32,7 +39,7 @@ struct DmlGraphNodeBuilder; struct DmlGraphDesc; struct DmlGraphDescBuilder; -enum ConstantNodeDescDetail { +enum ConstantNodeDescDetail : uint8_t { ConstantNodeDescDetail_NONE = 0, ConstantNodeDescDetail_ConstantName = 1, ConstantNodeDescDetail_ConstantRawData = 2, @@ -60,7 +67,7 @@ inline const char * const *EnumNamesConstantNodeDescDetail() { } inline const char *EnumNameConstantNodeDescDetail(ConstantNodeDescDetail e) { - if (flatbuffers::IsOutRange(e, ConstantNodeDescDetail_NONE, ConstantNodeDescDetail_ConstantRawData)) return ""; + if (::flatbuffers::IsOutRange(e, ConstantNodeDescDetail_NONE, ConstantNodeDescDetail_ConstantRawData)) return ""; const size_t index = static_cast(e); return EnumNamesConstantNodeDescDetail()[index]; } @@ -77,10 +84,10 @@ template<> struct ConstantNodeDescDetailTraits { static const ConstantNodeDescDetail enum_value = ConstantNodeDescDetail_ConstantRawData; }; -bool VerifyConstantNodeDescDetail(flatbuffers::Verifier &verifier, const void *obj, ConstantNodeDescDetail type); -bool VerifyConstantNodeDescDetailVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); +bool VerifyConstantNodeDescDetail(::flatbuffers::Verifier &verifier, const void *obj, ConstantNodeDescDetail type); +bool VerifyConstantNodeDescDetailVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); -enum NodeDesc { +enum NodeDesc : uint8_t { 
NodeDesc_NONE = 0, NodeDesc_OperatorNodeDesc = 1, NodeDesc_ConstantNodeDesc = 2, @@ -108,7 +115,7 @@ inline const char * const *EnumNamesNodeDesc() { } inline const char *EnumNameNodeDesc(NodeDesc e) { - if (flatbuffers::IsOutRange(e, NodeDesc_NONE, NodeDesc_ConstantNodeDesc)) return ""; + if (::flatbuffers::IsOutRange(e, NodeDesc_NONE, NodeDesc_ConstantNodeDesc)) return ""; const size_t index = static_cast(e); return EnumNamesNodeDesc()[index]; } @@ -125,18 +132,21 @@ template<> struct NodeDescTraits { static const NodeDesc enum_value = NodeDesc_ConstantNodeDesc; }; -bool VerifyNodeDesc(flatbuffers::Verifier &verifier, const void *obj, NodeDesc type); -bool VerifyNodeDescVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); +bool VerifyNodeDesc(::flatbuffers::Verifier &verifier, const void *obj, NodeDesc type); +bool VerifyNodeDescVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); -struct ConstantRawData FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct ConstantRawData FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef ConstantRawDataBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_DATA = 4 }; - const flatbuffers::Vector *data() const { - return GetPointer *>(VT_DATA); + const ::flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); + } + ::flatbuffers::Vector *mutable_data() { + return GetPointer<::flatbuffers::Vector *>(VT_DATA); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DATA) && verifier.VerifyVector(data()) && @@ -146,33 +156,32 @@ struct ConstantRawData FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct ConstantRawDataBuilder { typedef ConstantRawData Table; - flatbuffers::FlatBufferBuilder &fbb_; 
- flatbuffers::uoffset_t start_; - void add_data(flatbuffers::Offset> data) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_data(::flatbuffers::Offset<::flatbuffers::Vector> data) { fbb_.AddOffset(ConstantRawData::VT_DATA, data); } - explicit ConstantRawDataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit ConstantRawDataBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - ConstantRawDataBuilder &operator=(const ConstantRawDataBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateConstantRawData( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> data = 0) { +inline ::flatbuffers::Offset CreateConstantRawData( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> data = 0) { ConstantRawDataBuilder builder_(_fbb); builder_.add_data(data); return builder_.Finish(); } -inline flatbuffers::Offset CreateConstantRawDataDirect( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateConstantRawDataDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, const std::vector *data = nullptr) { auto data__ = data ? 
_fbb.CreateVector(*data) : 0; return dml::ir::CreateConstantRawData( @@ -180,15 +189,18 @@ inline flatbuffers::Offset CreateConstantRawDataDirect( data__); } -struct ConstantName FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct ConstantName FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef ConstantNameBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_NAME = 4 }; - const flatbuffers::String *name() const { - return GetPointer(VT_NAME); + const ::flatbuffers::String *name() const { + return GetPointer(VT_NAME); + } + ::flatbuffers::String *mutable_name() { + return GetPointer<::flatbuffers::String *>(VT_NAME); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) && verifier.VerifyString(name()) && @@ -198,33 +210,32 @@ struct ConstantName FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct ConstantNameBuilder { typedef ConstantName Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_name(flatbuffers::Offset name) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { fbb_.AddOffset(ConstantName::VT_NAME, name); } - explicit ConstantNameBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit ConstantNameBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - ConstantNameBuilder &operator=(const ConstantNameBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateConstantName( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset name = 0) { +inline ::flatbuffers::Offset CreateConstantName( + 
::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> name = 0) { ConstantNameBuilder builder_(_fbb); builder_.add_name(name); return builder_.Finish(); } -inline flatbuffers::Offset CreateConstantNameDirect( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateConstantNameDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, const char *name = nullptr) { auto name__ = name ? _fbb.CreateString(name) : 0; return dml::ir::CreateConstantName( @@ -232,7 +243,7 @@ inline flatbuffers::Offset CreateConstantNameDirect( name__); } -struct ConstantNodeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct ConstantNodeDesc FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef ConstantNodeDescBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_DATA_TYPE = 4, @@ -251,9 +262,12 @@ struct ConstantNodeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const dml::ir::ConstantRawData *data_as_ConstantRawData() const { return data_type() == dml::ir::ConstantNodeDescDetail_ConstantRawData ? 
static_cast(data()) : nullptr; } - bool Verify(flatbuffers::Verifier &verifier) const { + void *mutable_data() { + return GetPointer(VT_DATA); + } + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_DATA_TYPE) && + VerifyField(verifier, VT_DATA_TYPE, 1) && VerifyOffset(verifier, VT_DATA) && VerifyConstantNodeDescDetail(verifier, data(), data_type()) && verifier.EndTable(); @@ -270,37 +284,36 @@ template<> inline const dml::ir::ConstantRawData *ConstantNodeDesc::data_as(ConstantNodeDesc::VT_DATA_TYPE, static_cast(data_type), 0); } - void add_data(flatbuffers::Offset data) { + void add_data(::flatbuffers::Offset data) { fbb_.AddOffset(ConstantNodeDesc::VT_DATA, data); } - explicit ConstantNodeDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit ConstantNodeDescBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - ConstantNodeDescBuilder &operator=(const ConstantNodeDescBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateConstantNodeDesc( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateConstantNodeDesc( + ::flatbuffers::FlatBufferBuilder &_fbb, dml::ir::ConstantNodeDescDetail data_type = dml::ir::ConstantNodeDescDetail_NONE, - flatbuffers::Offset data = 0) { + ::flatbuffers::Offset data = 0) { ConstantNodeDescBuilder builder_(_fbb); builder_.add_data(data); builder_.add_data_type(data_type); return builder_.Finish(); } -struct DmlBufferTensorDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct DmlBufferTensorDesc FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef DmlBufferTensorDescBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_DATATYPE = 4, @@ -308,19 +321,31 @@ 
struct DmlBufferTensorDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table VT_STRIDES = 8, VT_TOTALTENSORSIZEINBYTES = 10 }; - const flatbuffers::String *dataType() const { - return GetPointer(VT_DATATYPE); + const ::flatbuffers::String *dataType() const { + return GetPointer(VT_DATATYPE); + } + ::flatbuffers::String *mutable_dataType() { + return GetPointer<::flatbuffers::String *>(VT_DATATYPE); + } + const ::flatbuffers::Vector *sizes() const { + return GetPointer *>(VT_SIZES); } - const flatbuffers::Vector *sizes() const { - return GetPointer *>(VT_SIZES); + ::flatbuffers::Vector *mutable_sizes() { + return GetPointer<::flatbuffers::Vector *>(VT_SIZES); } - const flatbuffers::Vector *strides() const { - return GetPointer *>(VT_STRIDES); + const ::flatbuffers::Vector *strides() const { + return GetPointer *>(VT_STRIDES); + } + ::flatbuffers::Vector *mutable_strides() { + return GetPointer<::flatbuffers::Vector *>(VT_STRIDES); } uint64_t totalTensorSizeInBytes() const { return GetField(VT_TOTALTENSORSIZEINBYTES, 0); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool mutate_totalTensorSizeInBytes(uint64_t _totalTensorSizeInBytes = 0) { + return SetField(VT_TOTALTENSORSIZEINBYTES, _totalTensorSizeInBytes, 0); + } + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DATATYPE) && verifier.VerifyString(dataType()) && @@ -328,44 +353,43 @@ struct DmlBufferTensorDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table verifier.VerifyVector(sizes()) && VerifyOffset(verifier, VT_STRIDES) && verifier.VerifyVector(strides()) && - VerifyField(verifier, VT_TOTALTENSORSIZEINBYTES) && + VerifyField(verifier, VT_TOTALTENSORSIZEINBYTES, 8) && verifier.EndTable(); } }; struct DmlBufferTensorDescBuilder { typedef DmlBufferTensorDesc Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_dataType(flatbuffers::Offset dataType) { + ::flatbuffers::FlatBufferBuilder 
&fbb_; + ::flatbuffers::uoffset_t start_; + void add_dataType(::flatbuffers::Offset<::flatbuffers::String> dataType) { fbb_.AddOffset(DmlBufferTensorDesc::VT_DATATYPE, dataType); } - void add_sizes(flatbuffers::Offset> sizes) { + void add_sizes(::flatbuffers::Offset<::flatbuffers::Vector> sizes) { fbb_.AddOffset(DmlBufferTensorDesc::VT_SIZES, sizes); } - void add_strides(flatbuffers::Offset> strides) { + void add_strides(::flatbuffers::Offset<::flatbuffers::Vector> strides) { fbb_.AddOffset(DmlBufferTensorDesc::VT_STRIDES, strides); } void add_totalTensorSizeInBytes(uint64_t totalTensorSizeInBytes) { fbb_.AddElement(DmlBufferTensorDesc::VT_TOTALTENSORSIZEINBYTES, totalTensorSizeInBytes, 0); } - explicit DmlBufferTensorDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit DmlBufferTensorDescBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - DmlBufferTensorDescBuilder &operator=(const DmlBufferTensorDescBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateDmlBufferTensorDesc( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset dataType = 0, - flatbuffers::Offset> sizes = 0, - flatbuffers::Offset> strides = 0, +inline ::flatbuffers::Offset CreateDmlBufferTensorDesc( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> dataType = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> sizes = 0, + ::flatbuffers::Offset<::flatbuffers::Vector> strides = 0, uint64_t totalTensorSizeInBytes = 0) { DmlBufferTensorDescBuilder builder_(_fbb); builder_.add_totalTensorSizeInBytes(totalTensorSizeInBytes); @@ -375,8 +399,8 @@ inline flatbuffers::Offset CreateDmlBufferTensorDesc( return builder_.Finish(); } -inline flatbuffers::Offset CreateDmlBufferTensorDescDirect( - flatbuffers::FlatBufferBuilder 
&_fbb, +inline ::flatbuffers::Offset CreateDmlBufferTensorDescDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, const char *dataType = nullptr, const std::vector *sizes = nullptr, const std::vector *strides = nullptr, @@ -392,7 +416,7 @@ inline flatbuffers::Offset CreateDmlBufferTensorDescDirect( totalTensorSizeInBytes); } -struct OperatorNodeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct OperatorNodeDesc FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef OperatorNodeDescBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_TYPE = 4, @@ -400,19 +424,31 @@ struct OperatorNodeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { VT_OUTPUTS = 8, VT_ATTRIBUTES = 10 }; - const flatbuffers::String *type() const { - return GetPointer(VT_TYPE); + const ::flatbuffers::String *type() const { + return GetPointer(VT_TYPE); + } + ::flatbuffers::String *mutable_type() { + return GetPointer<::flatbuffers::String *>(VT_TYPE); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *inputs() const { + return GetPointer> *>(VT_INPUTS); + } + ::flatbuffers::Vector<::flatbuffers::Offset> *mutable_inputs() { + return GetPointer<::flatbuffers::Vector<::flatbuffers::Offset> *>(VT_INPUTS); + } + const ::flatbuffers::Vector<::flatbuffers::Offset> *outputs() const { + return GetPointer> *>(VT_OUTPUTS); } - const flatbuffers::Vector> *inputs() const { - return GetPointer> *>(VT_INPUTS); + ::flatbuffers::Vector<::flatbuffers::Offset> *mutable_outputs() { + return GetPointer<::flatbuffers::Vector<::flatbuffers::Offset> *>(VT_OUTPUTS); } - const flatbuffers::Vector> *outputs() const { - return GetPointer> *>(VT_OUTPUTS); + const ::flatbuffers::Vector<::flatbuffers::Offset> *attributes() const { + return GetPointer> *>(VT_ATTRIBUTES); } - const flatbuffers::Vector> *attributes() const { - return GetPointer> *>(VT_ATTRIBUTES); + ::flatbuffers::Vector<::flatbuffers::Offset> *mutable_attributes() { + return 
GetPointer<::flatbuffers::Vector<::flatbuffers::Offset> *>(VT_ATTRIBUTES); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_TYPE) && verifier.VerifyString(type()) && @@ -431,38 +467,37 @@ struct OperatorNodeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct OperatorNodeDescBuilder { typedef OperatorNodeDesc Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_type(flatbuffers::Offset type) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_type(::flatbuffers::Offset<::flatbuffers::String> type) { fbb_.AddOffset(OperatorNodeDesc::VT_TYPE, type); } - void add_inputs(flatbuffers::Offset>> inputs) { + void add_inputs(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> inputs) { fbb_.AddOffset(OperatorNodeDesc::VT_INPUTS, inputs); } - void add_outputs(flatbuffers::Offset>> outputs) { + void add_outputs(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> outputs) { fbb_.AddOffset(OperatorNodeDesc::VT_OUTPUTS, outputs); } - void add_attributes(flatbuffers::Offset>> attributes) { + void add_attributes(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> attributes) { fbb_.AddOffset(OperatorNodeDesc::VT_ATTRIBUTES, attributes); } - explicit OperatorNodeDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit OperatorNodeDescBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - OperatorNodeDescBuilder &operator=(const OperatorNodeDescBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateOperatorNodeDesc( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset type = 0, - 
flatbuffers::Offset>> inputs = 0, - flatbuffers::Offset>> outputs = 0, - flatbuffers::Offset>> attributes = 0) { +inline ::flatbuffers::Offset CreateOperatorNodeDesc( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> type = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> inputs = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> outputs = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> attributes = 0) { OperatorNodeDescBuilder builder_(_fbb); builder_.add_attributes(attributes); builder_.add_outputs(outputs); @@ -471,16 +506,16 @@ inline flatbuffers::Offset CreateOperatorNodeDesc( return builder_.Finish(); } -inline flatbuffers::Offset CreateOperatorNodeDescDirect( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateOperatorNodeDescDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, const char *type = nullptr, - const std::vector> *inputs = nullptr, - const std::vector> *outputs = nullptr, - const std::vector> *attributes = nullptr) { + const std::vector<::flatbuffers::Offset> *inputs = nullptr, + const std::vector<::flatbuffers::Offset> *outputs = nullptr, + const std::vector<::flatbuffers::Offset> *attributes = nullptr) { auto type__ = type ? _fbb.CreateString(type) : 0; - auto inputs__ = inputs ? _fbb.CreateVector>(*inputs) : 0; - auto outputs__ = outputs ? _fbb.CreateVector>(*outputs) : 0; - auto attributes__ = attributes ? _fbb.CreateVector>(*attributes) : 0; + auto inputs__ = inputs ? _fbb.CreateVector<::flatbuffers::Offset>(*inputs) : 0; + auto outputs__ = outputs ? _fbb.CreateVector<::flatbuffers::Offset>(*outputs) : 0; + auto attributes__ = attributes ? 
_fbb.CreateVector<::flatbuffers::Offset>(*attributes) : 0; return dml::ir::CreateOperatorNodeDesc( _fbb, type__, @@ -489,7 +524,7 @@ inline flatbuffers::Offset CreateOperatorNodeDescDirect( attributes__); } -struct DmlGraphNode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct DmlGraphNode FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef DmlGraphNodeBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_DESC_TYPE = 4, @@ -511,18 +546,30 @@ struct DmlGraphNode FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { const dml::ir::ConstantNodeDesc *desc_as_ConstantNodeDesc() const { return desc_type() == dml::ir::NodeDesc_ConstantNodeDesc ? static_cast(desc()) : nullptr; } - const flatbuffers::String *name() const { - return GetPointer(VT_NAME); + void *mutable_desc() { + return GetPointer(VT_DESC); } - const flatbuffers::Vector> *inputNames() const { - return GetPointer> *>(VT_INPUTNAMES); + const ::flatbuffers::String *name() const { + return GetPointer(VT_NAME); } - const flatbuffers::Vector> *outputNames() const { - return GetPointer> *>(VT_OUTPUTNAMES); + ::flatbuffers::String *mutable_name() { + return GetPointer<::flatbuffers::String *>(VT_NAME); } - bool Verify(flatbuffers::Verifier &verifier) const { + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *inputNames() const { + return GetPointer> *>(VT_INPUTNAMES); + } + ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *mutable_inputNames() { + return GetPointer<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *>(VT_INPUTNAMES); + } + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *outputNames() const { + return GetPointer> *>(VT_OUTPUTNAMES); + } + ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *mutable_outputNames() { + return GetPointer<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *>(VT_OUTPUTNAMES); + } + bool 
Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_DESC_TYPE) && + VerifyField(verifier, VT_DESC_TYPE, 1) && VerifyOffset(verifier, VT_DESC) && VerifyNodeDesc(verifier, desc(), desc_type()) && VerifyOffset(verifier, VT_NAME) && @@ -547,42 +594,41 @@ template<> inline const dml::ir::ConstantNodeDesc *DmlGraphNode::desc_as(DmlGraphNode::VT_DESC_TYPE, static_cast(desc_type), 0); } - void add_desc(flatbuffers::Offset desc) { + void add_desc(::flatbuffers::Offset desc) { fbb_.AddOffset(DmlGraphNode::VT_DESC, desc); } - void add_name(flatbuffers::Offset name) { + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { fbb_.AddOffset(DmlGraphNode::VT_NAME, name); } - void add_inputNames(flatbuffers::Offset>> inputNames) { + void add_inputNames(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>> inputNames) { fbb_.AddOffset(DmlGraphNode::VT_INPUTNAMES, inputNames); } - void add_outputNames(flatbuffers::Offset>> outputNames) { + void add_outputNames(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>> outputNames) { fbb_.AddOffset(DmlGraphNode::VT_OUTPUTNAMES, outputNames); } - explicit DmlGraphNodeBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit DmlGraphNodeBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - DmlGraphNodeBuilder &operator=(const DmlGraphNodeBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateDmlGraphNode( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateDmlGraphNode( + ::flatbuffers::FlatBufferBuilder &_fbb, dml::ir::NodeDesc desc_type = dml::ir::NodeDesc_NONE, - flatbuffers::Offset desc = 0, - flatbuffers::Offset name = 0, - 
flatbuffers::Offset>> inputNames = 0, - flatbuffers::Offset>> outputNames = 0) { + ::flatbuffers::Offset desc = 0, + ::flatbuffers::Offset<::flatbuffers::String> name = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>> inputNames = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>> outputNames = 0) { DmlGraphNodeBuilder builder_(_fbb); builder_.add_outputNames(outputNames); builder_.add_inputNames(inputNames); @@ -592,16 +638,16 @@ inline flatbuffers::Offset CreateDmlGraphNode( return builder_.Finish(); } -inline flatbuffers::Offset CreateDmlGraphNodeDirect( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateDmlGraphNodeDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, dml::ir::NodeDesc desc_type = dml::ir::NodeDesc_NONE, - flatbuffers::Offset desc = 0, + ::flatbuffers::Offset desc = 0, const char *name = nullptr, - const std::vector> *inputNames = nullptr, - const std::vector> *outputNames = nullptr) { + const std::vector<::flatbuffers::Offset<::flatbuffers::String>> *inputNames = nullptr, + const std::vector<::flatbuffers::Offset<::flatbuffers::String>> *outputNames = nullptr) { auto name__ = name ? _fbb.CreateString(name) : 0; - auto inputNames__ = inputNames ? _fbb.CreateVector>(*inputNames) : 0; - auto outputNames__ = outputNames ? _fbb.CreateVector>(*outputNames) : 0; + auto inputNames__ = inputNames ? _fbb.CreateVector<::flatbuffers::Offset<::flatbuffers::String>>(*inputNames) : 0; + auto outputNames__ = outputNames ? 
_fbb.CreateVector<::flatbuffers::Offset<::flatbuffers::String>>(*outputNames) : 0; return dml::ir::CreateDmlGraphNode( _fbb, desc_type, @@ -611,23 +657,32 @@ inline flatbuffers::Offset CreateDmlGraphNodeDirect( outputNames__); } -struct DmlGraphDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct DmlGraphDesc FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef DmlGraphDescBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_NODES = 4, VT_GRAPHINPUTNAMES = 6, VT_GRAPHOUTPUTNAMES = 8 }; - const flatbuffers::Vector> *nodes() const { - return GetPointer> *>(VT_NODES); + const ::flatbuffers::Vector<::flatbuffers::Offset> *nodes() const { + return GetPointer> *>(VT_NODES); + } + ::flatbuffers::Vector<::flatbuffers::Offset> *mutable_nodes() { + return GetPointer<::flatbuffers::Vector<::flatbuffers::Offset> *>(VT_NODES); + } + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *graphInputNames() const { + return GetPointer> *>(VT_GRAPHINPUTNAMES); } - const flatbuffers::Vector> *graphInputNames() const { - return GetPointer> *>(VT_GRAPHINPUTNAMES); + ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *mutable_graphInputNames() { + return GetPointer<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *>(VT_GRAPHINPUTNAMES); } - const flatbuffers::Vector> *graphOutputNames() const { - return GetPointer> *>(VT_GRAPHOUTPUTNAMES); + const ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *graphOutputNames() const { + return GetPointer> *>(VT_GRAPHOUTPUTNAMES); } - bool Verify(flatbuffers::Verifier &verifier) const { + ::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *mutable_graphOutputNames() { + return GetPointer<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>> *>(VT_GRAPHOUTPUTNAMES); + } + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, 
VT_NODES) && verifier.VerifyVector(nodes()) && @@ -644,34 +699,33 @@ struct DmlGraphDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct DmlGraphDescBuilder { typedef DmlGraphDesc Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_nodes(flatbuffers::Offset>> nodes) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_nodes(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> nodes) { fbb_.AddOffset(DmlGraphDesc::VT_NODES, nodes); } - void add_graphInputNames(flatbuffers::Offset>> graphInputNames) { + void add_graphInputNames(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>> graphInputNames) { fbb_.AddOffset(DmlGraphDesc::VT_GRAPHINPUTNAMES, graphInputNames); } - void add_graphOutputNames(flatbuffers::Offset>> graphOutputNames) { + void add_graphOutputNames(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>> graphOutputNames) { fbb_.AddOffset(DmlGraphDesc::VT_GRAPHOUTPUTNAMES, graphOutputNames); } - explicit DmlGraphDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit DmlGraphDescBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - DmlGraphDescBuilder &operator=(const DmlGraphDescBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateDmlGraphDesc( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset>> nodes = 0, - flatbuffers::Offset>> graphInputNames = 0, - flatbuffers::Offset>> graphOutputNames = 0) { +inline ::flatbuffers::Offset CreateDmlGraphDesc( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> nodes = 0, + 
::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>> graphInputNames = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset<::flatbuffers::String>>> graphOutputNames = 0) { DmlGraphDescBuilder builder_(_fbb); builder_.add_graphOutputNames(graphOutputNames); builder_.add_graphInputNames(graphInputNames); @@ -679,14 +733,14 @@ inline flatbuffers::Offset CreateDmlGraphDesc( return builder_.Finish(); } -inline flatbuffers::Offset CreateDmlGraphDescDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector> *nodes = nullptr, - const std::vector> *graphInputNames = nullptr, - const std::vector> *graphOutputNames = nullptr) { - auto nodes__ = nodes ? _fbb.CreateVector>(*nodes) : 0; - auto graphInputNames__ = graphInputNames ? _fbb.CreateVector>(*graphInputNames) : 0; - auto graphOutputNames__ = graphOutputNames ? _fbb.CreateVector>(*graphOutputNames) : 0; +inline ::flatbuffers::Offset CreateDmlGraphDescDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector<::flatbuffers::Offset> *nodes = nullptr, + const std::vector<::flatbuffers::Offset<::flatbuffers::String>> *graphInputNames = nullptr, + const std::vector<::flatbuffers::Offset<::flatbuffers::String>> *graphOutputNames = nullptr) { + auto nodes__ = nodes ? _fbb.CreateVector<::flatbuffers::Offset>(*nodes) : 0; + auto graphInputNames__ = graphInputNames ? _fbb.CreateVector<::flatbuffers::Offset<::flatbuffers::String>>(*graphInputNames) : 0; + auto graphOutputNames__ = graphOutputNames ? 
_fbb.CreateVector<::flatbuffers::Offset<::flatbuffers::String>>(*graphOutputNames) : 0; return dml::ir::CreateDmlGraphDesc( _fbb, nodes__, @@ -694,7 +748,7 @@ inline flatbuffers::Offset CreateDmlGraphDescDirect( graphOutputNames__); } -inline bool VerifyConstantNodeDescDetail(flatbuffers::Verifier &verifier, const void *obj, ConstantNodeDescDetail type) { +inline bool VerifyConstantNodeDescDetail(::flatbuffers::Verifier &verifier, const void *obj, ConstantNodeDescDetail type) { switch (type) { case ConstantNodeDescDetail_NONE: { return true; @@ -711,10 +765,10 @@ inline bool VerifyConstantNodeDescDetail(flatbuffers::Verifier &verifier, const } } -inline bool VerifyConstantNodeDescDetailVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { +inline bool VerifyConstantNodeDescDetailVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { if (!values || !types) return !values && !types; if (values->size() != types->size()) return false; - for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { if (!VerifyConstantNodeDescDetail( verifier, values->Get(i), types->GetEnum(i))) { return false; @@ -723,7 +777,7 @@ inline bool VerifyConstantNodeDescDetailVector(flatbuffers::Verifier &verifier, return true; } -inline bool VerifyNodeDesc(flatbuffers::Verifier &verifier, const void *obj, NodeDesc type) { +inline bool VerifyNodeDesc(::flatbuffers::Verifier &verifier, const void *obj, NodeDesc type) { switch (type) { case NodeDesc_NONE: { return true; @@ -740,10 +794,10 @@ inline bool VerifyNodeDesc(flatbuffers::Verifier &verifier, const void *obj, Nod } } -inline bool VerifyNodeDescVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { +inline bool VerifyNodeDescVector(::flatbuffers::Verifier &verifier, const 
::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { if (!values || !types) return !values && !types; if (values->size() != types->size()) return false; - for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { if (!VerifyNodeDesc( verifier, values->Get(i), types->GetEnum(i))) { return false; @@ -753,32 +807,40 @@ inline bool VerifyNodeDescVector(flatbuffers::Verifier &verifier, const flatbuff } inline const dml::ir::DmlGraphDesc *GetDmlGraphDesc(const void *buf) { - return flatbuffers::GetRoot(buf); + return ::flatbuffers::GetRoot(buf); } inline const dml::ir::DmlGraphDesc *GetSizePrefixedDmlGraphDesc(const void *buf) { - return flatbuffers::GetSizePrefixedRoot(buf); + return ::flatbuffers::GetSizePrefixedRoot(buf); +} + +inline DmlGraphDesc *GetMutableDmlGraphDesc(void *buf) { + return ::flatbuffers::GetMutableRoot(buf); +} + +inline dml::ir::DmlGraphDesc *GetMutableSizePrefixedDmlGraphDesc(void *buf) { + return ::flatbuffers::GetMutableSizePrefixedRoot(buf); } inline bool VerifyDmlGraphDescBuffer( - flatbuffers::Verifier &verifier) { + ::flatbuffers::Verifier &verifier) { return verifier.VerifyBuffer(nullptr); } inline bool VerifySizePrefixedDmlGraphDescBuffer( - flatbuffers::Verifier &verifier) { + ::flatbuffers::Verifier &verifier) { return verifier.VerifySizePrefixedBuffer(nullptr); } inline void FinishDmlGraphDescBuffer( - flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + ::flatbuffers::FlatBufferBuilder &fbb, + ::flatbuffers::Offset root) { fbb.Finish(root); } inline void FinishSizePrefixedDmlGraphDescBuffer( - flatbuffers::FlatBufferBuilder &fbb, - flatbuffers::Offset root) { + ::flatbuffers::FlatBufferBuilder &fbb, + ::flatbuffers::Offset root) { fbb.FinishSizePrefixed(root); } diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h 
b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h index 167a913bb0132..639c31f0dc5c8 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/External/DirectMLHelpers/OperatorFieldTypes_generated.h @@ -4,7 +4,14 @@ #ifndef FLATBUFFERS_GENERATED_OPERATORFIELDTYPES_DML_IR_OPERATORFIELDTYPES_H_ #define FLATBUFFERS_GENERATED_OPERATORFIELDTYPES_DML_IR_OPERATORFIELDTYPES_H_ -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" + +// Ensure the included flatbuffers.h is the same version as when this file was +// generated, otherwise it may not be compatible. +static_assert(FLATBUFFERS_VERSION_MAJOR == 23 && + FLATBUFFERS_VERSION_MINOR == 5 && + FLATBUFFERS_VERSION_REVISION == 26, + "Non-compatible flatbuffers version included"); namespace dml { namespace ir { @@ -59,7 +66,7 @@ struct ScalarUnionDataBuilder; struct Bool; -enum AttributeFieldVariant { +enum AttributeFieldVariant : uint8_t { AttributeFieldVariant_NONE = 0, AttributeFieldVariant_Activation = 1, AttributeFieldVariant_ActivationArray = 2, @@ -120,7 +127,7 @@ inline const char * const *EnumNamesAttributeFieldVariant() { } inline const char *EnumNameAttributeFieldVariant(AttributeFieldVariant e) { - if (flatbuffers::IsOutRange(e, AttributeFieldVariant_NONE, AttributeFieldVariant_Bool)) return ""; + if (::flatbuffers::IsOutRange(e, AttributeFieldVariant_NONE, AttributeFieldVariant_Bool)) return ""; const size_t index = static_cast(e); return EnumNamesAttributeFieldVariant()[index]; } @@ -181,10 +188,10 @@ template<> struct AttributeFieldVariantTraits static const AttributeFieldVariant enum_value = AttributeFieldVariant_Bool; }; -bool VerifyAttributeFieldVariant(flatbuffers::Verifier &verifier, const void *obj, AttributeFieldVariant type); -bool VerifyAttributeFieldVariantVector(flatbuffers::Verifier 
&verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); +bool VerifyAttributeFieldVariant(::flatbuffers::Verifier &verifier, const void *obj, AttributeFieldVariant type); +bool VerifyAttributeFieldVariantVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); -enum ScalarVariant { +enum ScalarVariant : uint8_t { ScalarVariant_NONE = 0, ScalarVariant_ByteArray = 1, ScalarVariant_Int8 = 2, @@ -239,7 +246,7 @@ inline const char * const *EnumNamesScalarVariant() { } inline const char *EnumNameScalarVariant(ScalarVariant e) { - if (flatbuffers::IsOutRange(e, ScalarVariant_NONE, ScalarVariant_Float64)) return ""; + if (::flatbuffers::IsOutRange(e, ScalarVariant_NONE, ScalarVariant_Float64)) return ""; const size_t index = static_cast(e); return EnumNamesScalarVariant()[index]; } @@ -292,25 +299,25 @@ template<> struct ScalarVariantTraits { static const ScalarVariant enum_value = ScalarVariant_Float64; }; -bool VerifyScalarVariant(flatbuffers::Verifier &verifier, const void *obj, ScalarVariant type); -bool VerifyScalarVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types); +bool VerifyScalarVariant(::flatbuffers::Verifier &verifier, const void *obj, ScalarVariant type); +bool VerifyScalarVariantVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types); FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) UInt8 FLATBUFFERS_FINAL_CLASS { private: uint8_t data_; public: - UInt8() { - memset(static_cast(this), 0, sizeof(UInt8)); + UInt8() + : data_(0) { } UInt8(uint8_t _data) - : data_(flatbuffers::EndianScalar(_data)) { + : data_(::flatbuffers::EndianScalar(_data)) { } uint8_t data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(uint8_t _data) { - 
flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(UInt8, 1); @@ -320,17 +327,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(2) UInt16 FLATBUFFERS_FINAL_CLASS { uint16_t data_; public: - UInt16() { - memset(static_cast(this), 0, sizeof(UInt16)); + UInt16() + : data_(0) { } UInt16(uint16_t _data) - : data_(flatbuffers::EndianScalar(_data)) { + : data_(::flatbuffers::EndianScalar(_data)) { } uint16_t data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(uint16_t _data) { - flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(UInt16, 2); @@ -340,17 +347,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) UInt32 FLATBUFFERS_FINAL_CLASS { uint32_t data_; public: - UInt32() { - memset(static_cast(this), 0, sizeof(UInt32)); + UInt32() + : data_(0) { } UInt32(uint32_t _data) - : data_(flatbuffers::EndianScalar(_data)) { + : data_(::flatbuffers::EndianScalar(_data)) { } uint32_t data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(uint32_t _data) { - flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(UInt32, 4); @@ -360,17 +367,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) UInt64 FLATBUFFERS_FINAL_CLASS { uint64_t data_; public: - UInt64() { - memset(static_cast(this), 0, sizeof(UInt64)); + UInt64() + : data_(0) { } UInt64(uint64_t _data) - : data_(flatbuffers::EndianScalar(_data)) { + : data_(::flatbuffers::EndianScalar(_data)) { } uint64_t data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(uint64_t _data) { - flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(UInt64, 8); @@ -380,17 +387,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) Int8 
FLATBUFFERS_FINAL_CLASS { int8_t data_; public: - Int8() { - memset(static_cast(this), 0, sizeof(Int8)); + Int8() + : data_(0) { } Int8(int8_t _data) - : data_(flatbuffers::EndianScalar(_data)) { + : data_(::flatbuffers::EndianScalar(_data)) { } int8_t data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(int8_t _data) { - flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(Int8, 1); @@ -400,17 +407,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(2) Int16 FLATBUFFERS_FINAL_CLASS { int16_t data_; public: - Int16() { - memset(static_cast(this), 0, sizeof(Int16)); + Int16() + : data_(0) { } Int16(int16_t _data) - : data_(flatbuffers::EndianScalar(_data)) { + : data_(::flatbuffers::EndianScalar(_data)) { } int16_t data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(int16_t _data) { - flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(Int16, 2); @@ -420,17 +427,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) Int32 FLATBUFFERS_FINAL_CLASS { int32_t data_; public: - Int32() { - memset(static_cast(this), 0, sizeof(Int32)); + Int32() + : data_(0) { } Int32(int32_t _data) - : data_(flatbuffers::EndianScalar(_data)) { + : data_(::flatbuffers::EndianScalar(_data)) { } int32_t data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(int32_t _data) { - flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(Int32, 4); @@ -440,17 +447,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Int64 FLATBUFFERS_FINAL_CLASS { int64_t data_; public: - Int64() { - memset(static_cast(this), 0, sizeof(Int64)); + Int64() + : data_(0) { } Int64(int64_t _data) - : data_(flatbuffers::EndianScalar(_data)) { + : 
data_(::flatbuffers::EndianScalar(_data)) { } int64_t data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(int64_t _data) { - flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(Int64, 8); @@ -460,17 +467,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) Float32 FLATBUFFERS_FINAL_CLASS { float data_; public: - Float32() { - memset(static_cast(this), 0, sizeof(Float32)); + Float32() + : data_(0) { } Float32(float _data) - : data_(flatbuffers::EndianScalar(_data)) { + : data_(::flatbuffers::EndianScalar(_data)) { } float data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(float _data) { - flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(Float32, 4); @@ -480,17 +487,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(8) Float64 FLATBUFFERS_FINAL_CLASS { double data_; public: - Float64() { - memset(static_cast(this), 0, sizeof(Float64)); + Float64() + : data_(0) { } Float64(double _data) - : data_(flatbuffers::EndianScalar(_data)) { + : data_(::flatbuffers::EndianScalar(_data)) { } double data() const { - return flatbuffers::EndianScalar(data_); + return ::flatbuffers::EndianScalar(data_); } void mutate_data(double _data) { - flatbuffers::WriteScalar(&data_, _data); + ::flatbuffers::WriteScalar(&data_, _data); } }; FLATBUFFERS_STRUCT_END(Float64, 8); @@ -501,24 +508,25 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) ScaleBias FLATBUFFERS_FINAL_CLASS { float bias_; public: - ScaleBias() { - memset(static_cast(this), 0, sizeof(ScaleBias)); + ScaleBias() + : scale_(0), + bias_(0) { } ScaleBias(float _scale, float _bias) - : scale_(flatbuffers::EndianScalar(_scale)), - bias_(flatbuffers::EndianScalar(_bias)) { + : scale_(::flatbuffers::EndianScalar(_scale)), + bias_(::flatbuffers::EndianScalar(_bias)) { } float scale() const { - 
return flatbuffers::EndianScalar(scale_); + return ::flatbuffers::EndianScalar(scale_); } void mutate_scale(float _scale) { - flatbuffers::WriteScalar(&scale_, _scale); + ::flatbuffers::WriteScalar(&scale_, _scale); } float bias() const { - return flatbuffers::EndianScalar(bias_); + return ::flatbuffers::EndianScalar(bias_); } void mutate_bias(float _bias) { - flatbuffers::WriteScalar(&bias_, _bias); + ::flatbuffers::WriteScalar(&bias_, _bias); } }; FLATBUFFERS_STRUCT_END(ScaleBias, 8); @@ -529,24 +537,25 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(4) Size2D FLATBUFFERS_FINAL_CLASS { uint32_t height_; public: - Size2D() { - memset(static_cast(this), 0, sizeof(Size2D)); + Size2D() + : width_(0), + height_(0) { } Size2D(uint32_t _width, uint32_t _height) - : width_(flatbuffers::EndianScalar(_width)), - height_(flatbuffers::EndianScalar(_height)) { + : width_(::flatbuffers::EndianScalar(_width)), + height_(::flatbuffers::EndianScalar(_height)) { } uint32_t width() const { - return flatbuffers::EndianScalar(width_); + return ::flatbuffers::EndianScalar(width_); } void mutate_width(uint32_t _width) { - flatbuffers::WriteScalar(&width_, _width); + ::flatbuffers::WriteScalar(&width_, _width); } uint32_t height() const { - return flatbuffers::EndianScalar(height_); + return ::flatbuffers::EndianScalar(height_); } void mutate_height(uint32_t _height) { - flatbuffers::WriteScalar(&height_, _height); + ::flatbuffers::WriteScalar(&height_, _height); } }; FLATBUFFERS_STRUCT_END(Size2D, 8); @@ -556,14 +565,17 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) ByteArray FLATBUFFERS_FINAL_CLASS { uint8_t data_[8]; public: - ByteArray() { - memset(static_cast(this), 0, sizeof(ByteArray)); + ByteArray() + : data_() { + } + ByteArray(::flatbuffers::span _data) { + ::flatbuffers::CastToArray(data_).CopyFromSpan(_data); } - const flatbuffers::Array *data() const { - return reinterpret_cast *>(data_); + const ::flatbuffers::Array *data() const { + return &::flatbuffers::CastToArray(data_); } - 
flatbuffers::Array *mutable_data() { - return reinterpret_cast *>(data_); + ::flatbuffers::Array *mutable_data() { + return &::flatbuffers::CastToArray(data_); } }; FLATBUFFERS_STRUCT_END(ByteArray, 8); @@ -573,33 +585,33 @@ FLATBUFFERS_MANUALLY_ALIGNED_STRUCT(1) Bool FLATBUFFERS_FINAL_CLASS { uint8_t data_; public: - Bool() { - memset(static_cast(this), 0, sizeof(Bool)); + Bool() + : data_(0) { } Bool(bool _data) - : data_(flatbuffers::EndianScalar(static_cast(_data))) { + : data_(::flatbuffers::EndianScalar(static_cast(_data))) { } bool data() const { - return flatbuffers::EndianScalar(data_) != 0; + return ::flatbuffers::EndianScalar(data_) != 0; } void mutate_data(bool _data) { - flatbuffers::WriteScalar(&data_, static_cast(_data)); + ::flatbuffers::WriteScalar(&data_, static_cast(_data)); } }; FLATBUFFERS_STRUCT_END(Bool, 1); -struct AttributeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct AttributeDesc FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef AttributeDescBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_NAME = 4, VT_VAL_TYPE = 6, VT_VAL = 8 }; - const flatbuffers::String *name() const { - return GetPointer(VT_NAME); + const ::flatbuffers::String *name() const { + return GetPointer(VT_NAME); } - flatbuffers::String *mutable_name() { - return GetPointer(VT_NAME); + ::flatbuffers::String *mutable_name() { + return GetPointer<::flatbuffers::String *>(VT_NAME); } dml::ir::operatorFieldTypes::AttributeFieldVariant val_type() const { return static_cast(GetField(VT_VAL_TYPE, 0)); @@ -650,11 +662,11 @@ struct AttributeDesc FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { void *mutable_val() { return GetPointer(VT_VAL); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_NAME) && verifier.VerifyString(name()) && - VerifyField(verifier, VT_VAL_TYPE) && + 
VerifyField(verifier, VT_VAL_TYPE, 1) && VerifyOffset(verifier, VT_VAL) && VerifyAttributeFieldVariant(verifier, val(), val_type()) && verifier.EndTable(); @@ -715,34 +727,33 @@ template<> inline const dml::ir::operatorFieldTypes::Bool *AttributeDesc::val_as struct AttributeDescBuilder { typedef AttributeDesc Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_name(flatbuffers::Offset name) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_name(::flatbuffers::Offset<::flatbuffers::String> name) { fbb_.AddOffset(AttributeDesc::VT_NAME, name); } void add_val_type(dml::ir::operatorFieldTypes::AttributeFieldVariant val_type) { fbb_.AddElement(AttributeDesc::VT_VAL_TYPE, static_cast(val_type), 0); } - void add_val(flatbuffers::Offset val) { + void add_val(::flatbuffers::Offset val) { fbb_.AddOffset(AttributeDesc::VT_VAL, val); } - explicit AttributeDescBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit AttributeDescBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - AttributeDescBuilder &operator=(const AttributeDescBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateAttributeDesc( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset name = 0, +inline ::flatbuffers::Offset CreateAttributeDesc( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> name = 0, dml::ir::operatorFieldTypes::AttributeFieldVariant val_type = dml::ir::operatorFieldTypes::AttributeFieldVariant_NONE, - flatbuffers::Offset val = 0) { + ::flatbuffers::Offset val = 0) { AttributeDescBuilder builder_(_fbb); builder_.add_val(val); builder_.add_name(name); @@ -750,11 +761,11 @@ inline flatbuffers::Offset CreateAttributeDesc( return 
builder_.Finish(); } -inline flatbuffers::Offset CreateAttributeDescDirect( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateAttributeDescDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, const char *name = nullptr, dml::ir::operatorFieldTypes::AttributeFieldVariant val_type = dml::ir::operatorFieldTypes::AttributeFieldVariant_NONE, - flatbuffers::Offset val = 0) { + ::flatbuffers::Offset val = 0) { auto name__ = name ? _fbb.CreateString(name) : 0; return dml::ir::operatorFieldTypes::CreateAttributeDesc( _fbb, @@ -763,25 +774,25 @@ inline flatbuffers::Offset CreateAttributeDescDirect( val); } -struct Activation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct Activation FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef ActivationBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_TYPE = 4, VT_ATTRIBUTES = 6 }; - const flatbuffers::String *type() const { - return GetPointer(VT_TYPE); + const ::flatbuffers::String *type() const { + return GetPointer(VT_TYPE); } - flatbuffers::String *mutable_type() { - return GetPointer(VT_TYPE); + ::flatbuffers::String *mutable_type() { + return GetPointer<::flatbuffers::String *>(VT_TYPE); } - const flatbuffers::Vector> *attributes() const { - return GetPointer> *>(VT_ATTRIBUTES); + const ::flatbuffers::Vector<::flatbuffers::Offset> *attributes() const { + return GetPointer> *>(VT_ATTRIBUTES); } - flatbuffers::Vector> *mutable_attributes() { - return GetPointer> *>(VT_ATTRIBUTES); + ::flatbuffers::Vector<::flatbuffers::Offset> *mutable_attributes() { + return GetPointer<::flatbuffers::Vector<::flatbuffers::Offset> *>(VT_ATTRIBUTES); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_TYPE) && verifier.VerifyString(type()) && @@ -794,60 +805,59 @@ struct Activation FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { 
struct ActivationBuilder { typedef Activation Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_type(flatbuffers::Offset type) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_type(::flatbuffers::Offset<::flatbuffers::String> type) { fbb_.AddOffset(Activation::VT_TYPE, type); } - void add_attributes(flatbuffers::Offset>> attributes) { + void add_attributes(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> attributes) { fbb_.AddOffset(Activation::VT_ATTRIBUTES, attributes); } - explicit ActivationBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit ActivationBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - ActivationBuilder &operator=(const ActivationBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateActivation( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset type = 0, - flatbuffers::Offset>> attributes = 0) { +inline ::flatbuffers::Offset CreateActivation( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::String> type = 0, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> attributes = 0) { ActivationBuilder builder_(_fbb); builder_.add_attributes(attributes); builder_.add_type(type); return builder_.Finish(); } -inline flatbuffers::Offset CreateActivationDirect( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateActivationDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, const char *type = nullptr, - const std::vector> *attributes = nullptr) { + const std::vector<::flatbuffers::Offset> *attributes = nullptr) { auto type__ = type ? _fbb.CreateString(type) : 0; - auto attributes__ = attributes ? 
_fbb.CreateVector>(*attributes) : 0; + auto attributes__ = attributes ? _fbb.CreateVector<::flatbuffers::Offset>(*attributes) : 0; return dml::ir::operatorFieldTypes::CreateActivation( _fbb, type__, attributes__); } -struct ActivationArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct ActivationArray FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef ActivationArrayBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_DATA = 4 }; - const flatbuffers::Vector> *data() const { - return GetPointer> *>(VT_DATA); + const ::flatbuffers::Vector<::flatbuffers::Offset> *data() const { + return GetPointer> *>(VT_DATA); } - flatbuffers::Vector> *mutable_data() { - return GetPointer> *>(VT_DATA); + ::flatbuffers::Vector<::flatbuffers::Offset> *mutable_data() { + return GetPointer<::flatbuffers::Vector<::flatbuffers::Offset> *>(VT_DATA); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DATA) && verifier.VerifyVector(data()) && @@ -858,52 +868,51 @@ struct ActivationArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct ActivationArrayBuilder { typedef ActivationArray Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_data(flatbuffers::Offset>> data) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_data(::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> data) { fbb_.AddOffset(ActivationArray::VT_DATA, data); } - explicit ActivationArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit ActivationArrayBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - ActivationArrayBuilder &operator=(const ActivationArrayBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = 
flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateActivationArray( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset>> data = 0) { +inline ::flatbuffers::Offset CreateActivationArray( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector<::flatbuffers::Offset>> data = 0) { ActivationArrayBuilder builder_(_fbb); builder_.add_data(data); return builder_.Finish(); } -inline flatbuffers::Offset CreateActivationArrayDirect( - flatbuffers::FlatBufferBuilder &_fbb, - const std::vector> *data = nullptr) { - auto data__ = data ? _fbb.CreateVector>(*data) : 0; +inline ::flatbuffers::Offset CreateActivationArrayDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, + const std::vector<::flatbuffers::Offset> *data = nullptr) { + auto data__ = data ? _fbb.CreateVector<::flatbuffers::Offset>(*data) : 0; return dml::ir::operatorFieldTypes::CreateActivationArray( _fbb, data__); } -struct UIntArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct UIntArray FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef UIntArrayBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_DATA = 4 }; - const flatbuffers::Vector *data() const { - return GetPointer *>(VT_DATA); + const ::flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); } - flatbuffers::Vector *mutable_data() { - return GetPointer *>(VT_DATA); + ::flatbuffers::Vector *mutable_data() { + return GetPointer<::flatbuffers::Vector *>(VT_DATA); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DATA) && verifier.VerifyVector(data()) && @@ -913,33 +922,32 @@ struct UIntArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct UIntArrayBuilder { typedef UIntArray Table; - flatbuffers::FlatBufferBuilder &fbb_; - 
flatbuffers::uoffset_t start_; - void add_data(flatbuffers::Offset> data) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_data(::flatbuffers::Offset<::flatbuffers::Vector> data) { fbb_.AddOffset(UIntArray::VT_DATA, data); } - explicit UIntArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit UIntArrayBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - UIntArrayBuilder &operator=(const UIntArrayBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateUIntArray( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> data = 0) { +inline ::flatbuffers::Offset CreateUIntArray( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> data = 0) { UIntArrayBuilder builder_(_fbb); builder_.add_data(data); return builder_.Finish(); } -inline flatbuffers::Offset CreateUIntArrayDirect( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateUIntArrayDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, const std::vector *data = nullptr) { auto data__ = data ? 
_fbb.CreateVector(*data) : 0; return dml::ir::operatorFieldTypes::CreateUIntArray( @@ -947,18 +955,18 @@ inline flatbuffers::Offset CreateUIntArrayDirect( data__); } -struct IntArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct IntArray FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef IntArrayBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_DATA = 4 }; - const flatbuffers::Vector *data() const { - return GetPointer *>(VT_DATA); + const ::flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); } - flatbuffers::Vector *mutable_data() { - return GetPointer *>(VT_DATA); + ::flatbuffers::Vector *mutable_data() { + return GetPointer<::flatbuffers::Vector *>(VT_DATA); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DATA) && verifier.VerifyVector(data()) && @@ -968,33 +976,32 @@ struct IntArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct IntArrayBuilder { typedef IntArray Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_data(flatbuffers::Offset> data) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_data(::flatbuffers::Offset<::flatbuffers::Vector> data) { fbb_.AddOffset(IntArray::VT_DATA, data); } - explicit IntArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit IntArrayBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - IntArrayBuilder &operator=(const IntArrayBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateIntArray( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> data = 0) { +inline 
::flatbuffers::Offset CreateIntArray( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> data = 0) { IntArrayBuilder builder_(_fbb); builder_.add_data(data); return builder_.Finish(); } -inline flatbuffers::Offset CreateIntArrayDirect( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateIntArrayDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, const std::vector *data = nullptr) { auto data__ = data ? _fbb.CreateVector(*data) : 0; return dml::ir::operatorFieldTypes::CreateIntArray( @@ -1002,18 +1009,18 @@ inline flatbuffers::Offset CreateIntArrayDirect( data__); } -struct FloatArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct FloatArray FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef FloatArrayBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_DATA = 4 }; - const flatbuffers::Vector *data() const { - return GetPointer *>(VT_DATA); + const ::flatbuffers::Vector *data() const { + return GetPointer *>(VT_DATA); } - flatbuffers::Vector *mutable_data() { - return GetPointer *>(VT_DATA); + ::flatbuffers::Vector *mutable_data() { + return GetPointer<::flatbuffers::Vector *>(VT_DATA); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && VerifyOffset(verifier, VT_DATA) && verifier.VerifyVector(data()) && @@ -1023,33 +1030,32 @@ struct FloatArray FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { struct FloatArrayBuilder { typedef FloatArray Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; - void add_data(flatbuffers::Offset> data) { + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; + void add_data(::flatbuffers::Offset<::flatbuffers::Vector> data) { fbb_.AddOffset(FloatArray::VT_DATA, data); } - explicit FloatArrayBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit 
FloatArrayBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - FloatArrayBuilder &operator=(const FloatArrayBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateFloatArray( - flatbuffers::FlatBufferBuilder &_fbb, - flatbuffers::Offset> data = 0) { +inline ::flatbuffers::Offset CreateFloatArray( + ::flatbuffers::FlatBufferBuilder &_fbb, + ::flatbuffers::Offset<::flatbuffers::Vector> data = 0) { FloatArrayBuilder builder_(_fbb); builder_.add_data(data); return builder_.Finish(); } -inline flatbuffers::Offset CreateFloatArrayDirect( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateFloatArrayDirect( + ::flatbuffers::FlatBufferBuilder &_fbb, const std::vector *data = nullptr) { auto data__ = data ? _fbb.CreateVector(*data) : 0; return dml::ir::operatorFieldTypes::CreateFloatArray( @@ -1057,7 +1063,7 @@ inline flatbuffers::Offset CreateFloatArrayDirect( data__); } -struct ScalarUnionData FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { +struct ScalarUnionData FLATBUFFERS_FINAL_CLASS : private ::flatbuffers::Table { typedef ScalarUnionDataBuilder Builder; enum FlatBuffersVTableOffset FLATBUFFERS_VTABLE_UNDERLYING_TYPE { VT_DATA_TYPE = 4, @@ -1106,9 +1112,9 @@ struct ScalarUnionData FLATBUFFERS_FINAL_CLASS : private flatbuffers::Table { void *mutable_data() { return GetPointer(VT_DATA); } - bool Verify(flatbuffers::Verifier &verifier) const { + bool Verify(::flatbuffers::Verifier &verifier) const { return VerifyTableStart(verifier) && - VerifyField(verifier, VT_DATA_TYPE) && + VerifyField(verifier, VT_DATA_TYPE, 1) && VerifyOffset(verifier, VT_DATA) && VerifyScalarVariant(verifier, data(), data_type()) && verifier.EndTable(); @@ -1161,37 +1167,36 @@ template<> inline const dml::ir::operatorFieldTypes::Float64 
*ScalarUnionData::d struct ScalarUnionDataBuilder { typedef ScalarUnionData Table; - flatbuffers::FlatBufferBuilder &fbb_; - flatbuffers::uoffset_t start_; + ::flatbuffers::FlatBufferBuilder &fbb_; + ::flatbuffers::uoffset_t start_; void add_data_type(dml::ir::operatorFieldTypes::ScalarVariant data_type) { fbb_.AddElement(ScalarUnionData::VT_DATA_TYPE, static_cast(data_type), 0); } - void add_data(flatbuffers::Offset data) { + void add_data(::flatbuffers::Offset data) { fbb_.AddOffset(ScalarUnionData::VT_DATA, data); } - explicit ScalarUnionDataBuilder(flatbuffers::FlatBufferBuilder &_fbb) + explicit ScalarUnionDataBuilder(::flatbuffers::FlatBufferBuilder &_fbb) : fbb_(_fbb) { start_ = fbb_.StartTable(); } - ScalarUnionDataBuilder &operator=(const ScalarUnionDataBuilder &); - flatbuffers::Offset Finish() { + ::flatbuffers::Offset Finish() { const auto end = fbb_.EndTable(start_); - auto o = flatbuffers::Offset(end); + auto o = ::flatbuffers::Offset(end); return o; } }; -inline flatbuffers::Offset CreateScalarUnionData( - flatbuffers::FlatBufferBuilder &_fbb, +inline ::flatbuffers::Offset CreateScalarUnionData( + ::flatbuffers::FlatBufferBuilder &_fbb, dml::ir::operatorFieldTypes::ScalarVariant data_type = dml::ir::operatorFieldTypes::ScalarVariant_NONE, - flatbuffers::Offset data = 0) { + ::flatbuffers::Offset data = 0) { ScalarUnionDataBuilder builder_(_fbb); builder_.add_data(data); builder_.add_data_type(data_type); return builder_.Finish(); } -inline bool VerifyAttributeFieldVariant(flatbuffers::Verifier &verifier, const void *obj, AttributeFieldVariant type) { +inline bool VerifyAttributeFieldVariant(::flatbuffers::Verifier &verifier, const void *obj, AttributeFieldVariant type) { switch (type) { case AttributeFieldVariant_NONE: { return true; @@ -1205,16 +1210,16 @@ inline bool VerifyAttributeFieldVariant(flatbuffers::Verifier &verifier, const v return verifier.VerifyTable(ptr); } case AttributeFieldVariant_UInt32: { - return verifier.Verify(static_cast(obj), 
0); + return verifier.VerifyField(static_cast(obj), 0, 4); } case AttributeFieldVariant_UInt64: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 8); } case AttributeFieldVariant_Int32: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 4); } case AttributeFieldVariant_Float32: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 4); } case AttributeFieldVariant_UIntArray: { auto ptr = reinterpret_cast(obj); @@ -1229,26 +1234,26 @@ inline bool VerifyAttributeFieldVariant(flatbuffers::Verifier &verifier, const v return verifier.VerifyTable(ptr); } case AttributeFieldVariant_ScaleBias: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 4); } case AttributeFieldVariant_Size2D: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 4); } case AttributeFieldVariant_ScalarUnionData: { auto ptr = reinterpret_cast(obj); return verifier.VerifyTable(ptr); } case AttributeFieldVariant_Bool: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 1); } default: return true; } } -inline bool VerifyAttributeFieldVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { +inline bool VerifyAttributeFieldVariantVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { if (!values || !types) return !values && !types; if (values->size() != types->size()) return false; - for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { if (!VerifyAttributeFieldVariant( verifier, values->Get(i), types->GetEnum(i))) { return false; @@ -1257,52 +1262,52 @@ inline bool 
VerifyAttributeFieldVariantVector(flatbuffers::Verifier &verifier, c return true; } -inline bool VerifyScalarVariant(flatbuffers::Verifier &verifier, const void *obj, ScalarVariant type) { +inline bool VerifyScalarVariant(::flatbuffers::Verifier &verifier, const void *obj, ScalarVariant type) { switch (type) { case ScalarVariant_NONE: { return true; } case ScalarVariant_ByteArray: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 1); } case ScalarVariant_Int8: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 1); } case ScalarVariant_UInt8: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 1); } case ScalarVariant_Int16: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 2); } case ScalarVariant_UInt16: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 2); } case ScalarVariant_Int32: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 4); } case ScalarVariant_UInt32: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 4); } case ScalarVariant_Int64: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 8); } case ScalarVariant_UInt64: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 8); } case ScalarVariant_Float32: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 4); } case ScalarVariant_Float64: { - return verifier.Verify(static_cast(obj), 0); + return verifier.VerifyField(static_cast(obj), 0, 8); } default: return true; } } -inline bool VerifyScalarVariantVector(flatbuffers::Verifier &verifier, const flatbuffers::Vector> *values, const flatbuffers::Vector *types) { +inline bool 
VerifyScalarVariantVector(::flatbuffers::Verifier &verifier, const ::flatbuffers::Vector<::flatbuffers::Offset> *values, const ::flatbuffers::Vector *types) { if (!values || !types) return !values && !types; if (values->size() != types->size()) return false; - for (flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { + for (::flatbuffers::uoffset_t i = 0; i < values->size(); ++i) { if (!VerifyScalarVariant( verifier, values->Get(i), types->GetEnum(i))) { return false; diff --git a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h index 332bf86685e8a..1a796b25c5d1f 100644 --- a/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h +++ b/onnxruntime/core/providers/dml/DmlExecutionProvider/src/precomp.h @@ -39,7 +39,7 @@ #include #include "External/D3DX12/d3dx12.h" #endif -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" #include "GraphicsUnknownHelper.h" diff --git a/onnxruntime/core/providers/migraphx/ort_trt_int8_cal_table.fbs.h b/onnxruntime/core/providers/migraphx/ort_trt_int8_cal_table.fbs.h index 9639040f772da..a2721f6a5b44f 100644 --- a/onnxruntime/core/providers/migraphx/ort_trt_int8_cal_table.fbs.h +++ b/onnxruntime/core/providers/migraphx/ort_trt_int8_cal_table.fbs.h @@ -4,7 +4,7 @@ #define ONNXRUNTIME_CORE_PROVIDERS_MIGRAPHX_ORT_TRT_INT8_CAL_TABLE_FBS_H_ #include -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" namespace CalTableFlatBuffers { diff --git a/onnxruntime/core/providers/tensorrt/ort_trt_int8_cal_table.fbs.h b/onnxruntime/core/providers/tensorrt/ort_trt_int8_cal_table.fbs.h index 9e4324fb9f516..a2e027f56fbd9 100644 --- a/onnxruntime/core/providers/tensorrt/ort_trt_int8_cal_table.fbs.h +++ b/onnxruntime/core/providers/tensorrt/ort_trt_int8_cal_table.fbs.h @@ -3,7 +3,7 @@ #ifndef FLATBUFFERS_GENERATED_ORTTRTINT8CALTABLE_CALTABLEFLATBUFFERS_H_ #define 
FLATBUFFERS_GENERATED_ORTTRTINT8CALTABLE_CALTABLEFLATBUFFERS_H_ -#include "flatbuffers/flatbuffers.h" +#include "core/common/flatbuffers.h" namespace CalTableFlatBuffers { diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index 01be343795a56..c60b3e467d4f1 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.134 + version: 1.0.143 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.134 + version: 1.0.143 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. 
diff --git a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml index dd703f3199d9b..30e427a18509d 100644 --- a/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml +++ b/tools/ci_build/github/azure-pipelines/templates/jobs/win-ci-vs-2022-job.yml @@ -148,12 +148,9 @@ jobs: Get-Volume $("$(Build.BinariesDirectory)")[0] displayName: check disk size - - task: DeleteFiles@1 - displayName: 'Delete intermedia files from $(Build.BinariesDirectory)\${{ parameters.BuildConfig }}' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}' - Contents: | - **/*.obj + - powershell: | + Remove-Item "$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}" -Include "*.obj" -Recurse + displayName: 'Delete intermediate files from $(Build.BinariesDirectory)\${{ parameters.BuildConfig }}' - powershell: | Get-Volume $("$(Build.BinariesDirectory)")[0] @@ -221,14 +218,6 @@ jobs: workingDirectory: '$(Build.BinariesDirectory)\${{ parameters.BuildConfig }}\${{ parameters.BuildConfig }}' displayName: 'Run tests' - - task: PublishTestResults@2 - displayName: 'Publish unit test results' - inputs: - testResultsFiles: '**/*.results.xml' - searchFolder: '$(Build.BinariesDirectory)/${{ parameters.BuildConfig }}' - testRunTitle: 'Unit Test Run' - condition: succeededOrFailed() - - ${{ if eq(parameters.GenerateDocumentation, true) }}: - task: PythonScript@0 displayName: 'Generate documentation' @@ -251,4 +240,4 @@ jobs: condition: and(failed(), eq(variables['DocUpdateNeeded'], 'true')) inputs: pathtoPublish: '$(Build.SourcesDirectory)/docs/ContribOperators.md' - artifactName: 'ContribOperators.md' + artifactName: 'ContribOperators.md' \ No newline at end of file From bcf47d354646f67decdc04f7ebb427f8343faabc Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Thu, 14 Mar 2024 17:05:50 -0700 Subject: [PATCH 08/55] Update install_deps_lort.sh to fix 
onnxscript installation (#19922) Install onnxscript correctly with `pip install`. Dev dependencies are not required. ### Motivation and Context Fix build breaks. --- .../github/linux/docker/scripts/manylinux/install_deps_lort.sh | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_lort.sh b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_lort.sh index da8a45e00cc90..39c15338aeddb 100755 --- a/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_lort.sh +++ b/tools/ci_build/github/linux/docker/scripts/manylinux/install_deps_lort.sh @@ -31,8 +31,7 @@ cd /usr/local/ echo "Cloning ONNX Script" git clone --recursive https://github.com/microsoft/onnxscript.git cd onnxscript -/opt/python/cp39-cp39/bin/python3.9 -m pip install -r requirements-dev.txt -/opt/python/cp39-cp39/bin/python3.9 setup.py install +/opt/python/cp39-cp39/bin/python3.9 -m pip install . cd ~ && /opt/python/cp39-cp39/bin/python3.9 -c "import onnxscript; print(f'Installed ONNX Script: {onnxscript.__version__}')" cd /usr/local From 42399dfd2b248876bc184d653585f06ed088b229 Mon Sep 17 00:00:00 2001 From: Hariharan Seshadri Date: Thu, 14 Mar 2024 18:13:47 -0700 Subject: [PATCH 09/55] Fix a potential race in the CUDA TopK kernel (#19917) ### Description If the `K` value is flowing through as a tensor, we are updating a mutable member of the `TopK` class and basing the compute off that - which is likely to cause data race issues with concurrent Run() calls and `K` value changes. 
### Motivation and Context Fix potential race in CUDA TopK kernel --- onnxruntime/core/providers/cuda/math/topk.cc | 24 ++++++++++++++------ onnxruntime/core/providers/cuda/math/topk.h | 2 +- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/onnxruntime/core/providers/cuda/math/topk.cc b/onnxruntime/core/providers/cuda/math/topk.cc index d516537e25949..cf26e0acfa557 100644 --- a/onnxruntime/core/providers/cuda/math/topk.cc +++ b/onnxruntime/core/providers/cuda/math/topk.cc @@ -56,7 +56,7 @@ TopK::TopK(const OpKernelInfo& info) : CudaKernel(info) { info.GetAttrOrDefault("largest", &largest_, 1); info.GetAttrOrDefault("sorted", &sorted_, 1); if (!inputk) { - info.GetAttrOrDefault("k", &K_, 0); + info.GetAttrOrDefault("k", &attr_k_, 0); } } @@ -67,7 +67,7 @@ TopK::TopK(const OpKernelInfo& info) : CudaKernel(info) { static_cast(tensor_I->MutableDataRaw()), \ elem_nums_cuda, \ elem_nums.size(), \ - axis, K_, largest_, sorted_, N, dimension) + axis, k_value, largest_, sorted_, N, dimension) template Status TopK::ComputeInternal(OpKernelContext* ctx) const { @@ -77,19 +77,29 @@ Status TopK::ComputeInternal(OpKernelContext* ctx) const { int32_t axis = static_cast(axis_ < 0 ? rank + axis_ : axis_); ORT_ENFORCE(axis > -1 && axis < rank); + int64_t k_value = 0; if (inputk) { auto tensor_K = ctx->Input(1); ORT_ENFORCE(nullptr != tensor_K); - K_ = *tensor_K->Data(); - ORT_ENFORCE(K_ >= 0 && K_ <= tensor_X->Shape().GetDims()[axis]); + k_value = *tensor_K->Data(); + } else { // from attribute + k_value = attr_k_; } - auto output_shape = tensor_X->Shape(); - output_shape[axis] = K_; + // Now that we know the value of 'K' and the input shape, + // make a final validation before going to the implementation + const auto& input_shape = tensor_X->Shape(); + if ((k_value < 0) || (k_value > input_shape.GetDims()[axis])) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Value of K outside range. K value: ", k_value, + ". Input shape: ", input_shape, " . 
Axis: ", axis); + } + + auto output_shape = input_shape; + output_shape[axis] = k_value; auto tensor_V = ctx->Output(0, output_shape); auto tensor_I = ctx->Output(1, output_shape); - if (0 == K_) { + if (output_shape.Size() == 0) { // Bail out early if the output is going to be empty return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/math/topk.h b/onnxruntime/core/providers/cuda/math/topk.h index 9dec13ad2a930..5731df3130c5a 100644 --- a/onnxruntime/core/providers/cuda/math/topk.h +++ b/onnxruntime/core/providers/cuda/math/topk.h @@ -17,7 +17,7 @@ class TopK final : public CudaKernel { int64_t axis_; int64_t largest_; int64_t sorted_; - mutable int64_t K_; + int64_t attr_k_; }; } // namespace cuda } // namespace onnxruntime From 0b2a75b274e45c7a510bfdae9071a97a69e75618 Mon Sep 17 00:00:00 2001 From: Yifan Li <109183385+yf711@users.noreply.github.com> Date: Fri, 15 Mar 2024 23:41:21 +0900 Subject: [PATCH 10/55] [EP Perf] Add concurrency test (#19804) ### Description * Add concurrency test to EP Perf CI panel (impl. 
by onnx_test_runner) * Model: FasterRCNN-10 model within CI image * `-c` param configurable via CI panel when kicking off CI tasks * Auto-replicate test input/outputs according to `-c` param * By default, the model test will be executed in 100 iterations (~2min added to T4 CI task load overall) ### Motivation and Context To monitor potential concurrency issues of ORT-TRT --- .../tools/tensorrt/perf/mem_test/run.sh | 23 ++++++- .../perf/mem_test/run_mem_test_docker.sh | 5 +- .../python/tools/tensorrt/perf/post.py | 61 +++++++++++++++---- onnxruntime/test/onnx/main.cc | 5 -- ...linux-gpu-tensorrt-daily-perf-pipeline.yml | 17 ++++-- 5 files changed, 86 insertions(+), 25 deletions(-) diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh index dd53fe6127462..2cfdd39bc96aa 100755 --- a/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh +++ b/onnxruntime/python/tools/tensorrt/perf/mem_test/run.sh @@ -4,13 +4,14 @@ set -x -while getopts p:o:l:s: parameter +while getopts p:o:l:s:c: parameter do case "${parameter}" in p) WORKSPACE=${OPTARG};; o) ORT_BINARY_PATH=${OPTARG};; l) BUILD_ORT_LATEST=${OPTARG};; s) ORT_SOURCE=${OPTARG};; +c) CONCURRENCY=${OPTARG};; esac done @@ -104,6 +105,26 @@ fi mv valgrind.log result +# Concurrency Test +FRCNN_FOLDER="/data/ep-perf-models/onnx-zoo-models/FasterRCNN-10/" + +mkdir FasterRCNN-10/ +cp -r ${FRCNN_FOLDER}/test_data_set_0 ${FRCNN_FOLDER}/faster_rcnn_R_50_FPN_1x.onnx ./FasterRCNN-10/ + +# replicate test inputs +for (( i=1; i concurrency_test.log 2>&1 +mv concurrency_test.log result + # Run AddressSanitizer ASAN_OPTIONS=${ASAN_OPTIONS} ./onnx_memtest diff --git a/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh b/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh index 4e94c63ee6c25..a355e4cf5d365 100755 --- a/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh +++ 
b/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh @@ -3,13 +3,14 @@ set -x # Parse Arguments -while getopts w:d:p:l: parameter +while getopts w:d:p:l:c: parameter do case "${parameter}" in w) WORKSPACE=${OPTARG};; # workspace folder of onnxruntime d) DOCKER_IMAGE=${OPTARG};; # docker image:"trt-ep-mem-test" docker image is already pre-built on perf machine p) MEM_TEST_DIR=${OPTARG};; # mem test dir l) BUILD_ORT_LATEST=${OPTARG};; # whether to build latest ORT +c) CONCURRENCY=${OPTARG};; esac done @@ -24,4 +25,4 @@ then BUILD_ORT_LATEST="true" fi -docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST +docker run --rm --gpus all -v $MEM_TEST_DIR:$DOCKER_MEM_TEST_DIR -v /data/ep-perf-models:/data/ep-perf-models $DOCKER_IMAGE /bin/bash $DOCKER_MEM_TEST_DIR'run.sh' -p $DOCKER_MEM_TEST_DIR -o $DOCKER_ORT_LIBS -s $DOCKER_ORT_SOURCE -l $BUILD_ORT_LATEST -c $CONCURRENCY diff --git a/onnxruntime/python/tools/tensorrt/perf/post.py b/onnxruntime/python/tools/tensorrt/perf/post.py index 363fa3a96d283..df389ad572596 100644 --- a/onnxruntime/python/tools/tensorrt/perf/post.py +++ b/onnxruntime/python/tools/tensorrt/perf/post.py @@ -3,6 +3,7 @@ # Licensed under the MIT License. 
# -------------------------------------------------------------------------- import argparse +import csv import datetime import os import sys @@ -419,10 +420,11 @@ def main(): upload_time = datetime.datetime.now(tz=datetime.timezone.utc).replace(microsecond=0) try: + # Load EP Perf test results from /result result_file = args.report_folder - - folders = os.listdir(result_file) - os.chdir(result_file) + result_perf_test_path = os.path.join(result_file, "result") + folders = os.listdir(result_perf_test_path) + os.chdir(result_perf_test_path) tables = [ fail_name, @@ -445,13 +447,13 @@ def main(): for model_group in folders: os.chdir(model_group) csv_filenames = os.listdir() - for csv in csv_filenames: - table = pd.read_csv(csv) - if session_name in csv: + for csv_file in csv_filenames: + table = pd.read_csv(csv_file) + if session_name in csv_file: table_results[session_name] = pd.concat( [table_results[session_name], get_session(table, model_group)], ignore_index=True ) - elif specs_name in csv: + elif specs_name in csv_file: table_results[specs_name] = pd.concat( [ table_results[specs_name], @@ -459,12 +461,12 @@ def main(): ], ignore_index=True, ) - elif fail_name in csv: + elif fail_name in csv_file: table_results[fail_name] = pd.concat( [table_results[fail_name], get_failures(table, model_group)], ignore_index=True, ) - elif latency_name in csv: + elif latency_name in csv_file: table_results[memory_name] = pd.concat( [table_results[memory_name], get_memory(table, model_group)], ignore_index=True, @@ -474,11 +476,11 @@ def main(): [table_results[latency_name], get_latency(table, model_group)], ignore_index=True, ) - elif status_name in csv: + elif status_name in csv_file: table_results[status_name] = pd.concat( [table_results[status_name], get_status(table, model_group)], ignore_index=True ) - elif op_metrics_name in csv: + elif op_metrics_name in csv_file: table = table.assign(Group=model_group) table_results[op_metrics_name] = pd.concat( 
[table_results[op_metrics_name], table], ignore_index=True @@ -512,6 +514,43 @@ def main(): args.commit_datetime, ) + # Load concurrency test results + result_mem_test_path = os.path.join(result_file, "result_mem_test") + os.chdir(result_mem_test_path) + log_path = "concurrency_test.log" + if os.path.exists(log_path): + print("Generating concurrency test report") + with open(log_path) as log_file: + log_content = log_file.read() + + failed_cases_section = log_content.split("Failed Test Cases:")[1] + + # passed = 1 if no failed test cases + if failed_cases_section.strip() == "": + passed = 1 + else: + passed = 0 + + csv_path = "concurrency_test.csv" + with open(csv_path, "w", newline="") as csv_file: + csv_writer = csv.writer(csv_file) + csv_writer.writerow(["Passed", "Log"]) + csv_writer.writerow([passed, log_content]) + + db_table_name = "ep_concurrencytest_record" + table = pd.read_csv(csv_path) + write_table( + ingest_client, + args.database, + table, + db_table_name, + upload_time, + identifier, + args.branch, + args.commit_hash, + args.commit_datetime, + ) + except BaseException as e: print(str(e)) sys.exit(1) diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 5a2104ffeb0da..9c2c24e3c337d 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -341,11 +341,6 @@ int real_main(int argc, char* argv[], Ort::Env& env) { logging_level = ORT_LOGGING_LEVEL_VERBOSE; } - if (concurrent_session_runs > 1 && repeat_count > 1) { - fprintf(stderr, "when you use '-r [repeat]', please set '-c' to 1\n"); - usage(); - return -1; - } argc -= optind; argv += optind; if (argc < 1) { diff --git a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml index 15f558e6f9ef0..af2d722a6b90c 100644 --- a/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml +++ 
b/tools/ci_build/github/azure-pipelines/linux-gpu-tensorrt-daily-perf-pipeline.yml @@ -28,10 +28,15 @@ parameters: - "partner-models" - name: MemTest - displayName: Run Memory Test + displayName: Run Memory Test and Concurrency Test type: boolean default: true +- name: ConcurrencyTest + displayName: Specifies the number of concurrency model test to invoke simultaneously + type: string + default: 2 + - name: TrtEPOptions displayName: TensorRT EP options type: object @@ -107,8 +112,8 @@ jobs: workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/build' - ${{ if eq(parameters.MemTest, true) }}: - - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false' - displayName: 'Run Memory Test' + - script: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/run_mem_test_docker.sh -d $(image) -p $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/ -w /code/ -l false -c ${{ parameters.ConcurrencyTest }}' + displayName: 'Run Memory Test and Concurrency Test' workingDirectory: '$(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/mem_test/' - ${{ each option in parameters.ModelGroups }}: @@ -152,7 +157,7 @@ jobs: displayName: 'Check and Install Azure CLI' - task: AzureCLI@2 - displayName: 'Azure CLI Post to Dashboard' + displayName: 'Post EP Perf Results to Dashboard' inputs: azureSubscription: AIInfraBuildOnnxRuntimeOSS scriptLocation: inlineScript @@ -160,8 +165,8 @@ jobs: inlineScript: | short_hash=$(git rev-parse --short HEAD) && commit_date=$(git log -1 --date=iso-strict --pretty=format:%cd) && - python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact/result -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) 
--kusto_conn $(kustoConn) --database $(database) $(parser) - + python3 $(Build.SourcesDirectory)/onnxruntime/python/tools/tensorrt/perf/post.py -r $(Build.SourcesDirectory)/Artifact -c $short_hash -d $commit_date -u "$(reportUrl)?buildId=$(Build.BuildId)" -t $(trtVersion) -b $(branchName) --kusto_conn $(kustoConn) --database $(database) $(parser) + - template: templates/component-governance-component-detection-steps.yml parameters : condition : 'succeeded' From 79e50aeef3d99177867c07e38a574cf641fe6c22 Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 15 Mar 2024 11:47:45 -0700 Subject: [PATCH 11/55] [js/web] rewrite backend resolve to allow multiple EPs (#19735)

### Description

This PR rewrites the backend resolve logic to support specifying multiple EPs.

#### Backend

The first version of ONNX Runtime Web actually carried some existing code from [ONNX.js](https://github.com/microsoft/onnxjs), which includes the "backend" concept. The original "backend" in ONNX.js is designed on the assumption that only one backend from the user's backend hint list will be used. For example, in ONNX.js, if the user specifies a backend hint as `['webgl', 'wasm']`, ONNX.js will first try to use the WebGL backend - if it loads successfully (the browser supports webgl), then the "webgl" backend will be used and "wasm" will be ignored; otherwise, "webgl" will be ignored and ONNX.js will try to load the "wasm" backend. In short: only one backend will be used when initializing a session.

#### Execution Provider

Execution Provider, or EP, in ONNX Runtime is a different concept. One of the differences is that users are allowed to specify multiple EPs, and if one does not support a particular kernel, it can fall back to another EP. This is a very common case when using a GPU EP in ONNX Runtime.

#### Current Status: Backend vs. EP

Because of the historical reasons mentioned above, the current status is quite confusing.
There are **real backend**s, which are different implementations in code; there are **backend hint**s, which are the string names used to request a backend; and there are **EP**s, which are an ONNX Runtime concept. Currently there are only 2 **backend**s in our code base: the "onnxjs backend" and the "wasm backend". The "onnxjs backend" currently only powers the backend hint "webgl", which goes into the old onnx.js code path. All other backend hints, including "wasm", "cpu" (alias of "wasm"), "webgpu" and "webnn", are powered by the "wasm backend". Because ORT Web treats "backend" as an internal concept and wants to align with ONNX Runtime, the names of those backend hints are becoming EP names. The following table shows today's status:

| Execution Provider Name (public) / Backend Hint (internal) | Backend | EP in ORT |
| -------- | ------- | ------- |
| "wasm"/"cpu" | WasmBackend | CPU EP |
| "webgl" | OnnxjsBackend | \* technically not an EP |
| "webgpu" | WasmBackend | JSEP |
| "webnn" | WasmBackend | WebNN EP |

#### Problem

While the API allows specifying multiple EPs, backend resolution only allows one backend. This causes issues when the user specifies multiple EP names in session options: the backend resolve behavior and the EP registration behavior are inconsistent. Specifically, in this issue: https://github.com/microsoft/onnxruntime/issues/15796#issuecomment-1925363908: the EP list `['webgpu', 'wasm']` on a browser without WebGPU support resolves to the 'wasm' backend, but the full EP list is passed in session options, so JSEP is still enabled, causing the runtime error.

#### Solution

Since we still need the WebGL backend, we cannot totally remove the backend register/resolve system. In this PR I made the following changes:
- initialize every backend from the EP list, instead of only doing that for the first successful one.
- for the first resolved backend, filter all EPs using the exact same backend.
Remove all EPs not using this backend from session options - for every explicitly specified EP, if it's removed, show a warning message in console --- js/common/lib/backend-impl.ts | 121 +++++++++--- js/common/lib/inference-session-impl.ts | 10 +- js/common/lib/training-session-impl.ts | 11 +- js/web/lib/wasm/binding/ort-wasm.d.ts | 240 +++++++++++++----------- js/web/lib/wasm/jsep/init.ts | 38 ++-- js/web/lib/wasm/proxy-wrapper.ts | 2 +- js/web/lib/wasm/wasm-core-impl.ts | 76 +++++--- onnxruntime/wasm/js_internal_api.js | 82 ++++---- 8 files changed, 348 insertions(+), 232 deletions(-) diff --git a/js/common/lib/backend-impl.ts b/js/common/lib/backend-impl.ts index 3e1e833addb91..e90efd7b97c29 100644 --- a/js/common/lib/backend-impl.ts +++ b/js/common/lib/backend-impl.ts @@ -2,6 +2,7 @@ // Licensed under the MIT License. import {Backend} from './backend.js'; +import {InferenceSession} from './inference-session.js'; interface BackendInfo { backend: Backend; @@ -10,6 +11,7 @@ interface BackendInfo { initPromise?: Promise; initialized?: boolean; aborted?: boolean; + error?: string; } const backends: Map = new Map(); @@ -60,43 +62,100 @@ export const registerBackend = (name: string, backend: Backend, priority: number }; /** - * Resolve backend by specified hints. + * Try to resolve and initialize a backend. * - * @param backendHints - a list of execution provider names to lookup. If omitted use registered backends as list. - * @returns a promise that resolves to the backend. + * @param backendName - the name of the backend. + * @returns the backend instance if resolved and initialized successfully, or an error message if failed. 
+ */ +const tryResolveAndInitializeBackend = async(backendName: string): Promise => { + const backendInfo = backends.get(backendName); + if (!backendInfo) { + return 'backend not found.'; + } + + if (backendInfo.initialized) { + return backendInfo.backend; + } else if (backendInfo.aborted) { + return backendInfo.error!; + } else { + const isInitializing = !!backendInfo.initPromise; + try { + if (!isInitializing) { + backendInfo.initPromise = backendInfo.backend.init(backendName); + } + await backendInfo.initPromise; + backendInfo.initialized = true; + return backendInfo.backend; + } catch (e) { + if (!isInitializing) { + backendInfo.error = `${e}`; + backendInfo.aborted = true; + } + return backendInfo.error!; + } finally { + delete backendInfo.initPromise; + } + } +}; + +/** + * Resolve execution providers from the specific session options. + * + * @param options - the session options object. + * @returns a promise that resolves to a tuple of an initialized backend instance and a session options object with + * filtered EP list. * * @ignore */ -export const resolveBackend = async(backendHints: readonly string[]): Promise => { - const backendNames = backendHints.length === 0 ? backendsSortedByPriority : backendHints; - const errors = []; - for (const backendName of backendNames) { - const backendInfo = backends.get(backendName); - if (backendInfo) { - if (backendInfo.initialized) { - return backendInfo.backend; - } else if (backendInfo.aborted) { - continue; // current backend is unavailable; try next - } +export const resolveBackendAndExecutionProviders = async(options: InferenceSession.SessionOptions): + Promise<[backend: Backend, options: InferenceSession.SessionOptions]> => { + // extract backend hints from session options + const eps = options.executionProviders || []; + const backendHints = eps.map(i => typeof i === 'string' ? i : i.name); + const backendNames = backendHints.length === 0 ? 
backendsSortedByPriority : backendHints; - const isInitializing = !!backendInfo.initPromise; - try { - if (!isInitializing) { - backendInfo.initPromise = backendInfo.backend.init(backendName); + // try to resolve and initialize all requested backends + let backend: Backend|undefined; + const errors = []; + const availableBackendNames = new Set(); + for (const backendName of backendNames) { + const resolveResult = await tryResolveAndInitializeBackend(backendName); + if (typeof resolveResult === 'string') { + errors.push({name: backendName, err: resolveResult}); + } else { + if (!backend) { + backend = resolveResult; + } + if (backend === resolveResult) { + availableBackendNames.add(backendName); + } } - await backendInfo.initPromise; - backendInfo.initialized = true; - return backendInfo.backend; - } catch (e) { - if (!isInitializing) { - errors.push({name: backendName, err: e}); + } + + // if no backend is available, throw error. + if (!backend) { + throw new Error(`no available backend found. ERR: ${errors.map(e => `[${e.name}] ${e.err}`).join(', ')}`); + } + + // for each explicitly requested backend, if it's not available, output warning message. + for (const {name, err} of errors) { + if (backendHints.includes(name)) { + // eslint-disable-next-line no-console + console.warn(`removing requested execution provider "${ + name}" from session options because it is not available: ${err}`); } - backendInfo.aborted = true; - } finally { - delete backendInfo.initPromise; } - } - } - throw new Error(`no available backend found. ERR: ${errors.map(e => `[${e.name}] ${e.err}`).join(', ')}`); -}; + const filteredEps = eps.filter(i => availableBackendNames.has(typeof i === 'string' ? 
i : i.name)); + + return [ + backend, new Proxy(options, { + get: (target, prop) => { + if (prop === 'executionProviders') { + return filteredEps; + } + return Reflect.get(target, prop); + } + }) + ]; + }; diff --git a/js/common/lib/inference-session-impl.ts b/js/common/lib/inference-session-impl.ts index 55f40c8907a89..ab4c6a3e0c46b 100644 --- a/js/common/lib/inference-session-impl.ts +++ b/js/common/lib/inference-session-impl.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. -import {resolveBackend} from './backend-impl.js'; +import {resolveBackendAndExecutionProviders} from './backend-impl.js'; import {InferenceSessionHandler} from './backend.js'; import {InferenceSession as InferenceSessionInterface} from './inference-session.js'; import {OnnxValue} from './onnx-value.js'; @@ -195,11 +195,9 @@ export class InferenceSession implements InferenceSessionInterface { throw new TypeError('Unexpected argument[0]: must be \'path\' or \'buffer\'.'); } - // get backend hints - const eps = options.executionProviders || []; - const backendHints = eps.map(i => typeof i === 'string' ? i : i.name); - const backend = await resolveBackend(backendHints); - const handler = await backend.createInferenceSessionHandler(filePathOrUint8Array, options); + // resolve backend, update session options with validated EPs, and create session handler + const [backend, optionsWithValidatedEPs] = await resolveBackendAndExecutionProviders(options); + const handler = await backend.createInferenceSessionHandler(filePathOrUint8Array, optionsWithValidatedEPs); TRACE_FUNC_END(); return new InferenceSession(handler); } diff --git a/js/common/lib/training-session-impl.ts b/js/common/lib/training-session-impl.ts index 23bd4421ae672..bae38b0dfda5a 100644 --- a/js/common/lib/training-session-impl.ts +++ b/js/common/lib/training-session-impl.ts @@ -1,7 +1,7 @@ // Copyright (c) Microsoft Corporation. All rights reserved. 
// Licensed under the MIT License. -import {resolveBackend} from './backend-impl.js'; +import {resolveBackendAndExecutionProviders} from './backend-impl.js'; import {SessionHandler, TrainingSessionHandler} from './backend.js'; import {InferenceSession as InferenceSession} from './inference-session.js'; import {OnnxValue} from './onnx-value.js'; @@ -55,13 +55,12 @@ export class TrainingSession implements TrainingSessionInterface { const optimizerModel: string|Uint8Array = trainingOptions.optimizerModel || ''; const options: SessionOptions = sessionOptions || {}; - // get backend hints - const eps = options.executionProviders || []; - const backendHints = eps.map(i => typeof i === 'string' ? i : i.name); - const backend = await resolveBackend(backendHints); + // resolve backend, update session options with validated EPs, and create session handler + const [backend, optionsWithValidatedEPs] = await resolveBackendAndExecutionProviders(options); if (backend.createTrainingSessionHandler) { const handler = await backend.createTrainingSessionHandler( - trainingOptions.checkpointState, trainingOptions.trainModel, evalModel, optimizerModel, options); + trainingOptions.checkpointState, trainingOptions.trainModel, evalModel, optimizerModel, + optionsWithValidatedEPs); return new TrainingSession(handler, !!trainingOptions.optimizerModel, !!trainingOptions.evalModel); } else { throw new Error(noBackendErrMsg); diff --git a/js/web/lib/wasm/binding/ort-wasm.d.ts b/js/web/lib/wasm/binding/ort-wasm.d.ts index 5dd715191c830..56925b728e9a3 100644 --- a/js/web/lib/wasm/binding/ort-wasm.d.ts +++ b/js/web/lib/wasm/binding/ort-wasm.d.ts @@ -16,20 +16,97 @@ export declare namespace JSEP { type CaptureBeginFunction = () => void; type CaptureEndFunction = () => void; type ReplayFunction = () => void; -} -export interface OrtWasmModule extends EmscriptenModule { - // #region emscripten functions - stackSave(): number; - stackRestore(stack: number): void; - stackAlloc(size: number): number; - 
- UTF8ToString(offset: number, maxBytesToRead?: number): string; - lengthBytesUTF8(str: string): number; - stringToUTF8(str: string, offset: number, maxBytes: number): void; - // #endregion + export interface Module extends WebGpuModule { + /** + * Mount the external data file to an internal map, which will be used during session initialization. + * + * @param externalDataFilePath - specify the relative path of the external data file. + * @param externalDataFileData - specify the content data. + */ + mountExternalData(externalDataFilePath: string, externalDataFileData: Uint8Array): void; + /** + * Unmount all external data files from the internal map. + */ + unmountExternalData(): void; + + /** + * This is the entry of JSEP initialization. This function is called once when initializing ONNX Runtime per + * backend. This function initializes Asyncify support. If name is 'webgpu', also initializes WebGPU backend and + * registers a few callbacks that will be called in C++ code. + */ + jsepInit(name: 'webgpu', initParams: [ + backend: BackendType, alloc: AllocFunction, free: FreeFunction, upload: UploadFunction, + download: DownloadFunction, createKernel: CreateKernelFunction, releaseKernel: ReleaseKernelFunction, + run: RunFunction, captureBegin: CaptureBeginFunction, captureEnd: CaptureEndFunction, replay: ReplayFunction + ]): void; + jsepInit(name: 'webnn', initParams?: never): void; + } + + export interface WebGpuModule { + /** + * [exported from wasm] Specify a kernel's output when running OpKernel::Compute(). + * + * @param context - specify the kernel context pointer. + * @param index - specify the index of the output. + * @param data - specify the pointer to encoded data of type and dims. + */ + _JsepOutput(context: number, index: number, data: number): number; + /** + * [exported from wasm] Get name of an operator node. + * + * @param kernel - specify the kernel pointer. + * @returns the pointer to a C-style UTF8 encoded string representing the node name. 
+ */ + _JsepGetNodeName(kernel: number): number; + + /** + * [exported from js_internal_api.js] Register a user GPU buffer for usage of a session's input or output. + * + * @param sessionId - specify the session ID. + * @param index - specify an integer to represent which input/output it is registering for. For input, it is the + * input_index corresponding to the session's inputNames. For output, it is the inputCount + output_index + * corresponding to the session's ouputNames. + * @param buffer - specify the GPU buffer to register. + * @param size - specify the original data size in byte. + * @returns the GPU data ID for the registered GPU buffer. + */ + jsepRegisterBuffer: (sessionId: number, index: number, buffer: GPUBuffer, size: number) => number; + /** + * [exported from js_internal_api.js] Get the GPU buffer by GPU data ID. + * + * @param dataId - specify the GPU data ID + * @returns the GPU buffer. + */ + jsepGetBuffer: (dataId: number) => GPUBuffer; + /** + * [exported from js_internal_api.js] Create a function to be used to create a GPU Tensor. + * + * @param gpuBuffer - specify the GPU buffer + * @param size - specify the original data size in byte. + * @param type - specify the tensor type. + * @returns the generated downloader function. + */ + jsepCreateDownloader: + (gpuBuffer: GPUBuffer, size: number, + type: Tensor.GpuBufferDataTypes) => () => Promise; + /** + * [exported from js_internal_api.js] Called when InferenceSession.run started. This function will be called before + * _OrtRun[WithBinding]() is called. + * @param sessionId - specify the session ID. + */ + jsepOnRunStart: (sessionId: number) => void; + /** + * [exported from js_internal_api.js] Release a session. This function will be called before _OrtReleaseSession() is + * called. + * @param sessionId - specify the session ID. 
+ * @returns + */ + jsepOnReleaseSession: (sessionId: number) => void; + } +} - // #region ORT APIs +export interface OrtInferenceAPIs { _OrtInit(numThreads: number, loggingLevel: number): number; _OrtGetLastError(errorCodeOffset: number, errorMessageOffset: number): void; @@ -74,126 +151,61 @@ export interface OrtWasmModule extends EmscriptenModule { _OrtReleaseRunOptions(runOptionsHandle: number): void; _OrtEndProfiling(sessionHandle: number): number; - // #endregion +} + +export interface OrtTrainingAPIs { + _OrtTrainingLoadCheckpoint(dataOffset: number, dataLength: number): number; - // #region ORT Training APIs - _OrtTrainingLoadCheckpoint?(dataOffset: number, dataLength: number): number; + _OrtTrainingReleaseCheckpoint(checkpointHandle: number): void; - _OrtTrainingReleaseCheckpoint?(checkpointHandle: number): void; + _OrtTrainingCreateSession( + sessionOptionsHandle: number, checkpointHandle: number, trainOffset: number, trainLength: number, + evalOffset: number, evalLength: number, optimizerOffset: number, optimizerLength: number): number; - _OrtTrainingCreateSession? - (sessionOptionsHandle: number, checkpointHandle: number, trainOffset: number, trainLength: number, - evalOffset: number, evalLength: number, optimizerOffset: number, optimizerLength: number): number; + _OrtTrainingLazyResetGrad(trainingHandle: number): number; - _OrtTrainingLazyResetGrad?(trainingHandle: number): number; + _OrtTrainingRunTrainStep( + trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, + runOptionsHandle: number): number; - _OrtTrainingRunTrainStep? 
- (trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, - runOptionsHandle: number): number; + _OrtTrainingOptimizerStep(trainingHandle: number, runOptionsHandle: number): number; - _OrtTrainingOptimizerStep?(trainingHandle: number, runOptionsHandle: number): number; + _OrtTrainingEvalStep( + trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, + runOptionsHandle: number): number; - _OrtTrainingEvalStep? - (trainingHandle: number, inputsOffset: number, inputCount: number, outputsOffset: number, outputCount: number, - runOptionsHandle: number): number; + _OrtTrainingGetParametersSize(trainingHandle: number, paramSizeT: number, trainableOnly: boolean): number; - _OrtTrainingGetParametersSize?(trainingHandle: number, paramSizeT: number, trainableOnly: boolean): number; + _OrtTrainingCopyParametersToBuffer( + trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; - _OrtTrainingCopyParametersToBuffer? - (trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; + _OrtTrainingCopyParametersFromBuffer( + trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; - _OrtTrainingCopyParametersFromBuffer? - (trainingHandle: number, parametersBuffer: number, parameterCount: number, trainableOnly: boolean): number; + _OrtTrainingGetModelInputOutputCount( + trainingHandle: number, inputCount: number, outputCount: number, isEvalModel: boolean): number; + _OrtTrainingGetModelInputOutputName(trainingHandle: number, index: number, isInput: boolean, isEvalModel: boolean): + number; + + _OrtTrainingReleaseSession(trainingHandle: number): void; +} - _OrtTrainingGetModelInputOutputCount? - (trainingHandle: number, inputCount: number, outputCount: number, isEvalModel: boolean): number; - _OrtTrainingGetModelInputOutputName? 
- (trainingHandle: number, index: number, isInput: boolean, isEvalModel: boolean): number; +export interface OrtWasmModule extends EmscriptenModule, OrtInferenceAPIs, Partial, + Partial { + // #region emscripten functions + stackSave(): number; + stackRestore(stack: number): void; + stackAlloc(size: number): number; - _OrtTrainingReleaseSession?(trainingHandle: number): void; + UTF8ToString(offset: number, maxBytesToRead?: number): string; + lengthBytesUTF8(str: string): number; + stringToUTF8(str: string, offset: number, maxBytes: number): void; // #endregion // #region config numThreads?: number; mainScriptUrlOrBlob?: string|Blob; // #endregion - - // #region external data API - mountExternalData?(externalDataFilePath: string, externalDataFileData: Uint8Array): void; - unmountExternalData?(): void; - // #endregion - - // #region JSEP - /** - * This is the entry of JSEP initialization. This function is called once when initializing ONNX Runtime. - * This function initializes WebGPU backend and registers a few callbacks that will be called in C++ code. - */ - jsepInit? - (backend: JSEP.BackendType, alloc: JSEP.AllocFunction, free: JSEP.FreeFunction, upload: JSEP.UploadFunction, - download: JSEP.DownloadFunction, createKernel: JSEP.CreateKernelFunction, - releaseKernel: JSEP.ReleaseKernelFunction, run: JSEP.RunFunction, captureBegin: JSEP.CaptureBeginFunction, - captureEnd: JSEP.CaptureEndFunction, replay: JSEP.ReplayFunction): void; - - /** - * [exported from wasm] Specify a kernel's output when running OpKernel::Compute(). - * - * @param context - specify the kernel context pointer. - * @param index - specify the index of the output. - * @param data - specify the pointer to encoded data of type and dims. - */ - _JsepOutput(context: number, index: number, data: number): number; - /** - * [exported from wasm] Get name of an operator node. - * - * @param kernel - specify the kernel pointer. 
- * @returns the pointer to a C-style UTF8 encoded string representing the node name. - */ - _JsepGetNodeName(kernel: number): number; - - /** - * [exported from js_internal_api.js] Register a user GPU buffer for usage of a session's input or output. - * - * @param sessionId - specify the session ID. - * @param index - specify an integer to represent which input/output it is registering for. For input, it is the - * input_index corresponding to the session's inputNames. For output, it is the inputCount + output_index - * corresponding to the session's ouputNames. - * @param buffer - specify the GPU buffer to register. - * @param size - specify the original data size in byte. - * @returns the GPU data ID for the registered GPU buffer. - */ - jsepRegisterBuffer: (sessionId: number, index: number, buffer: GPUBuffer, size: number) => number; - /** - * [exported from js_internal_api.js] Get the GPU buffer by GPU data ID. - * - * @param dataId - specify the GPU data ID - * @returns the GPU buffer. - */ - jsepGetBuffer: (dataId: number) => GPUBuffer; - /** - * [exported from js_internal_api.js] Create a function to be used to create a GPU Tensor. - * - * @param gpuBuffer - specify the GPU buffer - * @param size - specify the original data size in byte. - * @param type - specify the tensor type. - * @returns the generated downloader function. - */ - jsepCreateDownloader: - (gpuBuffer: GPUBuffer, size: number, - type: Tensor.GpuBufferDataTypes) => () => Promise; - /** - * [exported from js_internal_api.js] Called when InferenceSession.run started. This function will be called before - * _OrtRun[WithBinding]() is called. - * @param sessionId - specify the session ID. - */ - jsepOnRunStart: (sessionId: number) => void; - /** - * [exported from js_internal_api.js] Release a session. This function will be called before _OrtReleaseSession() is - * called. - * @param sessionId - specify the session ID. 
- * @returns - */ - jsepOnReleaseSession: (sessionId: number) => void; - // #endregion } declare const moduleFactory: EmscriptenModuleFactory; diff --git a/js/web/lib/wasm/jsep/init.ts b/js/web/lib/wasm/jsep/init.ts index 4936b94ef7a86..adcaa145cdca8 100644 --- a/js/web/lib/wasm/jsep/init.ts +++ b/js/web/lib/wasm/jsep/init.ts @@ -121,7 +121,7 @@ class ComputeContextImpl implements ComputeContext { for (let i = 0; i < dims.length; i++) { this.module.HEAPU32[offset++] = dims[i]; } - return this.module._JsepOutput(this.opKernelContext, index, data); + return this.module._JsepOutput!(this.opKernelContext, index, data); } catch (e) { throw new Error( `Failed to generate kernel's output[${index}] with dims [${dims}]. ` + @@ -136,27 +136,39 @@ class ComputeContextImpl implements ComputeContext { /** * Initialize JSEP with WebGPU backend. * - * This function will be called only once after the WebAssembly module is loaded and initialized ("_OrtInit" is called). - * This function expects: + * This function will be called after the WebAssembly module is loaded and initialized ("_OrtInit" is called), once for + * each of the following EPs if they are specified: + * - "webgpu" + * - "webnn" + * + * For WebGPU, this function expects: * - WebGPU is enabled in build (BUILD_DEFS.DISABLE_WEBGPU === false). * - WebGPU is available in current environment. (a valid GPUAdapter is passed in) + * + * For WebNN, this function expects: + * - WebNN is enabled in build (BUILD_DEFS.DISABLE_WEBGPU === false). + * - WebNN is available in current environment. (navigator.ml is not undefined) + * * If the WebAssembly module is not built with JSEP support, this function will throw an error. This will invalidate - * 'webgpu' backend. + * 'webgpu'/'webnn' backend. 
* + * @param name - the name of the EP, either "webgpu" or "webnn" * @param module - the ORT WebAssembly module * @param env - the ORT environment variable (ort.env) * @param gpuAdapter - the pre-created GPU adapter */ -export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapter): Promise => { +export const init = + async(name: 'webgpu'|'webnn', module: OrtWasmModule, env: Env, gpuAdapter?: GPUAdapter): Promise => { const jsepInit = module.jsepInit; if (!jsepInit) { throw new Error('Failed to initialize JSEP. The WebAssembly module is not built with JSEP support.'); } - const backend = new WebGpuBackend(); - await backend.initialize(env, gpuAdapter); + if (name === 'webgpu') { + const backend = new WebGpuBackend(); + await backend.initialize(env, gpuAdapter!); - jsepInit( + jsepInit('webgpu', [ // backend backend, @@ -190,8 +202,8 @@ export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapte }, // jsepCreateKernel - (kernelType: string, kernelId: number, attribute: unknown) => - backend.createKernel(kernelType, kernelId, attribute, module.UTF8ToString(module._JsepGetNodeName(kernelId))), + (kernelType: string, kernelId: number, attribute: unknown) => backend.createKernel( + kernelType, kernelId, attribute, module.UTF8ToString(module._JsepGetNodeName!(kernelId))), // jsepReleaseKernel (kernel: number) => backend.releaseKernel(kernel), @@ -210,5 +222,9 @@ export const init = async(module: OrtWasmModule, env: Env, gpuAdapter: GPUAdapte // jsepCaptureEnd () => backend.captureEnd(), // jsepReplay - () => backend.replay()); + () => backend.replay() + ]); + } else { + jsepInit('webnn'); + } }; diff --git a/js/web/lib/wasm/proxy-wrapper.ts b/js/web/lib/wasm/proxy-wrapper.ts index 86017a4ec6904..6ff4e86b1235e 100644 --- a/js/web/lib/wasm/proxy-wrapper.ts +++ b/js/web/lib/wasm/proxy-wrapper.ts @@ -155,7 +155,7 @@ export const createSession = ensureWorker(); return new Promise((resolve, reject) => { enqueueCallbacks('create', 
[resolve, reject]); - const message: OrtWasmMessage = {type: 'create', in : {model, options}}; + const message: OrtWasmMessage = {type: 'create', in : {model, options: {...options}}}; const transferable: Transferable[] = []; if (model instanceof Uint8Array) { transferable.push(model.buffer); diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index afab9ba00b0c4..7019758be0efd 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -84,35 +84,44 @@ export const initRuntime = async(env: Env): Promise => { * @param epName */ export const initEp = async(env: Env, epName: string): Promise => { - if (!BUILD_DEFS.DISABLE_WEBGPU && (epName === 'webgpu' || epName === 'webnn')) { - // perform WebGPU availability check - if (typeof navigator === 'undefined' || !navigator.gpu) { - throw new Error('WebGPU is not supported in current environment'); - } - const powerPreference = env.webgpu?.powerPreference; - if (powerPreference !== undefined && powerPreference !== 'low-power' && powerPreference !== 'high-performance') { - throw new Error(`Invalid powerPreference setting: "${powerPreference}"`); - } - const forceFallbackAdapter = env.webgpu?.forceFallbackAdapter; - if (forceFallbackAdapter !== undefined && typeof forceFallbackAdapter !== 'boolean') { - throw new Error(`Invalid forceFallbackAdapter setting: "${forceFallbackAdapter}"`); - } - const adapter = await navigator.gpu.requestAdapter({powerPreference, forceFallbackAdapter}); - if (!adapter) { - throw new Error( - 'Failed to get GPU adapter. You may need to enable flag "--enable-unsafe-webgpu" if you are using Chrome.'); - } + if (!BUILD_DEFS.DISABLE_WEBGPU) { + // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires + const initJsep = require('./jsep/init').init; - if (!env.wasm.simd) { - throw new Error( - 'Not supported for WebGPU=ON and SIMD=OFF. 
Please set `env.wasm.simd` to true when using `webgpu` EP'); - } + if (epName === 'webgpu') { + // perform WebGPU availability check + if (typeof navigator === 'undefined' || !navigator.gpu) { + throw new Error('WebGPU is not supported in current environment'); + } + const powerPreference = env.webgpu?.powerPreference; + if (powerPreference !== undefined && powerPreference !== 'low-power' && powerPreference !== 'high-performance') { + throw new Error(`Invalid powerPreference setting: "${powerPreference}"`); + } + const forceFallbackAdapter = env.webgpu?.forceFallbackAdapter; + if (forceFallbackAdapter !== undefined && typeof forceFallbackAdapter !== 'boolean') { + throw new Error(`Invalid forceFallbackAdapter setting: "${forceFallbackAdapter}"`); + } + const adapter = await navigator.gpu.requestAdapter({powerPreference, forceFallbackAdapter}); + if (!adapter) { + throw new Error( + 'Failed to get GPU adapter. You may need to enable flag "--enable-unsafe-webgpu" if you are using Chrome.'); + } - // init JSEP if available + if (!env.wasm.simd) { + throw new Error( + 'Not supported for WebGPU=ON and SIMD=OFF. 
Please set `env.wasm.simd` to true when using `webgpu` EP'); + } - // eslint-disable-next-line @typescript-eslint/no-require-imports, @typescript-eslint/no-var-requires - const initJsep = require('./jsep/init').init; - await initJsep(getInstance(), env, adapter); + await initJsep('webgpu', getInstance(), env, adapter); + } + if (epName === 'webnn') { + // perform WebNN availability check + if (typeof navigator === 'undefined' || !(navigator as unknown as {ml: unknown}).ml) { + throw new Error('WebNN is not supported in current environment'); + } + + await initJsep('webnn', getInstance(), env); + } } }; @@ -380,7 +389,12 @@ export const prepareInputOutputTensor = const gpuBuffer = tensor[2].gpuBuffer as GPUBuffer; const elementSizeInBytes = getTensorElementSize(tensorDataTypeStringToEnum(dataType))!; dataByteLength = dims.reduce((a, b) => a * b, 1) * elementSizeInBytes; - rawData = wasm.jsepRegisterBuffer(sessionId, index, gpuBuffer, dataByteLength); + + const registerBuffer = wasm.jsepRegisterBuffer; + if (!registerBuffer) { + throw new Error('Tensor location "gpu-buffer" is not supported without using WebGPU.'); + } + rawData = registerBuffer(sessionId, index, gpuBuffer, dataByteLength); } else { const data = tensor[2]; @@ -595,7 +609,11 @@ export const run = async( // If a certain output's preferred location is GPU but the tensor is empty, we still need to create a CPU // tensor for it. There is no mapping GPU buffer for an empty tensor. 
if (preferredLocation === 'gpu-buffer' && size > 0) { - const gpuBuffer = wasm.jsepGetBuffer(dataOffset); + const getBuffer = wasm.jsepGetBuffer; + if (!getBuffer) { + throw new Error('preferredLocation "gpu-buffer" is not supported without using WebGPU.'); + } + const gpuBuffer = getBuffer(dataOffset); const elementSize = getTensorElementSize(dataType); if (elementSize === undefined || !isGpuBufferSupportedType(type)) { throw new Error(`Unsupported data type: ${type}`); @@ -607,7 +625,7 @@ export const run = async( output.push([ type, dims, { gpuBuffer, - download: wasm.jsepCreateDownloader(gpuBuffer, size * elementSize, type), + download: wasm.jsepCreateDownloader!(gpuBuffer, size * elementSize, type), dispose: () => { wasm._OrtReleaseTensor(tensor); } diff --git a/onnxruntime/wasm/js_internal_api.js b/onnxruntime/wasm/js_internal_api.js index cbc60c70b57aa..90d8b737252e5 100644 --- a/onnxruntime/wasm/js_internal_api.js +++ b/onnxruntime/wasm/js_internal_api.js @@ -4,39 +4,27 @@ 'use strict'; /** - * Mount external data files of a model to the virtual file system (MEMFS). + * Mount external data files of a model to an internal map, which will be used during session initialization. * * @param {string} externalDataFilesPath * @param {Uint8Array} externalDataFilesData */ Module['mountExternalData'] = (externalDataFilePath, externalDataFileData) => { const files = Module.MountedFiles || (Module.MountedFiles = new Map()); - files.set(externalDataFilePath, externalDataFileData); + files.set(externalDataFilePath, externalDataFileData); }; /** - * Unmount external data files of a model from the virtual file system (MEMFS). + * Unmount external data files of a model. */ Module['unmountExternalData'] = () => { delete Module.MountedFiles; }; /** - * init JSEP + * initialize JSEP for asyncify support. 
*/ -Module['jsepInit'] = (backend, alloc, free, copy, copyAsync, createKernel, releaseKernel, runKernel, captureBegin, captureEnd, replay) => { - Module.jsepBackend = backend; - Module.jsepAlloc = alloc; - Module.jsepFree = free; - Module.jsepCopy = copy; - Module.jsepCopyAsync = copyAsync; - Module.jsepCreateKernel = createKernel; - Module.jsepReleaseKernel = releaseKernel; - Module.jsepRunKernel = runKernel; - Module.jsepCaptureBegin = captureBegin; - Module.jsepCaptureEnd = captureEnd; - Module.jsepReplay = replay; - +let jsepInitAsync = () => { // This is a simplified version of cwrap() with options.async === true (-sASYNCIFY=1) // It removes some overhead in cwarp() and ccall() that we don't need. // @@ -143,7 +131,7 @@ Module['jsepInit'] = (backend, alloc, free, copy, copyAsync, createKernel, relea } // Flush the backend. This will submit all pending commands to the GPU. - backend['flush'](); + Module.jsepBackend?.['flush'](); // Await all pending promises. This includes GPU validation promises for diagnostic purposes. const errorPromises = state.errors; @@ -180,20 +168,46 @@ Module['jsepInit'] = (backend, alloc, free, copy, copyAsync, createKernel, relea () => Module['_OrtBindInput'], v => Module['_OrtBindInput'] = v); - // expose webgpu backend functions - Module['jsepRegisterBuffer'] = (sessionId, index, buffer, size) => { - return backend['registerBuffer'](sessionId, index, buffer, size); - }; - Module['jsepGetBuffer'] = (dataId) => { - return backend['getBuffer'](dataId); - }; - Module['jsepCreateDownloader'] = (gpuBuffer, size, type) => { - return backend['createDownloader'](gpuBuffer, size, type); - }; - Module['jsepOnReleaseSession'] = sessionId => { - backend['onReleaseSession'](sessionId); - }; - Module['jsepOnRunStart'] = sessionId => { - return backend['onRunStart'](sessionId); - }; + // remove this function to make sure it is called only once. + jsepInitAsync = undefined; +}; + + +/** + * initialize JSEP for WebGPU. 
+ */ +Module['jsepInit'] = (name, params) => { + jsepInitAsync?.(); + + if (name === 'webgpu') { + [Module.jsepBackend, + Module.jsepAlloc, + Module.jsepFree, + Module.jsepCopy, + Module.jsepCopyAsync, + Module.jsepCreateKernel, + Module.jsepReleaseKernel, + Module.jsepRunKernel, + Module.jsepCaptureBegin, + Module.jsepCaptureEnd, + Module.jsepReplay] = params; + + // expose webgpu backend functions + const backend = Module.jsepBackend; + Module['jsepRegisterBuffer'] = (sessionId, index, buffer, size) => { + return backend['registerBuffer'](sessionId, index, buffer, size); + }; + Module['jsepGetBuffer'] = (dataId) => { + return backend['getBuffer'](dataId); + }; + Module['jsepCreateDownloader'] = (gpuBuffer, size, type) => { + return backend['createDownloader'](gpuBuffer, size, type); + }; + Module['jsepOnReleaseSession'] = sessionId => { + backend['onReleaseSession'](sessionId); + }; + Module['jsepOnRunStart'] = sessionId => { + return backend['onRunStart'](sessionId); + }; + } }; From 7b46b3155891cbf6c783e2535e413b92ae81b050 Mon Sep 17 00:00:00 2001 From: enximi <70036307+enximi@users.noreply.github.com> Date: Sat, 16 Mar 2024 03:41:44 +0800 Subject: [PATCH 12/55] =?UTF-8?q?fix:=20"UserWarning:=20Unsupported=20Wind?= =?UTF-8?q?ows=20version=20(11).=20ONNX=20Runtime=20sup=E2=80=A6=20(#19845?= =?UTF-8?q?)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix: "UserWarning: Unsupported Windows version (11). ONNX Runtime supports Windows 10 and above, only." ### Description Include Windows 11 in the version check. Now, you will not see the warning “Unsupported Windows version (11). ONNX Runtime supports Windows 10 and above, only.” ### Motivation and Context Warning on Windows 11: Only supports systems above Windows 10, which is somewhat strange. 
--- onnxruntime/python/onnxruntime_validation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/onnxruntime_validation.py b/onnxruntime/python/onnxruntime_validation.py index 16cbc8e8099e1..10d9f469863c4 100644 --- a/onnxruntime/python/onnxruntime_validation.py +++ b/onnxruntime/python/onnxruntime_validation.py @@ -22,7 +22,7 @@ def check_distro_info(): __my_distro__ = __my_system__ __my_distro_ver__ = platform.release().lower() - if __my_distro_ver__ != "10": + if __my_distro_ver__ not in ["10", "11"]: warnings.warn( "Unsupported Windows version (%s). ONNX Runtime supports Windows 10 and above, only." % __my_distro_ver__ From d5c6a2cecf5f5fb9e41c8cf4176cbae0eafeeb22 Mon Sep 17 00:00:00 2001 From: Hector Li Date: Fri, 15 Mar 2024 17:02:01 -0700 Subject: [PATCH 13/55] Enable code in QNN UT to verify the fix for partition issue (#19939) ### Description Enable code in QNN UT to verify the fix for partition issue relate to QDQ model. https://github.com/microsoft/onnxruntime/pull/19723 --- onnxruntime/test/providers/qnn/qnn_ep_context_test.cc | 2 ++ 1 file changed, 2 insertions(+) diff --git a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc index eaef6f6315157..9eb75d297ef78 100644 --- a/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc +++ b/onnxruntime/test/providers/qnn/qnn_ep_context_test.cc @@ -123,6 +123,8 @@ void QnnContextBinaryMultiPartitionTestBody(bool single_ep_node = true) { for (auto& node : ctx_graph.Nodes()) { if (node.OpType() == "EPContext") { ++ep_context_node_count; + // validate the fix for the partition issue relate to QDQ model + ASSERT_EQ(node.InputDefs().size(), 1); } else { ++non_ep_context_node_count; } From acb0df228024c3bc824f6abbef7b14ac21258755 Mon Sep 17 00:00:00 2001 From: Belem Zhang Date: Sat, 16 Mar 2024 10:00:30 +0800 Subject: [PATCH 14/55] Fix #19931 broken Get Started link of "ONNX Runtime JavaScript API" page (#19932) ### 
Description Fix #19931 broken Get Started link HTTP 404 for "Get Started" link in "ONNX Runtime JavaScript API" page Co-authored-by: Yulong Wang <7679871+fs-eire@users.noreply.github.com> --- js/common/lib/index.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/js/common/lib/index.ts b/js/common/lib/index.ts index d7c98380f3fa4..18cc2aba03f63 100644 --- a/js/common/lib/index.ts +++ b/js/common/lib/index.ts @@ -11,7 +11,7 @@ * - [onnxruntime-react-native](https://www.npmjs.com/package/onnxruntime-react-native) * * See also: - * - [Get Started](https://onnxruntime.ai/docs/get-started/with-javascript.html) + * - [Get Started](https://onnxruntime.ai/docs/get-started/with-javascript/) * - [Inference examples](https://github.com/microsoft/onnxruntime-inference-examples/tree/main/js) * * @packageDocumentation From b29849a2877527683c0361834e6335698218d07f Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Fri, 15 Mar 2024 19:01:50 -0700 Subject: [PATCH 15/55] [js/common] fix typedoc warnings (#19933) ### Description Fix a few warnings in typedoc (for generating JS API): ``` [warning] The signature TrainingSession.loadParametersBuffer has an @param with name "buffer", which was not used. [warning] NonTensorType, defined in ./lib/onnx-value.ts, is referenced by OnnxValue but not included in the documentation. [warning] TensorFactory, defined in ./lib/tensor-factory.ts, is referenced by Tensor but not included in the documentation. [warning] ExternalDataFileType, defined in ./lib/onnx-model.ts, is referenced by InferenceSession.SessionOptions.externalData but not included in the documentation. [warning] TensorToDataUrlOptions, defined in ./lib/tensor-conversion.ts, is referenced by Tensor.toDataURL.toDataURL.options but not included in the documentation. 
[warning] TensorToImageDataOptions, defined in ./lib/tensor-conversion.ts, is referenced by Tensor.toImageData.toImageData.options but not included in the documentation. [warning] Failed to resolve link to "GpuBufferType" in comment for Env.WebGpuFlags.adapter. [warning] Failed to resolve link to "GpuBufferType" in comment for Env.WebGpuFlags.device. ``` Changes highlighted: - Merge `CoreMlExecutionProviderOption` and `CoreMLExecutionProviderOption`. They expose 2 set of different options for React-native and ORT nodejs binding. This should be fixed in future. - Fix a few inconsistency of names between JSDoc and parameters - Fix broken type links - Exclude trace functions --- js/common/lib/backend.ts | 6 +-- js/common/lib/env.ts | 4 +- js/common/lib/index.ts | 3 ++ js/common/lib/inference-session.ts | 43 +++++++++++++++---- js/common/lib/onnx-value.ts | 2 +- js/common/lib/tensor-factory.ts | 2 +- js/common/lib/tensor.ts | 4 +- js/common/lib/trace.ts | 9 ++++ js/common/lib/training-session.ts | 16 +++---- .../templates/linux-web-init-and-check.yml | 4 ++ 10 files changed, 68 insertions(+), 25 deletions(-) diff --git a/js/common/lib/backend.ts b/js/common/lib/backend.ts index 9bfcb12206057..8c07bdd5c5c4a 100644 --- a/js/common/lib/backend.ts +++ b/js/common/lib/backend.ts @@ -58,7 +58,7 @@ export interface TrainingSessionHandler extends SessionHandler { options: InferenceSession.RunOptions): Promise; getParametersSize(trainableOnly: boolean): Promise; - loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise; + loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise; getContiguousParameters(trainableOnly: boolean): Promise; } @@ -77,8 +77,8 @@ export interface Backend { Promise; createTrainingSessionHandler? 
- (checkpointStateUriOrBuffer: TrainingSession.URIorBuffer, trainModelUriOrBuffer: TrainingSession.URIorBuffer, - evalModelUriOrBuffer: TrainingSession.URIorBuffer, optimizerModelUriOrBuffer: TrainingSession.URIorBuffer, + (checkpointStateUriOrBuffer: TrainingSession.UriOrBuffer, trainModelUriOrBuffer: TrainingSession.UriOrBuffer, + evalModelUriOrBuffer: TrainingSession.UriOrBuffer, optimizerModelUriOrBuffer: TrainingSession.UriOrBuffer, options: InferenceSession.SessionOptions): Promise; } diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index dd8bde2b596f4..b139c719e863f 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -173,7 +173,7 @@ export declare namespace Env { * When use with TypeScript, the type of this property is `GPUAdapter` defined in "@webgpu/types". * Use `const adapter = env.webgpu.adapter as GPUAdapter;` in TypeScript to access this property with correct type. * - * see comments on {@link GpuBufferType} + * see comments on {@link Tensor.GpuBufferType} */ readonly adapter: unknown; /** @@ -184,7 +184,7 @@ export declare namespace Env { * When use with TypeScript, the type of this property is `GPUDevice` defined in "@webgpu/types". * Use `const device = env.webgpu.device as GPUDevice;` in TypeScript to access this property with correct type. * - * see comments on {@link GpuBufferType} for more details about why not use types defined in "@webgpu/types". + * see comments on {@link Tensor.GpuBufferType} for more details about why not use types defined in "@webgpu/types". 
*/ readonly device: unknown; /** diff --git a/js/common/lib/index.ts b/js/common/lib/index.ts index 18cc2aba03f63..3ed56b3c2e812 100644 --- a/js/common/lib/index.ts +++ b/js/common/lib/index.ts @@ -21,6 +21,9 @@ export * from './backend.js'; export * from './env.js'; export * from './inference-session.js'; export * from './tensor.js'; +export * from './tensor-conversion.js'; +export * from './tensor-factory.js'; export * from './trace.js'; +export * from './onnx-model.js'; export * from './onnx-value.js'; export * from './training-session.js'; diff --git a/js/common/lib/inference-session.ts b/js/common/lib/inference-session.ts index 4f85c3b46e253..4f7fbdcdcf0ca 100644 --- a/js/common/lib/inference-session.ts +++ b/js/common/lib/inference-session.ts @@ -186,22 +186,22 @@ export declare namespace InferenceSession { // #region execution providers // Currently, we have the following backends to support execution providers: - // Backend Node.js binding: supports 'cpu' and 'cuda'. + // Backend Node.js binding: supports 'cpu', 'dml' (win32), 'coreml' (macOS) and 'cuda' (linux). // Backend WebAssembly: supports 'cpu', 'wasm', 'webgpu' and 'webnn'. // Backend ONNX.js: supports 'webgl'. // Backend React Native: supports 'cpu', 'xnnpack', 'coreml' (iOS), 'nnapi' (Android). 
interface ExecutionProviderOptionMap { + coreml: CoreMLExecutionProviderOption; cpu: CpuExecutionProviderOption; - coreml: CoreMlExecutionProviderOption; cuda: CudaExecutionProviderOption; dml: DmlExecutionProviderOption; + nnapi: NnapiExecutionProviderOption; tensorrt: TensorRtExecutionProviderOption; wasm: WebAssemblyExecutionProviderOption; webgl: WebGLExecutionProviderOption; - xnnpack: XnnpackExecutionProviderOption; webgpu: WebGpuExecutionProviderOption; webnn: WebNNExecutionProviderOption; - nnapi: NnapiExecutionProviderOption; + xnnpack: XnnpackExecutionProviderOption; } type ExecutionProviderName = keyof ExecutionProviderOptionMap; @@ -219,10 +219,6 @@ export declare namespace InferenceSession { readonly name: 'cuda'; deviceId?: number; } - export interface CoreMlExecutionProviderOption extends ExecutionProviderOption { - readonly name: 'coreml'; - coreMlFlags?: number; - } export interface DmlExecutionProviderOption extends ExecutionProviderOption { readonly name: 'dml'; deviceId?: number; @@ -253,8 +249,39 @@ export declare namespace InferenceSession { } export interface CoreMLExecutionProviderOption extends ExecutionProviderOption { readonly name: 'coreml'; + /** + * The bit flags for CoreML execution provider. + * + * ``` + * COREML_FLAG_USE_CPU_ONLY = 0x001 + * COREML_FLAG_ENABLE_ON_SUBGRAPH = 0x002 + * COREML_FLAG_ONLY_ENABLE_DEVICE_WITH_ANE = 0x004 + * COREML_FLAG_ONLY_ALLOW_STATIC_INPUT_SHAPES = 0x008 + * COREML_FLAG_CREATE_MLPROGRAM = 0x010 + * ``` + * + * See include/onnxruntime/core/providers/coreml/coreml_provider_factory.h for more details. + * + * This flag is available only in ONNXRuntime (Node.js binding). + */ + coreMlFlags?: number; + /** + * Specify whether to use CPU only in CoreML EP. + * + * This setting is available only in ONNXRuntime (react-native). + */ useCPUOnly?: boolean; + /** + * Specify whether to enable CoreML EP on subgraph. + * + * This setting is available only in ONNXRuntime (react-native). 
+ */ enableOnSubgraph?: boolean; + /** + * Specify whether to only enable CoreML EP for Apple devices with ANE (Apple Neural Engine). + * + * This setting is available only in ONNXRuntime (react-native). + */ onlyEnableDeviceWithANE?: boolean; } export interface NnapiExecutionProviderOption extends ExecutionProviderOption { diff --git a/js/common/lib/onnx-value.ts b/js/common/lib/onnx-value.ts index a16a30d25d839..72369ce8b4209 100644 --- a/js/common/lib/onnx-value.ts +++ b/js/common/lib/onnx-value.ts @@ -3,7 +3,7 @@ import {Tensor} from './tensor.js'; -type NonTensorType = never; +export type NonTensorType = never; /** * Type OnnxValue Represents both tensors and non-tensors value for model's inputs/outputs. diff --git a/js/common/lib/tensor-factory.ts b/js/common/lib/tensor-factory.ts index 6e19d7fb898a3..431de4c3635c2 100644 --- a/js/common/lib/tensor-factory.ts +++ b/js/common/lib/tensor-factory.ts @@ -253,7 +253,7 @@ export interface TensorFactory { /** * create a tensor from an ImageBitmap object * - * @param bitMap - the ImageBitmap object to create tensor from + * @param bitmap - the ImageBitmap object to create tensor from * @param options - An optional object representing options for creating tensor from URL. * * The following default settings will be applied: diff --git a/js/common/lib/tensor.ts b/js/common/lib/tensor.ts index d5da33640dc7d..20319ebb800c2 100644 --- a/js/common/lib/tensor.ts +++ b/js/common/lib/tensor.ts @@ -160,7 +160,7 @@ export interface Tensor extends TypedTensorBase, TypedTensorUtils { if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; @@ -29,6 +32,9 @@ const TRACE_FUNC = (msg: string, extraMsg?: string) => { } }; +/** + * @ignore + */ export const TRACE_FUNC_BEGIN = (extraMsg?: string) => { if (typeof env.trace === 'undefined' ? 
!env.wasm.trace : !env.trace) { return; @@ -36,6 +42,9 @@ export const TRACE_FUNC_BEGIN = (extraMsg?: string) => { TRACE_FUNC('BEGIN', extraMsg); }; +/** + * @ignore + */ export const TRACE_FUNC_END = (extraMsg?: string) => { if (typeof env.trace === 'undefined' ? !env.wasm.trace : !env.trace) { return; diff --git a/js/common/lib/training-session.ts b/js/common/lib/training-session.ts index e54aed90e702c..f9de77e3ac7d0 100644 --- a/js/common/lib/training-session.ts +++ b/js/common/lib/training-session.ts @@ -11,7 +11,7 @@ export declare namespace TrainingSession { /** * Either URI file path (string) or Uint8Array containing model or checkpoint information. */ - type URIorBuffer = string|Uint8Array; + type UriOrBuffer = string|Uint8Array; } /** @@ -98,13 +98,13 @@ export interface TrainingSession { getParametersSize(trainableOnly: boolean): Promise; /** - * Copies parameter values from the given array to the training state. Currently, only supporting models with + * Copies parameter values from the given buffer to the training state. Currently, only supporting models with * parameters of type Float32. * - * @param buffer - Float32 buffer containing parameters converted to a Uint8Array. + * @param buffer - A Uint8Array representation of Float32 parameters. * @param trainableOnly - True if trainable parameters only to be modified, false otherwise. Default value is true. */ - loadParametersBuffer(array: Uint8Array, trainableOnly: boolean): Promise; + loadParametersBuffer(buffer: Uint8Array, trainableOnly: boolean): Promise; /** * Copies the model parameters to a contiguous buffer. Usually used in the context of Federated Learning. @@ -157,19 +157,19 @@ export interface TrainingSessionCreateOptions { /** * URI or buffer for a .ckpt file that contains the checkpoint for the training model. */ - checkpointState: TrainingSession.URIorBuffer; + checkpointState: TrainingSession.UriOrBuffer; /** * URI or buffer for the .onnx training file. 
*/ - trainModel: TrainingSession.URIorBuffer; + trainModel: TrainingSession.UriOrBuffer; /** * Optional. URI or buffer for the .onnx optimizer model file. */ - optimizerModel?: TrainingSession.URIorBuffer; + optimizerModel?: TrainingSession.UriOrBuffer; /** * Optional. URI or buffer for the .onnx eval model file. */ - evalModel?: TrainingSession.URIorBuffer; + evalModel?: TrainingSession.UriOrBuffer; } /** diff --git a/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml b/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml index e788e4b3dddaa..a4d5a73118ea2 100644 --- a/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml +++ b/tools/ci_build/github/azure-pipelines/templates/linux-web-init-and-check.yml @@ -31,6 +31,10 @@ steps: node -e "a=require('child_process').execSync('git diff --name-only').toString();if(a)throw new Error('Following source files are not formatted: (did you run \"npm run format\"?)\n'+a)" workingDirectory: '$(Build.SourcesDirectory)/js' displayName: 'Check unformatted files' +- script: | + npx typedoc --emit none --treatWarningsAsErrors + workingDirectory: '$(Build.SourcesDirectory)/js/common' + displayName: 'TypeDoc Validation' - script: | npm run build:doc workingDirectory: '$(Build.SourcesDirectory)/js/web' From 1eb67a07caca2fa9561af03ac47f23f5cc0cdd41 Mon Sep 17 00:00:00 2001 From: wangshuai09 <391746016@qq.com> Date: Sat, 16 Mar 2024 11:28:43 +0800 Subject: [PATCH 16/55] Add cann_dependencies (#19929) ### Description Add `cann_dependencies` ### Motivation and Context The previous [PR](https://github.com/microsoft/onnxruntime/pull/17365) avoided using patchelf but lost `cann_dependencies`. This PR adds `cann_dependencies` to avoid requiring the CANN libraries when repairing the wheel.
--- setup.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index ac7a70b991fbf..ffe2958b357b8 100644 --- a/setup.py +++ b/setup.py @@ -232,6 +232,8 @@ def run(self): tensorrt_dependencies = ["libnvinfer.so.8", "libnvinfer_plugin.so.8", "libnvonnxparser.so.8"] + cann_dependencies = ["libascendcl.so", "libacl_op_compiler.so", "libfmk_onnx_parser.so"] + dest = "onnxruntime/capi/libonnxruntime_providers_openvino.so" if path.isfile(dest): subprocess.run( @@ -255,7 +257,7 @@ def run(self): file = glob(path.join(self.dist_dir, "*linux*.whl"))[0] logger.info("repairing %s for manylinux1", file) auditwheel_cmd = ["auditwheel", "-v", "repair", "-w", self.dist_dir, file] - for i in cuda_dependencies + rocm_dependencies + tensorrt_dependencies: + for i in cuda_dependencies + rocm_dependencies + tensorrt_dependencies + cann_dependencies: auditwheel_cmd += ["--exclude", i] logger.info("Running %s", " ".join([shlex.quote(arg) for arg in auditwheel_cmd])) try: From afdab62f53db83a8c248b4088d426cbedfe8eab1 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 16 Mar 2024 18:53:17 -0700 Subject: [PATCH 17/55] Bump follow-redirects from 1.15.4 to 1.15.6 in /js/web (#19949) Bumps [follow-redirects](https://github.com/follow-redirects/follow-redirects) from 1.15.4 to 1.15.6.
Commits
  • 35a517c Release version 1.15.6 of the npm package.
  • c4f847f Drop Proxy-Authorization across hosts.
  • 8526b4a Use GitHub for disclosure.
  • b1677ce Release version 1.15.5 of the npm package.
  • d8914f7 Preserve fragment in responseUrl.
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=follow-redirects&package-manager=npm_and_yarn&previous-version=1.15.4&new-version=1.15.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/microsoft/onnxruntime/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/web/package-lock.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/js/web/package-lock.json b/js/web/package-lock.json index 41c44aaa2679b..5c9113459ff06 100644 --- a/js/web/package-lock.json +++ b/js/web/package-lock.json @@ -52,7 +52,7 @@ "version": "1.18.0", "license": "MIT", "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "node_modules/@chiragrupani/karma-chromium-edge-launcher": { @@ -1351,9 +1351,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -4595,9 +4595,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "from": { @@ -5503,7 +5503,7 @@ "onnxruntime-common": { "version": "file:../common", "requires": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "p-cancelable": { From 4e55242a3031fb86d3366d77e1b753a2c71b1880 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 16 Mar 2024 
18:54:06 -0700 Subject: [PATCH 18/55] Bump follow-redirects from 1.15.4 to 1.15.6 in /onnxruntime/test/wasm (#19950) Bumps [follow-redirects](https://github.com/follow-redirects/follow-redirects) from 1.15.4 to 1.15.6.
Commits
  • 35a517c Release version 1.15.6 of the npm package.
  • c4f847f Drop Proxy-Authorization across hosts.
  • 8526b4a Use GitHub for disclosure.
  • b1677ce Release version 1.15.5 of the npm package.
  • d8914f7 Preserve fragment in responseUrl.
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=follow-redirects&package-manager=npm_and_yarn&previous-version=1.15.4&new-version=1.15.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/microsoft/onnxruntime/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- onnxruntime/test/wasm/package-lock.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/onnxruntime/test/wasm/package-lock.json b/onnxruntime/test/wasm/package-lock.json index bfa000fda440a..1beaf3b83ca28 100644 --- a/onnxruntime/test/wasm/package-lock.json +++ b/onnxruntime/test/wasm/package-lock.json @@ -520,9 +520,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -1972,9 +1972,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "fs-extra": { From 28ad6c3955ca8bbcc7ce6ec07d47865e848b8f20 Mon Sep 17 00:00:00 2001 From: "dependabot[bot]" <49699333+dependabot[bot]@users.noreply.github.com> Date: Sat, 16 Mar 2024 18:54:53 -0700 Subject: [PATCH 19/55] Bump follow-redirects from 1.15.4 to 1.15.6 in /js/node (#19951) Bumps [follow-redirects](https://github.com/follow-redirects/follow-redirects) from 1.15.4 to 1.15.6.
Commits
  • 35a517c Release version 1.15.6 of the npm package.
  • c4f847f Drop Proxy-Authorization across hosts.
  • 8526b4a Use GitHub for disclosure.
  • b1677ce Release version 1.15.5 of the npm package.
  • d8914f7 Preserve fragment in responseUrl.
  • See full diff in compare view

[![Dependabot compatibility score](https://dependabot-badges.githubapp.com/badges/compatibility_score?dependency-name=follow-redirects&package-manager=npm_and_yarn&previous-version=1.15.4&new-version=1.15.6)](https://docs.github.com/en/github/managing-security-vulnerabilities/about-dependabot-security-updates#about-compatibility-scores) Dependabot will resolve any conflicts with this PR as long as you don't alter it yourself. You can also trigger a rebase manually by commenting `@dependabot rebase`. [//]: # (dependabot-automerge-start) [//]: # (dependabot-automerge-end) ---
Dependabot commands and options
You can trigger Dependabot actions by commenting on this PR: - `@dependabot rebase` will rebase this PR - `@dependabot recreate` will recreate this PR, overwriting any edits that have been made to it - `@dependabot merge` will merge this PR after your CI passes on it - `@dependabot squash and merge` will squash and merge this PR after your CI passes on it - `@dependabot cancel merge` will cancel a previously requested merge and block automerging - `@dependabot reopen` will reopen this PR if it is closed - `@dependabot close` will close this PR and stop Dependabot recreating it. You can achieve the same result by closing it manually - `@dependabot show ignore conditions` will show all of the ignore conditions of the specified dependency - `@dependabot ignore this major version` will close this PR and stop Dependabot creating any more for this major version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this minor version` will close this PR and stop Dependabot creating any more for this minor version (unless you reopen the PR or upgrade to it yourself) - `@dependabot ignore this dependency` will close this PR and stop Dependabot creating any more for this dependency (unless you reopen the PR or upgrade to it yourself) You can disable automated security fix PRs for this repo from the [Security Alerts page](https://github.com/microsoft/onnxruntime/network/alerts).
Signed-off-by: dependabot[bot] Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com> --- js/node/package-lock.json | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/js/node/package-lock.json b/js/node/package-lock.json index 2d7c39c86097f..62b47698a1438 100644 --- a/js/node/package-lock.json +++ b/js/node/package-lock.json @@ -30,7 +30,7 @@ "version": "1.18.0", "license": "MIT", "devDependencies": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "node_modules/@protobufjs/aspromise": { @@ -336,9 +336,9 @@ "dev": true }, "node_modules/follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true, "funding": [ { @@ -1242,9 +1242,9 @@ "dev": true }, "follow-redirects": { - "version": "1.15.4", - "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.4.tgz", - "integrity": "sha512-Cr4D/5wlrb0z9dgERpUL3LrmPKVDsETIJhaCMeDfuFYcqa5bldGV6wBsAN6X/vxlXQtFBMrXdXxdL8CbDTGniw==", + "version": "1.15.6", + "resolved": "https://registry.npmjs.org/follow-redirects/-/follow-redirects-1.15.6.tgz", + "integrity": "sha512-wWN62YITEaOpSK584EZXJafH1AGpO8RVgElfkuXbTOrPX4fIfOyEpW/CsiNd8JdYrAoOvafRTOEnvsO++qCqFA==", "dev": true }, "form-data": { @@ -1503,7 +1503,7 @@ "onnxruntime-common": { "version": "file:../common", "requires": { - "typedoc": "^0.23.22" + "typedoc": "^0.25.7" } }, "parse-json": { From 7e0d4249343054c59410cd7cca76adb3456de0c1 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Mon, 18 Mar 2024 08:28:43 -0700 Subject: [PATCH 20/55] accumulate in fp32 for 
Reduce* (#19868) --- js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts b/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts index a9b28d7c034f3..210b3ee7e2fca 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/reduce-shared.ts @@ -131,7 +131,7 @@ export const createReduceSharedProgramInfo = const workgroupSize = 32; const sharedMemorySnippet = ` - var aBestValues : array<${output.type.storage}, ${workgroupSize}>; + var aBestValues : array; `; const getShaderSource = (shaderHelper: ShaderHelper) => ` @@ -145,10 +145,10 @@ export const createReduceSharedProgramInfo = let outputIndex = global_idx / ${workgroupSize}; let offset = outputIndex * uniforms.reduceSize; - var bestValue = ${output.type.storage}(${reduceInitValues[reduceType]}); + var bestValue = f32(${reduceInitValues[reduceType]}); let Length = uniforms.reduceSize; for (var k = local_idx; k < Length; k = k + ${workgroupSize}) { - let candidate = ${output.type.storage}(${input.getByOffset('offset + k')}); + let candidate = f32(${input.getByOffset('offset + k')}); bestValue = ${reduceOps[reduceType]}; } aBestValues[local_idx] = bestValue; @@ -172,8 +172,8 @@ export const createReduceSharedProgramInfo = output.setByOffset( 'outputIndex', `${ - reduceType === 'mean' ? `bestValue / ${output.type.storage}(uniforms.reduceSize)` : - `${reduceOutputValues[reduceType]}`}`)}; + reduceType === 'mean' ? `${output.type.storage}(bestValue / f32(uniforms.reduceSize))` : + `${output.type.storage}(${reduceOutputValues[reduceType]})`}`)}; } }`; From 4d31076d687560f7cfdada19f6ba9ad5a86612f2 Mon Sep 17 00:00:00 2001 From: Edward Chen <18449977+edgchen1@users.noreply.github.com> Date: Mon, 18 Mar 2024 08:54:24 -0700 Subject: [PATCH 21/55] [objc] Add check for ORTValue being a tensor in ORTValue methods that should only be used with tensors. 
(#19946) Add check to report error instead of crashing. --- objectivec/ort_value.mm | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/objectivec/ort_value.mm b/objectivec/ort_value.mm index b9dc1a9885c61..c61a7ea809237 100644 --- a/objectivec/ort_value.mm +++ b/objectivec/ort_value.mm @@ -148,6 +148,9 @@ - (nullable ORTValueTypeInfo*)typeInfoWithError:(NSError**)error { - (nullable ORTTensorTypeAndShapeInfo*)tensorTypeAndShapeInfoWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (!tensorTypeAndShapeInfo) { + ORT_CXX_API_THROW("ORTValue is not a tensor.", ORT_RUNTIME_EXCEPTION); + } return CXXAPIToPublicTensorTypeAndShapeInfo(tensorTypeAndShapeInfo); } ORT_OBJC_API_IMPL_CATCH_RETURNING_NULLABLE(error) @@ -156,6 +159,9 @@ - (nullable ORTTensorTypeAndShapeInfo*)tensorTypeAndShapeInfoWithError:(NSError* - (nullable NSMutableData*)tensorDataWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (!tensorTypeAndShapeInfo) { + ORT_CXX_API_THROW("ORTValue is not a tensor.", ORT_RUNTIME_EXCEPTION); + } if (tensorTypeAndShapeInfo.GetElementType() == ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING) { ORT_CXX_API_THROW( "This ORTValue holds string data. 
Please call tensorStringDataWithError: " @@ -182,6 +188,9 @@ - (nullable NSMutableData*)tensorDataWithError:(NSError**)error { - (nullable NSArray*)tensorStringDataWithError:(NSError**)error { try { const auto tensorTypeAndShapeInfo = _typeInfo->GetTensorTypeAndShapeInfo(); + if (!tensorTypeAndShapeInfo) { + ORT_CXX_API_THROW("ORTValue is not a tensor.", ORT_RUNTIME_EXCEPTION); + } const size_t elementCount = tensorTypeAndShapeInfo.GetElementCount(); const size_t tensorStringDataLength = _value->GetStringTensorDataLength(); std::vector tensorStringData(tensorStringDataLength, '\0'); From a033df8c31311b6710570a3b7103dd8c2f9f9a64 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Mon, 18 Mar 2024 10:28:39 -0700 Subject: [PATCH 22/55] Implement CustomOp Output Type Inference function (#19906) ### Description This change addresses the following issues with the current CustomOP Output Type inference - The function does not take into account optional inputs. When input is absent the inference is silently aborted, and no output type is inferred (P1 customer issue) - Inferring output type based on the input type for multi-kernel custom ops is done based on the latest in sequence kernel definition. There is not an attempt made to match the kernel based on the input type. - Inference is aborted when variadic inputs/outputs are detected when the generated input/output names fail to obtain type constraints. This is not immediately clear from the code, because custom op schema is not available within the inference function. - No error reporting. ### Motivation and Context Most of CustomOPs lack their own type and shape inference function as it was recently introduced. For that reason, it is important to fix this. This change is inspired by a customer issue. 
This is a follow up on: - https://github.com/microsoft/onnxruntime/pull/15184 - https://github.com/cbourjau/ort-custom-op/pull/11 - https://github.com/microsoft/onnxruntime-extensions/issues/451 --- .../core/session/onnxruntime_c_api.h | 18 +- .../core/session/onnxruntime_cxx_api.h | 4 + onnxruntime/core/session/custom_ops.cc | 157 ++++++++++++------ .../test/framework/shape_inference_test.cc | 93 ++++++++++- onnxruntime/test/shared_lib/test_inference.cc | 7 +- 5 files changed, 224 insertions(+), 55 deletions(-) diff --git a/include/onnxruntime/core/session/onnxruntime_c_api.h b/include/onnxruntime/core/session/onnxruntime_c_api.h index cef50163f68b0..41b034e9c1dcc 100644 --- a/include/onnxruntime/core/session/onnxruntime_c_api.h +++ b/include/onnxruntime/core/session/onnxruntime_c_api.h @@ -1837,14 +1837,28 @@ struct OrtApi { /** \brief Used for custom operators, get an input of a kernel * - * \see ::OrtCustomOp + * The function attempts to fetch the input of the kernel. If the input is optional + * and not present, the function returns success and out is set to nullptr. + * + * \param[in] context ::OrtKernelContext instance + * \param[in] input index. See KernelContext_GetInputCount for boundaries check. + * \param[in, out] returns a ptr to OrtValue if the input is present + * + * \snippet{doc} snippets.dox OrtStatus Return Value */ ORT_API2_STATUS(KernelContext_GetInput, _In_ const OrtKernelContext* context, _In_ size_t index, _Out_ const OrtValue** out); /** \brief Used for custom operators, get an output of a kernel * - * \see ::OrtCustomOp + * The function attempts to fetch the output of the kernel. If the output is optional + * and not present, the function returns success and out is set to nullptr. + * + * \param[in] context ::OrtKernelContext instance + * \param[in] output index. See KernelContext_GetOutputCount for boundaries check.
+ * \param[in, out] returns a ptr to OrtValue if the output is present + * + * \snippet{doc} snippets.dox OrtStatus Return Value */ ORT_API2_STATUS(KernelContext_GetOutput, _Inout_ OrtKernelContext* context, _In_ size_t index, _In_ const int64_t* dim_values, size_t dim_count, _Outptr_ OrtValue** out); diff --git a/include/onnxruntime/core/session/onnxruntime_cxx_api.h b/include/onnxruntime/core/session/onnxruntime_cxx_api.h index ae4c4bef90c64..60540514fbfa6 100644 --- a/include/onnxruntime/core/session/onnxruntime_cxx_api.h +++ b/include/onnxruntime/core/session/onnxruntime_cxx_api.h @@ -2055,7 +2055,11 @@ struct KernelContext { explicit KernelContext(OrtKernelContext* context); size_t GetInputCount() const; size_t GetOutputCount() const; + // If input is optional and is not present, the method returns en empty ConstValue + // which can be compared to nullptr. ConstValue GetInput(size_t index) const; + // If outout is optional and is not present, the method returns en empty UnownedValue + // which can be compared to nullptr. UnownedValue GetOutput(size_t index, const int64_t* dim_values, size_t dim_count) const; UnownedValue GetOutput(size_t index, const std::vector& dims) const; void* GetGPUComputeStream() const; diff --git a/onnxruntime/core/session/custom_ops.cc b/onnxruntime/core/session/custom_ops.cc index 6e9d68d259a5d..513aafcdadb7d 100644 --- a/onnxruntime/core/session/custom_ops.cc +++ b/onnxruntime/core/session/custom_ops.cc @@ -1066,59 +1066,120 @@ Status IsCompatible(const ONNX_NAMESPACE::OpSchema& schema, const OrtCustomOp* o return Status::OK(); } -void InferOutputTypes(const InlinedVector& kernel_defs, - ONNX_NAMESPACE::InferenceContext& infer_ctx) { - for (const auto& kernel_def : kernel_defs) { +// This function attempts to do its best for older custom ops (most of them) who do not have +// they own type and shape inference function. 
However, it falls short in some cases, and we leave +// those for the user to handle in their own inference function. +static void InferOutputTypes(const ONNX_NAMESPACE::OpSchema& schema, gsl::span kernel_defs, + ONNX_NAMESPACE::InferenceContext& infer_ctx) { + const auto& inputs = schema.inputs(); + const auto node_input_num = infer_ctx.getNumInputs(); + + const KernelDef* def_selected = nullptr; + bool is_variadic_input = false; + bool is_homogeneous_input = false; + int32_t output_propagate{0}; + + for (size_t kernel_index = 0; + kernel_index < kernel_defs.size() && def_selected == nullptr; + ++kernel_index) { + const auto* kernel_def = kernel_defs[kernel_index]; const auto& type_constraints = kernel_def->TypeConstraints(); - auto num_inputs = infer_ctx.getNumInputs(); - bool matched = true; - ONNXTensorElementDataType undef = ONNXTensorElementDataType::ONNX_TENSOR_ELEMENT_DATA_TYPE_UNDEFINED; - // first, make sure there is a constraint for every input - for (size_t i = 0; i < num_inputs && matched; ++i) { - auto input_name = "Input" + std::to_string(i); - auto input_type = infer_ctx.getInputType(i); - if (input_type) { - auto elem_type = static_cast(input_type->tensor_type().elem_type()); - auto tc_iter = type_constraints.find(input_name); - if (tc_iter != type_constraints.end()) { - if (tc_iter->second.size() > 1) { - undef = elem_type; - } else if (tc_iter->second.size() != 1 || - tc_iter->second[0] != DataTypeImpl::TensorTypeFromONNXEnum(elem_type)) { - matched = false; + def_selected = kernel_def; + + for (size_t i = 0; i < node_input_num; ++i) { + const auto input_type = infer_ctx.getInputType(i); + + // Guard against variadic parameter index + const size_t schema_input_index = (i < inputs.size()) ? 
i : inputs.size() - 1; + const auto& param = inputs[schema_input_index]; + const auto& input_name = param.GetName(); + if (input_type == nullptr) { + if (param.GetOption() == ONNX_NAMESPACE::OpSchema::FormalParameterOption::Optional) + continue; + + ORT_THROW("[CustomOP type inferencing error]: kernel Input: ", input_name, + " is absent, but not optional. Op : ", schema.Name()); + } + + is_variadic_input = (param.GetOption() == ONNX_NAMESPACE::OpSchema::FormalParameterOption::Variadic); + is_homogeneous_input = param.GetIsHomogeneous(); + + if (!is_variadic_input || is_homogeneous_input) { + auto hit = type_constraints.find(input_name); + if (hit != type_constraints.end()) { + const auto& types = hit->second; + // For custom ops kernel constraints are never empty + assert(!types.empty()); + if (!std::any_of(types.cbegin(), types.cend(), + [input_type](const DataTypeImpl* type) { + return type->IsCompatible(*input_type); + })) { + def_selected = nullptr; + output_propagate = 0; + break; + } + + // If we have multiple types possible from the constraints, + // record the last type and use it to guess the output type if + // output may have different types. 
Works well for symmetric single input/outputs + // otherwise give up and let the user supply their own function + if (types.size() > 1) { + output_propagate = input_type->tensor_type().elem_type(); } } else { - matched = false; + ORT_THROW("[CustomOP type inferencing error]: no type constraint found for input: ", + input_name, " Op: ", schema.Name()); } - } else { - matched = false; - } - } // for - // next, ensure that there is a constraint for every output - auto num_outputs = infer_ctx.getNumOutputs(); - for (size_t i = 0; i < num_outputs && matched; i++) { - auto output_name = "Output" + std::to_string(i); - auto tc_iter = type_constraints.find(output_name); - if (tc_iter == type_constraints.end() || tc_iter->second.size() < 1) { - matched = false; } } - if (matched) { - for (size_t i = 0; i < num_outputs; i++) { - auto output_name = "Output" + std::to_string(i); - auto output_type = infer_ctx.getOutputType(i); - auto tc_iter = type_constraints.find(output_name); - if (tc_iter->second.size() > 1) { - output_type->mutable_tensor_type()->set_elem_type(undef); - } else { - output_type->mutable_tensor_type()->set_elem_type( - tc_iter->second[0]->GetTypeProto()->tensor_type().elem_type()); - } - } + } + + if (def_selected == nullptr) { + ORT_THROW("[CustomOP type inferencing error]: no kernel def matches node inputs for Op: ", schema.Name()); + } + + const auto& outputs = schema.outputs(); + const auto node_output_num = infer_ctx.getNumOutputs(); + const auto& selected_type_constraints = def_selected->TypeConstraints(); + + for (size_t i = 0; i < node_output_num; ++i) { + auto output_type = infer_ctx.getOutputType(i); + // Account for variadic outputs + const size_t schema_output_index = (i < outputs.size()) ? 
i : outputs.size() - 1; + const auto& param = outputs[schema_output_index]; + const auto& output_name = param.GetName(); + + const bool is_variadic_output = (param.GetOption() == ONNX_NAMESPACE::OpSchema::FormalParameterOption::Variadic); + const bool is_homogeneous = param.GetIsHomogeneous(); + + // We give up on variadic non-homogeneous outputs + // Let the user handle it in their inference function + if (is_variadic_output && !is_homogeneous) { break; } + + auto hit = selected_type_constraints.find(output_name); + if (hit != selected_type_constraints.end()) { + const auto& types = hit->second; + assert(!types.empty()); + + if (types.size() == 1) { + // Use the constraint type + output_type->mutable_tensor_type()->set_elem_type( + types[0]->GetTypeProto()->tensor_type().elem_type()); + } else if (!is_variadic_input || is_homogeneous_input) { + // If not variadic or homogeneous, and there are multiple types possible, guess from the last input type + // as this works for symmetric varied single input/outputs + // otherwise give up and let the user supply their own function + output_type->mutable_tensor_type()->set_elem_type(output_propagate); + } + } else { + ORT_THROW("[CustomOP type inferencing error]: no type constraint found for output: ", + output_name, " Op: ", schema.Name()); + } } } + #endif common::Status CreateCustomRegistry(gsl::span op_domains, @@ -1178,13 +1239,13 @@ common::Status CreateCustomRegistry(gsl::span op_domai } std::vector schemas; - for (auto schema_iter : schema_map) { - schemas.push_back(schema_iter.second); - InlinedVector kernel_defs = std::move(kernel_def_map[schema_iter.first]); + for (auto& [name, schema] : schema_map) { + schemas.push_back(schema); auto infer_fn = schemas.back().GetTypeAndShapeInferenceFunction(); ONNX_NAMESPACE::InferenceFunction extended_infer_fn = - [infer_fn, kernel_defs](ONNX_NAMESPACE::InferenceContext& infer_ctx) { - InferOutputTypes(kernel_defs, infer_ctx); + [sch = schema, infer_fn = std::move(infer_fn), + 
kernel_defs = std::move(kernel_def_map[name])](ONNX_NAMESPACE::InferenceContext& infer_ctx) { + InferOutputTypes(sch, kernel_defs, infer_ctx); if (infer_fn) { infer_fn(infer_ctx); } diff --git a/onnxruntime/test/framework/shape_inference_test.cc b/onnxruntime/test/framework/shape_inference_test.cc index bfabcd567803b..f5258760eb20d 100644 --- a/onnxruntime/test/framework/shape_inference_test.cc +++ b/onnxruntime/test/framework/shape_inference_test.cc @@ -5,13 +5,16 @@ #include #include "gtest/gtest.h" +#include "core/common/span_utils.h" #include "core/graph/model.h" +#include "core/session/onnxruntime_cxx_api.h" #include "test/framework/model_builder_utils.h" +#include "test/util/include/asserts.h" #include "test/util/include/test_utils.h" +#include "test/util/include/inference_session_wrapper.h" #include "test/test_environment.h" using namespace ONNX_NAMESPACE; -using namespace std; namespace onnxruntime { namespace test { @@ -22,7 +25,7 @@ class ShapeInferenceTest : public ::testing::Test { protected: onnxruntime::Model model_; int node_count_; - std::unordered_map> name_to_arg_; + std::unordered_map> name_to_arg_; public: ShapeInferenceTest() : model_("Test", false, DefaultLoggingManager().DefaultLogger()), node_count_(0) {} @@ -73,5 +76,91 @@ TEST_F(ShapeInferenceTest, BasicTest) { CheckShapeEquality(InputShape(node), OutputShape(node)); } +namespace { +struct MyCustomKernelWithOptionalInput { + MyCustomKernelWithOptionalInput(const OrtKernelInfo* /*info*/) { + } + + OrtStatusPtr ComputeV2(OrtKernelContext* /* context */) const { + return nullptr; + } +}; + +struct MyCustomOpWithOptionalInput : Ort::CustomOpBase { + explicit MyCustomOpWithOptionalInput(const char* provider) : provider_(provider) {} + + OrtStatusPtr CreateKernelV2(const OrtApi& /* api */, const OrtKernelInfo* info, void** kernel) const { + *kernel = new MyCustomKernelWithOptionalInput(info); + return nullptr; + }; + + const char* GetName() const { return "FooBar"; }; + const char* 
GetExecutionProviderType() const { return provider_; }; + + size_t GetInputTypeCount() const { return 3; }; + ONNXTensorElementDataType GetInputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; + OrtCustomOpInputOutputCharacteristic GetInputCharacteristic(size_t index) const { + // The second input (index == 1) is optional + if (index == 1) + return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_OPTIONAL; + + return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED; + } + + size_t GetOutputTypeCount() const { return 1; }; + ONNXTensorElementDataType GetOutputType(size_t /*index*/) const { return ONNX_TENSOR_ELEMENT_DATA_TYPE_FLOAT; }; + OrtCustomOpInputOutputCharacteristic GetOutputCharacteristic(size_t /*index*/) const { + return OrtCustomOpInputOutputCharacteristic::INPUT_OUTPUT_REQUIRED; + } + + private: + const char* provider_; +}; + +const ORTCHAR_T* const OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2 = ORT_TSTR("testdata/foo_bar_2.onnx"); + +} // namespace + +// CustomOps Output type inference function quits if it +// encounters the an output that is optional and absent. +// It quits without any errors or logging. 
We want to make sure +// that inference proceeds for all of the outputs when absent optional inputs are present +TEST(ShapeInferenceCustomOpTest, custom_op_optional_input_inference_test) { + MyCustomOpWithOptionalInput custom_op{onnxruntime::kCpuExecutionProvider}; + + const auto& env = GetEnvironment(); + + Ort::CustomOpDomain op_domain("test"); + op_domain.Add(&custom_op); + + std::initializer_list op_domains = {static_cast(op_domain)}; + + SessionOptions sess_opts; + sess_opts.inter_op_param.thread_pool_size = 1; + sess_opts.intra_op_param.thread_pool_size = 1; + + InferenceSessionWrapper session{sess_opts, env, OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2}; + ASSERT_STATUS_OK(session.AddCustomOpDomains(AsSpan(op_domains))); + + ASSERT_STATUS_OK(session.Load()); + ASSERT_STATUS_OK(session.Initialize()); + + const onnxruntime::Model& model = session.GetModel(); + const auto& graph = model.MainGraph(); + const auto& nodes = graph.Nodes(); + for (const auto& node : nodes) { + if (node.OpType() == "FooBar") { + // check inferred shapes + const auto* node_arg = node.OutputDefs()[0]; + const auto* type_proto = node_arg->TypeAsProto(); + ASSERT_NE(nullptr, type_proto); + ASSERT_EQ(ONNX_NAMESPACE::TypeProto::ValueCase::kTensorType, type_proto->value_case()); + ASSERT_EQ(ONNX_NAMESPACE::TensorProto_DataType_FLOAT, type_proto->tensor_type().elem_type()); + } + } +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/shared_lib/test_inference.cc b/onnxruntime/test/shared_lib/test_inference.cc index 91453102d406f..52dd2a84e383b 100644 --- a/onnxruntime/test/shared_lib/test_inference.cc +++ b/onnxruntime/test/shared_lib/test_inference.cc @@ -208,7 +208,7 @@ static constexpr PATH_TYPE MODEL_WITH_CUSTOM_MODEL_METADATA = TSTR("testdata/mod static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI = TSTR("testdata/VariedInputCustomOp.onnx"); static constexpr PATH_TYPE VARIED_INPUT_CUSTOM_OP_MODEL_URI_2 = TSTR("testdata/foo_3.onnx"); static constexpr 
PATH_TYPE OPTIONAL_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI = TSTR("testdata/foo_bar_1.onnx"); -static constexpr PATH_TYPE OPTIONAL_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI_2 = TSTR("testdata/foo_bar_2.onnx"); +static constexpr PATH_TYPE OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2 = TSTR("testdata/foo_bar_2.onnx"); static constexpr PATH_TYPE VARIADIC_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI = TSTR("testdata/custom_op_variadic_io.onnx"); static constexpr PATH_TYPE VARIADIC_UNDEF_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI = TSTR( "testdata/custom_op_variadic_undef_io.onnx"); @@ -1082,7 +1082,7 @@ TEST(CApiTest, invalid_variadic_input_homogeneity_custom_op) { } } -TEST(CApiTest, optional_input_output_custom_op_handler) { +TEST(CApiTest, optional_input_custom_op_handler) { MyCustomOpWithOptionalInput custom_op{onnxruntime::kCpuExecutionProvider}; // `MyCustomOpFooBar` defines a custom op with atmost 3 inputs and the second input is optional. @@ -1147,7 +1147,7 @@ TEST(CApiTest, optional_input_output_custom_op_handler) { { std::vector input_names = {"X1", "X2"}; ort_inputs.erase(ort_inputs.begin() + 2); // remove the last input in the container - Ort::Session session(*ort_env, OPTIONAL_INPUT_OUTPUT_CUSTOM_OP_MODEL_URI_2, session_options); + Ort::Session session(*ort_env, OPTIONAL_INPUT_CUSTOM_OP_MODEL_URI_2, session_options); auto ort_outputs = session.Run(Ort::RunOptions{}, input_names.data(), ort_inputs.data(), ort_inputs.size(), &output_name, 1); ASSERT_EQ(ort_outputs.size(), 1u); @@ -1166,6 +1166,7 @@ TEST(CApiTest, optional_input_output_custom_op_handler) { } } } + TEST(CApiTest, custom_op_with_attributes_handler) { MyCustomOpWithAttributes custom_op{onnxruntime::kCpuExecutionProvider}; From 141966bb69468ce87d717df59fa01fd64bc35112 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Mon, 18 Mar 2024 11:17:34 -0700 Subject: [PATCH 23/55] Disable TF32 in tests of CUDA ep (#19963) Operator or model test result shall not depend on whether NVIDIA_TF32_OVERRIDE environment variable is set or not. 
This makes test results more deterministic. --- .../test/contrib_ops/attention_op_test.cc | 14 --------- .../test/contrib_ops/beam_search_test.cc | 20 ++++++++++--- .../test/contrib_ops/greedy_search_test.cc | 16 ++++++++-- .../contrib_ops/packed_attention_op_test.cc | 3 +- onnxruntime/test/contrib_ops/sampling_test.cc | 9 +++++- onnxruntime/test/onnx/main.cc | 18 ++++++++---- onnxruntime/test/providers/cpu/model_tests.cc | 29 ++++++++----------- onnxruntime/test/util/default_providers.cc | 6 ++-- 8 files changed, 68 insertions(+), 47 deletions(-) diff --git a/onnxruntime/test/contrib_ops/attention_op_test.cc b/onnxruntime/test/contrib_ops/attention_op_test.cc index b652e0723f5aa..7fe70fd2d6f09 100644 --- a/onnxruntime/test/contrib_ops/attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/attention_op_test.cc @@ -2013,13 +2013,6 @@ TEST(AttentionTest, AttentionMaskIndexOutOfRange) { #if !defined(__wasm__) // TODO: fix in web assembly TEST(AttentionTest, AttentionPastState_dynamic) { - // ORT enables TF32 in GEMM for A100. TF32 will cause precsion loss and fail this test. - // Do not run this test unless TF32 is disabled explicitly. - if (HasCudaEnvironment(800) && ParseEnvironmentVariableWithDefault("NVIDIA_TF32_OVERRIDE", 1) != 0) { - GTEST_SKIP() << "Skipping AttentionPastState_dynamic in A100 since TF32 is enabled"; - return; - } - // create rand inputs RandomValueGenerator random{}; @@ -2101,13 +2094,6 @@ static void RunModelWithRandomInput( std::vector& mask_index_data, std::string& onnx_model, bool is_float16) { - // ORT enables TF32 in GEMM for A100. TF32 will cause precsion loss and fail this test. - // Do not run this test unless TF32 is disabled explicitly.
- if (HasCudaEnvironment(800) && ParseEnvironmentVariableWithDefault("NVIDIA_TF32_OVERRIDE", 1) != 0) { - GTEST_SKIP() << "Skipping RunModelWithRandomInput in A100 since TF32 is enabled"; - return; - } - RandomValueGenerator random{234}; constexpr int hidden_size = 768; diff --git a/onnxruntime/test/contrib_ops/beam_search_test.cc b/onnxruntime/test/contrib_ops/beam_search_test.cc index 156ed3799fc22..6ce9f5de68f11 100644 --- a/onnxruntime/test/contrib_ops/beam_search_test.cc +++ b/onnxruntime/test/contrib_ops/beam_search_test.cc @@ -8,6 +8,10 @@ #include "core/session/onnxruntime_cxx_api.h" #include "test/common/cuda_op_test_utils.h" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" +#endif + extern std::unique_ptr ort_env; namespace onnxruntime { @@ -70,7 +74,9 @@ TEST(BeamSearchTest, GptBeamSearchFp32) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #endif #ifdef USE_ROCM @@ -161,7 +167,9 @@ TEST(BeamSearchTest, GptBeamSearchFp16) { if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #endif #ifdef USE_ROCM @@ -254,7 +262,9 @@ TEST(BeamSearchTest, GptBeamSearchWithInitDecoderFp16) { if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #endif #ifdef USE_ROCM @@ -346,7 +356,9 @@ TEST(BeamSearchTest, 
GptBeamSearchFp16_VocabPadded) { if (enable_cuda || enable_rocm) { Ort::SessionOptions session_options; #ifdef USE_CUDA - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #endif #ifdef USE_ROCM diff --git a/onnxruntime/test/contrib_ops/greedy_search_test.cc b/onnxruntime/test/contrib_ops/greedy_search_test.cc index 1baf50c1ba616..73da82d4bb039 100644 --- a/onnxruntime/test/contrib_ops/greedy_search_test.cc +++ b/onnxruntime/test/contrib_ops/greedy_search_test.cc @@ -8,6 +8,10 @@ #include "core/session/onnxruntime_cxx_api.h" #include "test/common/cuda_op_test_utils.h" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" +#endif + extern std::unique_ptr ort_env; namespace onnxruntime { @@ -64,9 +68,13 @@ TEST(GreedySearchTest, GptGreedySearchFp16_VocabPadded) { if (is_cuda || is_rocm) { Ort::SessionOptions session_options; +#ifdef USE_CUDA if (is_cuda) { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); } +#endif if (is_rocm) { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0)); } @@ -145,9 +153,13 @@ TEST(GreedySearchTest, GptGreedySearchFp32) { if (is_cuda || is_rocm) { Ort::SessionOptions session_options; +#ifdef USE_CUDA if (is_cuda) { - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); } +#endif if (is_rocm) { Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_ROCM(session_options, 0)); } diff --git a/onnxruntime/test/contrib_ops/packed_attention_op_test.cc 
b/onnxruntime/test/contrib_ops/packed_attention_op_test.cc index 31ef62e69bb88..09baf8def05f6 100644 --- a/onnxruntime/test/contrib_ops/packed_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/packed_attention_op_test.cc @@ -433,8 +433,7 @@ static void RunModelWithRandomInput( std::vector token_offset_dims{batch_size, sequence_length}; std::vector cum_seq_len_dims{batch_size + 1}; - // TF32 in SM >= 80 is enabled by default, need larger threshold for float when TF32 is enabled. - float gpu_threshold = is_float16 ? 0.15f : (HasCudaEnvironment(800) ? 0.05f : 0.005f); + float gpu_threshold = is_float16 ? 0.15f : 0.005f; gpu_threshold *= sequence_length > 1024 ? 4.0f : 1.0f; // threshold should increase with sequence length bool enable_cuda = HasCudaEnvironment(is_float16 ? 530 : 0); if (enable_cuda) { diff --git a/onnxruntime/test/contrib_ops/sampling_test.cc b/onnxruntime/test/contrib_ops/sampling_test.cc index 733bc9f01fd11..d987a1cae427d 100644 --- a/onnxruntime/test/contrib_ops/sampling_test.cc +++ b/onnxruntime/test/contrib_ops/sampling_test.cc @@ -8,6 +8,10 @@ #include "core/session/onnxruntime_cxx_api.h" #include "test/common/cuda_op_test_utils.h" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" +#endif + extern std::unique_ptr ort_env; namespace onnxruntime { @@ -65,7 +69,10 @@ TEST(SamplingTest, Gpt2Sampling_GPU) { LOGS_DEFAULT(WARNING) << "Hardware NOT support current architecture"; return; } - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_CUDA(session_options, 0)); + + OrtCUDAProviderOptionsV2 cuda_options; + cuda_options.use_tf32 = false; + session_options.AppendExecutionProvider_CUDA_V2(cuda_options); #else // USE_ROCM OrtROCMProviderOptions rocm_options; // TODO - verify the default settings diff --git a/onnxruntime/test/onnx/main.cc b/onnxruntime/test/onnx/main.cc index 9c2c24e3c337d..0d55fd19b918a 100644 --- a/onnxruntime/test/onnx/main.cc +++ b/onnxruntime/test/onnx/main.cc @@ -25,6 +25,10 @@ #include 
"core/session/onnxruntime_session_options_config_keys.h" #include "nlohmann/json.hpp" +#ifdef USE_CUDA +#include "core/providers/cuda/cuda_provider_options.h" +#endif + using namespace onnxruntime; namespace { @@ -401,12 +405,15 @@ int real_main(int argc, char* argv[], Ort::Env& env) { if (enable_tensorrt) { #ifdef USE_TENSORRT - OrtCUDAProviderOptions cuda_options; + Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id)); +#ifdef USE_CUDA + OrtCUDAProviderOptionsV2 cuda_options; cuda_options.device_id = device_id; cuda_options.do_copy_in_default_stream = true; + cuda_options.use_tf32 = false; // TODO: Support arena configuration for users of test runner - Ort::ThrowOnError(OrtSessionOptionsAppendExecutionProvider_Tensorrt(sf, device_id)); - sf.AppendExecutionProvider_CUDA(cuda_options); + sf.AppendExecutionProvider_CUDA_V2(cuda_options); +#endif #else fprintf(stderr, "TensorRT is not supported in this build"); return -1; @@ -424,10 +431,11 @@ int real_main(int argc, char* argv[], Ort::Env& env) { } if (enable_cuda) { #ifdef USE_CUDA - OrtCUDAProviderOptions cuda_options; + OrtCUDAProviderOptionsV2 cuda_options; cuda_options.do_copy_in_default_stream = true; + cuda_options.use_tf32 = false; // TODO: Support arena configuration for users of test runner - sf.AppendExecutionProvider_CUDA(cuda_options); + sf.AppendExecutionProvider_CUDA_V2(cuda_options); #else fprintf(stderr, "CUDA is not supported in this build"); return -1; diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index af71fe5cf79ae..00d96a0664fa0 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -98,21 +98,6 @@ TEST_P(ModelTest, Run) { std::unique_ptr model_info = std::make_unique(model_path.c_str()); -#if defined(__linux__) - // ORT enables TF32 in GEMM for A100. TF32 will cause precsion loss and fail this test. 
- if (HasCudaEnvironment(800) && provider_name == "cuda") { - per_sample_tolerance = 1e-1; - if (model_path.find(ORT_TSTR("SSD")) > 0 || - model_path.find(ORT_TSTR("ssd")) > 0 || - model_path.find(ORT_TSTR("yolov3")) > 0 || - model_path.find(ORT_TSTR("mask_rcnn")) > 0 || - model_path.find(ORT_TSTR("FNS")) > 0) { - SkipTest("Skipping SSD test for big tolearance failure or other errors"); - return; - } - } -#endif - if (model_info->HasDomain(ONNX_NAMESPACE::AI_ONNX_TRAINING_DOMAIN) || model_info->HasDomain(ONNX_NAMESPACE::AI_ONNX_PREVIEW_TRAINING_DOMAIN)) { SkipTest("it has the training domain. No pipeline should need to run these tests."); @@ -192,12 +177,14 @@ TEST_P(ModelTest, Run) { ASSERT_ORT_STATUS_OK(OrtApis::CreateCUDAProviderOptions(&cuda_options)); std::unique_ptr rel_cuda_options( cuda_options, &OrtApis::ReleaseCUDAProviderOptions); - std::vector keys{"device_id"}; + std::vector keys{"device_id", "use_tf32"}; std::vector values; std::string device_id = Env::Default().GetEnvironmentVar("ONNXRUNTIME_TEST_GPU_DEVICE_ID"); values.push_back(device_id.empty() ? "0" : device_id.c_str()); - ASSERT_ORT_STATUS_OK(OrtApis::UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), 1)); + values.push_back("0"); + ASSERT_ORT_STATUS_OK(OrtApis::UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), 2)); + ortso.AppendExecutionProvider_CUDA_V2(*cuda_options); } else if (provider_name == "rocm") { OrtROCMProviderOptions ep_options; @@ -229,6 +216,14 @@ TEST_P(ModelTest, Run) { ASSERT_ORT_STATUS_OK(OrtApis::CreateCUDAProviderOptions(&cuda_options)); std::unique_ptr rel_cuda_options( cuda_options, &OrtApis::ReleaseCUDAProviderOptions); + + std::vector keys{"device_id", "use_tf32"}; + std::vector values; + std::string device_id = Env::Default().GetEnvironmentVar("ONNXRUNTIME_TEST_GPU_DEVICE_ID"); + values.push_back(device_id.empty() ? 
"0" : device_id.c_str()); + values.push_back("0"); + ASSERT_ORT_STATUS_OK(OrtApis::UpdateCUDAProviderOptions(cuda_options, keys.data(), values.data(), 2)); + ortso.AppendExecutionProvider_CUDA_V2(*cuda_options); } else if (provider_name == "migraphx") { OrtMIGraphXProviderOptions ep_options; diff --git a/onnxruntime/test/util/default_providers.cc b/onnxruntime/test/util/default_providers.cc index c12a52c4356aa..6ad2d41edb562 100644 --- a/onnxruntime/test/util/default_providers.cc +++ b/onnxruntime/test/util/default_providers.cc @@ -8,7 +8,7 @@ #ifdef USE_COREML #include "core/providers/coreml/coreml_provider_factory.h" #endif -#if defined(ENABLE_CUDA_NHWC_OPS) +#ifdef USE_CUDA #include #endif #include "core/session/onnxruntime_cxx_api.h" @@ -113,8 +113,9 @@ std::unique_ptr DefaultOpenVINOExecutionProvider() { std::unique_ptr DefaultCudaExecutionProvider() { #ifdef USE_CUDA - OrtCUDAProviderOptions provider_options{}; + OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; + provider_options.use_tf32 = false; if (auto factory = CudaProviderFactoryCreator::Create(&provider_options)) return factory->CreateProvider(); #endif @@ -126,6 +127,7 @@ std::unique_ptr DefaultCudaNHWCExecutionProvider() { #if defined(USE_CUDA) OrtCUDAProviderOptionsV2 provider_options{}; provider_options.do_copy_in_default_stream = true; + provider_options.use_tf32 = false; provider_options.prefer_nhwc = true; if (auto factory = CudaProviderFactoryCreator::Create(&provider_options)) return factory->CreateProvider(); From a4ac727cbbf1c1d5fa1483972591b6693afbb2d6 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Mon, 18 Mar 2024 13:42:51 -0700 Subject: [PATCH 24/55] handle fp16 for where op (#19969) this prevents falling back from webgpu to cpu, aka helps performance --- .../core/providers/js/operators/where.cc | 26 ++++++++++--------- 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/onnxruntime/core/providers/js/operators/where.cc 
b/onnxruntime/core/providers/js/operators/where.cc index 2f8f5e275aa98..dcdf9bee2f783 100644 --- a/onnxruntime/core/providers/js/operators/where.cc +++ b/onnxruntime/core/providers/js/operators/where.cc @@ -6,18 +6,19 @@ namespace onnxruntime { namespace js { -#define REG_ELEMENTWISE_KERNEL(OP_TYPE, VERSION, KERNEL_CLASS) \ - ONNX_OPERATOR_KERNEL_EX( \ - OP_TYPE, \ - kOnnxDomain, \ - VERSION, \ - kJsExecutionProvider, \ - KernelDefBuilder() \ - .TypeConstraint("T", \ - {DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType(), \ - DataTypeImpl::GetTensorType()}), \ +#define REG_ELEMENTWISE_KERNEL(OP_TYPE, VERSION, KERNEL_CLASS) \ + ONNX_OPERATOR_KERNEL_EX( \ + OP_TYPE, \ + kOnnxDomain, \ + VERSION, \ + kJsExecutionProvider, \ + KernelDefBuilder() \ + .TypeConstraint("T", \ + {DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType()}), \ KERNEL_CLASS); #define REG_ELEMENTWISE_VERSIONED_KERNEL(OP_TYPE, VERSION_FROM, VERSION_TO, KERNEL_CLASS) \ @@ -29,6 +30,7 @@ namespace js { KernelDefBuilder() \ .TypeConstraint("T", \ {DataTypeImpl::GetTensorType(), \ + DataTypeImpl::GetTensorType(), \ DataTypeImpl::GetTensorType(), \ DataTypeImpl::GetTensorType(), \ DataTypeImpl::GetTensorType()}), \ From 6bb64683f8f937da7af86bc61df7a4fb28dee5aa Mon Sep 17 00:00:00 2001 From: Ted Themistokleous <107195283+TedThemistokleous@users.noreply.github.com> Date: Mon, 18 Mar 2024 22:40:40 -0400 Subject: [PATCH 25/55] Use version instead of version-dev for ROCm (#19967) --- cmake/CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 02b568abdf8da..655ca1c42ef93 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -325,8 +325,8 @@ if (onnxruntime_USE_ROCM) # replicate strategy used by pytorch to get ROCM_VERSION # 
https://github.com/pytorch/pytorch/blob/5c5b71b6eebae76d744261715231093e62f0d090/cmake/public/LoadHIP.cmake # with modification - if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version-dev") - file(READ "${onnxruntime_ROCM_HOME}/.info/version-dev" ROCM_VERSION_DEV_RAW) + if (EXISTS "${onnxruntime_ROCM_HOME}/.info/version") + file(READ "${onnxruntime_ROCM_HOME}/.info/version" ROCM_VERSION_DEV_RAW) string(REGEX MATCH "^([0-9]+)\.([0-9]+)\.([0-9]+)-.*$" ROCM_VERSION_MATCH ${ROCM_VERSION_DEV_RAW}) elseif (EXISTS "${onnxruntime_ROCM_HOME}/include/rocm_version.h") file(READ "${onnxruntime_ROCM_HOME}/include/rocm_version.h" ROCM_VERSION_H_RAW) @@ -345,7 +345,7 @@ if (onnxruntime_USE_ROCM) else() message(FATAL_ERROR "Cannot determine ROCm version string") endif() - message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version-dev ****\n") + message("\n***** ROCm version from ${onnxruntime_ROCM_HOME}/.info/version ****\n") message("ROCM_VERSION_DEV: ${ROCM_VERSION_DEV}") message("ROCM_VERSION_DEV_MAJOR: ${ROCM_VERSION_DEV_MAJOR}") message("ROCM_VERSION_DEV_MINOR: ${ROCM_VERSION_DEV_MINOR}") From 4c6a6a37f77dae7b54a826527a0d688c7ca46834 Mon Sep 17 00:00:00 2001 From: Xu Xing Date: Tue, 19 Mar 2024 13:59:32 +0800 Subject: [PATCH 26/55] [js/webgpu] Fix NAN caused by un-initialized buffer in instance-norm (#19387) The added case will be NAN because of the un-initialized buffer. 
--- .../lib/wasm/jsep/webgpu/ops/instance-norm.ts | 2 +- js/web/test/data/ops/instance-norm.jsonc | 80 +++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts index 2f652dbd310ab..2c72def089144 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/instance-norm.ts @@ -207,7 +207,7 @@ const computeMean = let offset = currentImageNumber * uniforms.image_size; var sum = ${fillVector('f32', components)}; var squaredSum = ${fillVector('f32', components)}; - for (var i: u32 = 0; i < ${WG}; i++) { + for (var i: u32 = 0; i < min(${WG}, uniforms.H); i++) { let value = input[offset + i + currentChannelNumber * ${WG}]; sum += value[0]; squaredSum += value[1]; diff --git a/js/web/test/data/ops/instance-norm.jsonc b/js/web/test/data/ops/instance-norm.jsonc index e89ac2da3795f..f28b016d47ab9 100644 --- a/js/web/test/data/ops/instance-norm.jsonc +++ b/js/web/test/data/ops/instance-norm.jsonc @@ -224,5 +224,85 @@ ] } ] + }, + { + "name": "Simple test with NHWC, components 1, buffer reuse", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { + "domain": "", + "version": 17 + }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [2, 3, 1, 1], + "type": "float32" + }, + { + "data": [1, 2, 3], + "dims": [3], + "type": "float32" + }, + { + "data": [4, 5, 6], + "dims": [3], + "type": "float32" + } + ], + "outputs": [ + { + "data": [4, 5, 6, 4, 5, 6], + "dims": [2, 3, 1, 1], + "type": "float32" + } + ] + } + ] + }, + { + "name": "Simple test with NHWC, components 2, buffer reuse", + "operator": "InstanceNormalization", + "inputShapeDefinitions": "rankOnly", + "opset": { + "domain": "", + "version": 17 + }, + "cases": [ + { + "name": "Simple test", + "inputs": [ + { + "data": [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 9, 8, 7, 6, 5, 4, 3, 2], + 
"dims": [1, 6, 1, 3], + "type": "float32" + }, + { + "data": [1, 2, 3, 4, 5, 6], + "dims": [6], + "type": "float32" + }, + { + "data": [4, 5, 6, 7, 8, 9], + "dims": [6], + "type": "float32" + } + ], + "outputs": [ + { + "data": [ + 2.775264263153076, 4, 5.224735260009766, 2.5505285263061523, 5, 7.449470520019531, 2.325794219970703, 6, + 9.674205780029297, 11.898944854736328, 7, 2.1010589599609375, 14.123676300048828, 8, 1.876321792602539, + 16.348413467407227, 9, 1.6515865325927734 + ], + "dims": [1, 6, 1, 3], + "type": "float32" + } + ] + } + ] } ] From 26cd3c1fb0245d05e3beb8a9f33ce5f5d274d111 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Tue, 19 Mar 2024 09:33:06 -0700 Subject: [PATCH 27/55] add kernel tests for ops that changed in opset18 (#19767) ### Description - [x] Pad operator has introduced a new input called "axes" which specifies which axis to pad. But it defaults to input_rank if axes is not provided which was the behavior before the opset upgrade. - [x] ReduceMean - [x] ReduceL2 - [x] ReduceLogSumExp - [x] ReduceSum - Reduction ops all had the axes attribute switched to an input and a new attribute called "noop_with_empty_axes" was added to define what to do when axes is not specified. - [x] Resize has had two new attributes introduced: antialias and keep_aspect_ratio_policy. From Operators.md I've gathered: "Antialiasing is achieved by stretching the resampling filter by a factor max(1, 1 / scale), which means that when downsampling, more input pixels contribute to an output pixel." keep_aspect_ratio_policy "describes how to interpret the `sizes` input with regard to keeping the original aspect ratio of the input." there are a couple enum-type options that specify different policies and what to do in each case. - NOTE: Baiju already included opset18 tests in https://github.com/microsoft/onnxruntime/pull/17772 - [x] ScatterElements/ScatterND has had a new attribute introduced called "reduction." 
This specifies the type of reduction to apply: none (default), add, mul, max, min. - [x] Split introduced a new attribute called "num_outputs" which specifies how many outputs to split the input tensor into. This is in contrast to the previous, default behavior of specifying a "split" input which defines the size of each resultant tensor of the output. ### Motivation and Context --- .../core/graph/gradient_builder.cc | 37 ++++++++++++++----- .../test/gradient/gradient_ops_test.cc | 30 +++++++++++++-- 2 files changed, 55 insertions(+), 12 deletions(-) diff --git a/orttraining/orttraining/core/graph/gradient_builder.cc b/orttraining/orttraining/core/graph/gradient_builder.cc index e675b55c8af8f..22dcf4eb92411 100755 --- a/orttraining/orttraining/core/graph/gradient_builder.cc +++ b/orttraining/orttraining/core/graph/gradient_builder.cc @@ -1112,6 +1112,7 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) { ArgDef grad = GO(0); if (!keepdims) { + size_t numInputs = GetSrcNodeInputSize(); if (attributes.find("axes") != attributes.end()) { std::vector axes_values = RetrieveValues(attributes.at("axes")); grad = IA("Unqueezed_Grad"); @@ -1122,6 +1123,9 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceMeanGradient) { result.push_back(axes_values_node); result.push_back(NodeDef(OpDef{"Unsqueeze", kOnnxDomain, 13}, {GO(0), axes_values_node.output_args[0]}, {grad})); } + } else if (numInputs == 2) { // optional input 'axes' is available as input I(1) + grad = IA("Unqueezed_Grad"); + result.push_back(NodeDef("Unsqueeze", {GO(0), I(1)}, {grad})); } } @@ -1152,12 +1156,21 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceLogSumExpGradient) { } ArgDef grad = GO(0); - if (!keepdims && attributes.find("axes") != attributes.end()) { - std::vector axes_values = RetrieveValues(attributes.at("axes")); - grad = IA("Unsqueezed_Grad"); - result.push_back(NodeDef("Unsqueeze", {GO(0)}, {grad}, {MakeAttribute("axes", axes_values)})); + if (!keepdims) { + size_t numInputs = GetSrcNodeInputSize(); + if 
(attributes.find("axes") != attributes.end()) { + std::vector axes_values = RetrieveValues(attributes.at("axes")); + grad = IA("Unsqueezed_Grad"); - result.push_back(NodeDef("Unsqueeze", {O(0)}, {IA("Unsqueezed_Output")}, {MakeAttribute("axes", axes_values)})); + result.push_back(NodeDef("Unsqueeze", {GO(0)}, {grad}, {MakeAttribute("axes", axes_values)})); + + result.push_back(NodeDef("Unsqueeze", {O(0)}, {IA("Unsqueezed_Output")}, {MakeAttribute("axes", axes_values)})); + } else if (numInputs == 2) { // optional input 'axes' is available as input I(1) + grad = IA("Unsqueezed_Grad"); + result.push_back(NodeDef("Unsqueeze", {GO(0), I(1)}, {grad})); + + result.push_back(NodeDef("Unsqueeze", {O(0), I(1)}, {IA("Unsqueezed_Output")})); + } result.push_back(NodeDef("Sub", {I(0), IA("Unsqueezed_Output")}, {IA("Self_Sub_Result")})); } else { result.push_back(NodeDef("Sub", {I(0), O(0)}, {IA("Self_Sub_Result")})); @@ -1188,11 +1201,17 @@ IMPLEMENT_GRADIENT_BUILDER(GetReduceL2Gradient) { ArgDef scaled_dy_arg_def = IA("Masked_Scaled_dY"); result.emplace_back(NodeDef("Where", {IA("Masked_Y"), ZERO, IA("Scaled_dY")}, {scaled_dy_arg_def})); - if (!keepdims && attributes.find("axes") != attributes.end()) { - std::vector axes_values = RetrieveValues(attributes.at("axes")); + if (!keepdims) { + size_t numInputs = GetSrcNodeInputSize(); scaled_dy_arg_def = IA("Unsqueezed_Masked_Scaled_dY"); - result.emplace_back( - NodeDef("Unsqueeze", {IA("Masked_Scaled_dY")}, {scaled_dy_arg_def}, {MakeAttribute("axes", axes_values)})); + if (attributes.find("axes") != attributes.end()) { + std::vector axes_values = RetrieveValues(attributes.at("axes")); + result.emplace_back( + NodeDef("Unsqueeze", {IA("Masked_Scaled_dY")}, {scaled_dy_arg_def}, {MakeAttribute("axes", axes_values)})); + } else if (numInputs == 2) { // optional input 'axes' is available as input I(1) + result.emplace_back( + NodeDef("Unsqueeze", {IA("Masked_Scaled_dY"), I(1)}, {scaled_dy_arg_def})); + } } 
result.emplace_back(NodeDef("Mul", {I(0), scaled_dy_arg_def}, {GI(0)})); diff --git a/orttraining/orttraining/test/gradient/gradient_ops_test.cc b/orttraining/orttraining/test/gradient/gradient_ops_test.cc index feca94ae27c13..94ca96c68f2ce 100644 --- a/orttraining/orttraining/test/gradient/gradient_ops_test.cc +++ b/orttraining/orttraining/test/gradient/gradient_ops_test.cc @@ -607,6 +607,10 @@ TEST(GradientCheckerTest, ReduceMeanGrad) { OpDef op_def_opset13{"ReduceMean", kOnnxDomain, 13}; RunReductionTests(op_def_opset13); + + // axes is input from opset 18. + OpDef op_def_opset18{"ReduceMean", kOnnxDomain, 18}; + RunReductionTests(op_def_opset18, true, true); } TEST(GradientCheckerTest, ReduceSumGrad) { @@ -619,6 +623,10 @@ TEST(GradientCheckerTest, ReduceSumGrad) { OpDef op_def_13{"ReduceSum", kOnnxDomain, 13}; RunReductionTests(op_def_13, true, true); + + OpDef op_def_18{"ReduceSum", kOnnxDomain, 18}; + + RunReductionTests(op_def_18, true, true); } TEST(GradientCheckerTest, ReduceL2Grad) { @@ -641,6 +649,11 @@ TEST(GradientCheckerTest, ReduceL2Grad) { {MakeAttribute("axes", axes)})); EXPECT_IS_TINY(max_error); } + + // axes is input from opset 18 + OpDef op_def_18{"ReduceL2", kOnnxDomain, 18}; + + RunReductionTests(op_def_18, true, true); } TEST(GradientCheckerTest, ReduceLogSumExpGrad) { @@ -648,6 +661,10 @@ TEST(GradientCheckerTest, ReduceLogSumExpGrad) { OpDef op_def{"ReduceLogSumExp", kOnnxDomain, 11}; RunReductionTests(op_def); + + OpDef op_def_opset18{"ReduceLogSumExp", kOnnxDomain, 18}; + + RunReductionTests(op_def_opset18, true, true); } TEST(GradientCheckerTest, ReluGrad) { @@ -698,6 +715,13 @@ TEST(GradientCheckerTest, SplitGrad) { ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def_13, {shape}, {{3, 5}, {3, 5}, {3, 5}}, &max_error, {MakeAttribute("axis", int64_t(0))})); EXPECT_IS_TINY(max_error); + + // opset18 test + OpDef op_def_18{"Split", kOnnxDomain, 18}; + ASSERT_STATUS_OK(gradient_checker.ComputeGradientError(op_def_18, {shape}, 
{{3, 5}, {3, 5}, {3, 5}}, &max_error, + {MakeAttribute("axis", int64_t(0)), + MakeAttribute("num_outputs", int64_t(3))})); + EXPECT_IS_TINY(max_error); } template @@ -2733,7 +2757,7 @@ TEST(GradientCheckerTest, TileGrad) { TEST(GradientCheckerTest, PadGrad) { float max_error; GradientChecker gradient_checker; - OpDef op_def{"Pad", kOnnxDomain, 11}; + OpDef op_def{"Pad", kOnnxDomain, 18}; { TensorInfo x_info({2, 4}, true); @@ -2803,7 +2827,7 @@ TEST(GradientCheckerTest, PadGrad) { TEST(GradientCheckerTest, ScatterNDGrad) { float max_error; GradientChecker gradient_checker; - OpDef op_def{"ScatterND", kOnnxDomain, 11}; + OpDef op_def{"ScatterND", kOnnxDomain, 18}; { TensorInfo data_info({8}, true); @@ -2887,7 +2911,7 @@ TEST(GradientCheckerTest, ScatterNDGrad) { TEST(GradientCheckerTest, ScatterElementsGrad) { float max_error; GradientChecker gradient_checker; - OpDef op_def{"ScatterElements", kOnnxDomain, 13}; + OpDef op_def{"ScatterElements", kOnnxDomain, 18}; { // without axis TensorInfo data_info({3, 3}, true); From d4c8bc359e321cdabdd87b70b392dd0e7a14502e Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Wed, 20 Mar 2024 00:33:24 +0800 Subject: [PATCH 28/55] Fix Training CPU docker image name to avoid unnecessary rebuilding (#19973) ### Description The docker image name was fixed, but the docker argument was different in different job. It would trigger rebuilding the docker image almost every time!!! 
--- .../azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index bf1ba71b7b818..0e6e5bd53fab3 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -46,7 +46,7 @@ stages: --build-arg PYTHON_VERSION=$(PythonVersion) --build-arg INSTALL_DEPS_EXTRA_ARGS=-tu --build-arg BUILD_UID=$(id -u) - Repository: onnxruntimetrainingcpubuild + Repository: onnxruntimetrainingcpubuild_$(PythonVersion) - task: CmdLine@2 displayName: 'build onnxruntime' From 8293aa156414946428f635c71edea88cb20b4925 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 19 Mar 2024 11:36:42 -0700 Subject: [PATCH 29/55] Exclude TRT provider in tests crashed in A100 (#19972) TensorRT EP segmentation fault on A100 for some tests. Exclude TRT EP in those tests on A100 to unblock developing. 
### Motivation and Context https://github.com/microsoft/onnxruntime/issues/19530 --- onnxruntime/test/common/cuda_op_test_utils.cc | 36 +++++++++ onnxruntime/test/common/cuda_op_test_utils.h | 27 ++----- onnxruntime/test/common/trt_op_test_utils.h | 33 ++++++++ .../test/providers/cpu/math/einsum_test.cc | 75 +++++++++--------- .../cpu/math/element_wise_ops_test.cc | 6 +- .../cpu/object_detection/roialign_test.cc | 7 +- .../providers/cpu/tensor/onehot_op_test.cc | 14 ++-- .../providers/cpu/tensor/resize_op_test.cc | 76 +++++++++++-------- .../providers/cpu/tensor/upsample_op_test.cc | 5 +- 9 files changed, 178 insertions(+), 101 deletions(-) create mode 100644 onnxruntime/test/common/cuda_op_test_utils.cc create mode 100644 onnxruntime/test/common/trt_op_test_utils.h diff --git a/onnxruntime/test/common/cuda_op_test_utils.cc b/onnxruntime/test/common/cuda_op_test_utils.cc new file mode 100644 index 0000000000000..bab4e9a60e2ed --- /dev/null +++ b/onnxruntime/test/common/cuda_op_test_utils.cc @@ -0,0 +1,36 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#ifdef USE_CUDA +#include "cuda_runtime_api.h" +#endif + +namespace onnxruntime { +namespace test { + +int GetCudaArchitecture() { + // This will cache the result so we only call cudaGetDeviceProperties once. + // Usually, we test on a single GPU or multiple GPUs of same architecture, so it's fine to cache the result. + static int cuda_arch = -1; + +#ifdef USE_CUDA + if (cuda_arch == -1) { + int current_device_id = 0; + cudaGetDevice(&current_device_id); + // must wait GPU idle, otherwise cudaGetDeviceProperties might fail + cudaDeviceSynchronize(); + cudaDeviceProp prop; + + // When cudaGetDeviceProperties fails, just return -1 and no error is raised. + // If cuda device has issue, test will fail anyway so no need to raise error here. 
+ if (cudaSuccess == cudaGetDeviceProperties(&prop, current_device_id)) { + cuda_arch = prop.major * 100 + prop.minor * 10; + } + } +#endif + + return cuda_arch; +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/common/cuda_op_test_utils.h b/onnxruntime/test/common/cuda_op_test_utils.h index 043e3059c38d7..6f3e460628566 100644 --- a/onnxruntime/test/common/cuda_op_test_utils.h +++ b/onnxruntime/test/common/cuda_op_test_utils.h @@ -4,37 +4,20 @@ #pragma once #include "test/util/include/default_providers.h" -#ifdef USE_CUDA -#include "cuda_runtime_api.h" -#endif namespace onnxruntime { namespace test { +// CUDA architecture of the current device like 100 * major + 10 * minor. +// Please call this function after CUDA EP is enabled. +int GetCudaArchitecture(); + inline bool HasCudaEnvironment(int min_cuda_architecture) { if (DefaultCudaExecutionProvider().get() == nullptr) { return false; } - if (min_cuda_architecture == 0) { - return true; - } - - int cuda_architecture = 0; - -#ifdef USE_CUDA - int currentCudaDevice = 0; - cudaGetDevice(&currentCudaDevice); - cudaDeviceSynchronize(); - cudaDeviceProp prop; - if (cudaSuccess != cudaGetDeviceProperties(&prop, currentCudaDevice)) { - return false; - } - - cuda_architecture = prop.major * 100 + prop.minor * 10; -#endif - - return cuda_architecture >= min_cuda_architecture; + return GetCudaArchitecture() >= min_cuda_architecture; } inline bool NeedSkipIfCudaArchLowerThan(int min_cuda_architecture) { diff --git a/onnxruntime/test/common/trt_op_test_utils.h b/onnxruntime/test/common/trt_op_test_utils.h new file mode 100644 index 0000000000000..a0b0b9bb1931f --- /dev/null +++ b/onnxruntime/test/common/trt_op_test_utils.h @@ -0,0 +1,33 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include "test/common/cuda_op_test_utils.h" + +namespace onnxruntime { +namespace test { + +// TensorRT EP Segmentation fault on A100: https://github.com/microsoft/onnxruntime/issues/19530 +inline const std::unordered_set ExcludeTrtOnA100() { + // Note: GetCudaArchitecture need USE_CUDA to be defined. Currently, it is defined when TRT EP is enabled. + // If we want to make TRT EP independent of CUDA EP, we need to change the implementation of GetCudaArchitecture. + if (DefaultTensorrtExecutionProvider() != nullptr && GetCudaArchitecture() == 800) { + return {kTensorrtExecutionProvider}; + } + + return {}; +} + +// Add TensorRT EP to an excluded provider list when running on A100 +inline const std::unordered_set& ExcludeTrtOnA100(std::unordered_set& excluded_providers) { + if (DefaultTensorrtExecutionProvider() != nullptr && GetCudaArchitecture() == 800) { + excluded_providers.insert(kTensorrtExecutionProvider); + return excluded_providers; + } + + return excluded_providers; +} + +} // namespace test +} // namespace onnxruntime diff --git a/onnxruntime/test/providers/cpu/math/einsum_test.cc b/onnxruntime/test/providers/cpu/math/einsum_test.cc index 4e968d3de6b8a..423ea3f682f4c 100644 --- a/onnxruntime/test/providers/cpu/math/einsum_test.cc +++ b/onnxruntime/test/providers/cpu/math/einsum_test.cc @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "test/common/cuda_op_test_utils.h" +#include "test/common/trt_op_test_utils.h" #include "core/framework/data_types.h" #include "core/util/math.h" @@ -50,7 +51,7 @@ TEST(Einsum, ExplicitEinsumAsTransposeOp_2D_input_With_Broadcasting) { test.AddAttribute("equation", "...i->i..."); test.AddInput("x", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2, 2}, {1.f, 3.f, 2.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedTransposeOp_3D_input) { @@ -58,7 +59,7 @@ TEST(Einsum, 
ExplicitEinsumAsBatchedTransposeOp_3D_input) { test.AddAttribute("equation", "...ji->...ij"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2, 2, 2}, {1.f, 3.f, 2.f, 4.f, 1.f, 3.f, 2.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Implicit @@ -75,7 +76,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedTransposeOp_3D_input) { test.AddAttribute("equation", "...ji"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2, 2, 2}, {1.f, 3.f, 2.f, 4.f, 1.f, 3.f, 2.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Theme: Axis/Axes reduction @@ -102,7 +103,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedReduceOp_3D_input_0) { test.AddAttribute("equation", "...ji->...j"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2, 2}, {3.f, 7.f, 3.f, 7.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedReduceOp_3D_input_1) { @@ -110,7 +111,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedReduceOp_3D_input_1) { test.AddAttribute("equation", "...ji->..."); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("y", {2}, {10.f, 10.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Implicit @@ -144,7 +145,7 @@ TEST(Einsum, ExplicitEinsumAsOuterProductWithTransposeOp_Multi_Input) { test.AddInput("y", {2}, {3.f, 4.f}); test.AddInput("z", {2}, {5.f, 6.f}); test.AddOutput("o", {2, 2, 2}, {15.f, 18.f, 30.f, 36.f, 20.f, 24.f, 40.f, 48.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Implicit @@ -155,7 +156,7 @@ TEST(Einsum, ImplicitEinsumAsOuterProductOp_2D_input) { test.AddInput("y", {2}, {3.f, 4.f}); test.AddInput("z", {2}, {5.f, 6.f}); test.AddOutput("o", {2, 
2, 2}, {15.f, 18.f, 20.f, 24.f, 30.f, 36.f, 40.f, 48.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsOuterProductOp_Multi_Input) { @@ -165,7 +166,7 @@ TEST(Einsum, ImplicitEinsumAsOuterProductOp_Multi_Input) { test.AddInput("y", {2}, {3.f, 4.f}); test.AddInput("z", {2}, {5.f, 6.f}); test.AddOutput("o", {2, 2, 2}, {15.f, 18.f, 20.f, 24.f, 30.f, 36.f, 40.f, 48.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Theme: MatMul @@ -233,7 +234,7 @@ TEST(Einsum, ExplicitEinsumAsMatmul_Multi_Input) { test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddInput("z", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {37.f, 81.f, 54.f, 118.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedMatmul) { @@ -251,7 +252,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedMatmulWithBroadcasting_0) { test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddInput("y", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2, 2}, {7.f, 10.f, 15.f, 22.f, 7.f, 10.f, 15.f, 22.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedMatmulWithBroadcasting_1) { @@ -260,7 +261,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedMatmulWithBroadcasting_1) { test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddInput("y", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2, 2}, {14.f, 20.f, 30.f, 44.f, 14.f, 20.f, 30.f, 44.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsMatmul_OutputTransposed) { @@ -303,7 +304,7 @@ TEST(Einsum, ImplicitEinsumAsMatmul_Multi_Input) { test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddInput("z", {2, 2}, {1.f, 
2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {37.f, 54.f, 81.f, 118.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsBatchedMatmul) { OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); @@ -320,7 +321,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedMatmulWithBroadcasting_0) { test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddInput("y", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2, 2}, {7.f, 10.f, 15.f, 22.f, 7.f, 10.f, 15.f, 22.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsMatmul_2) { @@ -343,7 +344,7 @@ TEST(Einsum, DiagonalWithMatmul) { test.AddInput("x", {2, 2, 3}, {1.f, 2.f, 3.f, 1.f, 2.f, 3.f, 1.f, 2.f, 3.f, 1.f, 2.f, 3.f}); test.AddInput("y", {3, 3}, {1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f}); test.AddOutput("o", {3}, {60.f, 72.f, 84.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Theme: Diagonal parsing @@ -354,7 +355,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOp) { test.AddAttribute("equation", "ii->i"); test.AddInput("x", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2}, {1.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOp_1) { @@ -362,7 +363,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOp_1) { test.AddAttribute("equation", "iii->i"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2}, {1.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisReduced) { @@ -370,7 +371,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisReduced) { test.AddAttribute("equation", "iji->j"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); 
test.AddOutput("o", {2}, {3.f, 7.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisPreserved) { @@ -378,7 +379,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithAxisPreserved) { test.AddAttribute("equation", "iji->ij"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {1.f, 3.f, 2.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose) { @@ -386,7 +387,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose) { test.AddAttribute("equation", "iji->ji"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {1.f, 2.f, 3.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // ROCm doesn't support double @@ -396,7 +397,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_double) { test.AddAttribute("equation", "iji->ji"); test.AddInput("x", {2, 2, 2}, {1., 2., 3., 4., 1., 2., 3., 4.}); test.AddOutput("o", {2, 2}, {1., 2., 3., 4.}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } #endif @@ -405,7 +406,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_int32) { test.AddAttribute("equation", "iji->ji"); test.AddInput("x", {2, 2, 2}, {1, 2, 3, 4, 1, 2, 3, 4}); test.AddOutput("o", {2, 2}, {1, 2, 3, 4}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_int64) { @@ -413,14 +414,14 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOpWithTranspose_int64) { test.AddAttribute("equation", "iji->ji"); test.AddInput("x", {2, 2, 2}, {1, 2, 3, 4, 1, 2, 3, 4}); test.AddOutput("o", {2, 2}, {1, 2, 3, 4}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, 
ExplicitEinsumAsBatchedDiagonalOp) { OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); test.AddAttribute("equation", "...ii->...i"); test.AddInput("x", {3, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {3, 2}, {1.f, 4.f, 1.f, 4.f, 1.f, 4.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsBatchedDiagonalOp_1) { @@ -428,7 +429,7 @@ TEST(Einsum, ExplicitEinsumAsBatchedDiagonalOp_1) { test.AddAttribute("equation", "...iij->...j"); test.AddInput("x", {2, 2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {4.f, 6.f, 4.f, 6.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Implicit (Implicit diagonal ops will sum up diagonal values) @@ -442,7 +443,7 @@ TEST(Einsum, ImplicitEinsumAsDiagonalOp) { test.AddAttribute("equation", "ii"); test.AddInput("x", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {}, {5.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsDiagonalOp_1) { @@ -455,7 +456,7 @@ TEST(Einsum, ImplicitEinsumAsDiagonalOp_1) { test.AddAttribute("equation", "iii"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {}, {5.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsDiagonalOpWithAxisReduced) { @@ -463,7 +464,7 @@ TEST(Einsum, ImplicitEinsumAsDiagonalOpWithAxisReduced) { test.AddAttribute("equation", "iji"); test.AddInput("x", {2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2}, {3.f, 7.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp) { @@ -471,7 +472,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp) { 
test.AddAttribute("equation", "...ii"); test.AddInput("x", {2, 1, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 1}, {5.f, 5.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp_1) { @@ -479,7 +480,7 @@ TEST(Einsum, ImplicitEinsumAsBatchedDiagonalOp_1) { test.AddAttribute("equation", "...iij"); test.AddInput("x", {2, 2, 2, 2}, {1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f, 1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {4.f, 6.f, 4.f, 6.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Theme: Scalar inputs and outputs @@ -491,7 +492,7 @@ TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar) { test.AddInput("x", {}, {10.f}); test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {10.f, 20.f, 30.f, 40.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithTwoScalars_Multi_Input) { @@ -501,7 +502,7 @@ TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithTwoScalars_Multi_Input) { test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddInput("z", {}, {10.f}); test.AddOutput("o", {2, 2}, {100.f, 200.f, 300.f, 400.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithAllScalars) { OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); @@ -527,7 +528,7 @@ TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithOneScalar) { test.AddInput("x", {}, {10.f}); test.AddInput("y", {2, 2}, {1.f, 2.f, 3.f, 4.f}); test.AddOutput("o", {2, 2}, {10.f, 20.f, 30.f, 40.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithThreeScalars_Multi_Input) { @@ -538,7 +539,7 @@ TEST(Einsum, 
ImplicitEinsumAsElementwiseMulOpWithThreeScalars_Multi_Input) { test.AddInput("c", {}, {10.f}); test.AddInput("d", {}, {10.f}); test.AddOutput("o", {2, 2}, {1000.f, 2000.f, 3000.f, 4000.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ImplicitEinsumAsElementwiseMulOpWithAllScalars) { OpTester test("Einsum", 12, onnxruntime::kOnnxDomain); @@ -568,7 +569,7 @@ TEST(Einsum, ExplicitEinsumAsTensorContractionReshapeFinal) { test.AddInput("y", {2, 2}, {1.f, 2.f, -6.f, 2.f}); test.AddInput("z", {2, 2}, {3.f, 4.f, 5.f, 6.f}); test.AddOutput("o", {2, 2, 2}, {63.f, -132.f, 63.f, -132.f, 63.f, -132.f, 63.f, -132.f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsTensorContractionReshapeLeft) { @@ -720,7 +721,7 @@ TEST(Einsum, ExplicitEinsumAsDiagonalOp_Half) { ConvertFloatToMLFloat16(output_f.data(), output.data(), 2); test.AddInput("x", {2, 2}, input_x); test.AddOutput("o", {2}, output); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar_Half) { @@ -741,7 +742,7 @@ TEST(Einsum, ExplicitEinsumAsElementwiseMulOpWithOneScalar_Half) { test.AddInput("x", {}, input_x); test.AddInput("y", {2, 2}, input_y); test.AddOutput("o", {2, 2}, output); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(Einsum, ExplicitEinsumAsTensorContraction_Half) { @@ -2093,7 +2094,7 @@ TEST_P(EinsumTransposeMatMulThreeInputsTest, EinsumTransposeMatMulThreeInputsTes std::vector v1(tst.shape.begin(), tst.shape.end()); std::vector v2(tst.expected.begin(), tst.expected.end()); test.AddOutput("o", v1, v2); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } INSTANTIATE_TEST_SUITE_P(EinsumTransposeMatMulThreeInputsTests, EinsumTransposeMatMulThreeInputsTest, testing::ValuesIn(case1)); diff --git 
a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index d35e5c78cfd69..0e99b2306873e 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -6,6 +6,7 @@ #include "test/util/include/default_providers.h" #include "test/common/dnnl_op_test_utils.h" #include "test/common/cuda_op_test_utils.h" +#include "test/common/trt_op_test_utils.h" #include "core/util/math.h" #include #include @@ -1370,7 +1371,8 @@ static void TestSumMultipleInputsNoBroadcasting(size_t num_inputs, const TensorS test.AddOutput("sum", dims, expected_output_data); - test.Run(); + // TRT EP segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(MathOpTest, SumMultipleInputsNoBroadcasting) { @@ -2639,6 +2641,7 @@ void TrigFloatTest(OpTester& test, std::initializer_list input) { test.AddInput("X", dims, input); test.AddOutput("Y", dims, output); + test.Run(); } @@ -2708,6 +2711,7 @@ TEST(MathOpTest, CosFloat16) { TrigFloat16Test<::cosf>(test, {1.1f, -1.1f, 2.2f, -2.2f}); } } + TEST(MathOpTest, Tan) { OpTester test("Tan"); TrigFloatTest<::tanf>(test, {-100.0f, -50.0f, 0.0f, 50.0f, 100.0f}); diff --git a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc index 2f97f6e71e92b..0bff46edccc12 100644 --- a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc +++ b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" +#include "test/common/trt_op_test_utils.h" namespace onnxruntime { namespace test { @@ -713,7 +714,8 @@ TEST(RoiAlignTest, AvgModeNegativeInvalidMode) { test.AddInput("batch_indices", {5}, {0, 0, 0, 0, 0}); test.AddOutput("Y", {5, 3, 3, 4}, 
{2.95833f, 3.20833f, 3.45833f, 3.70833f, 4.625f, 4.875f, 5.125f, 5.375f, 6.29167f, 6.54167f, 6.79167f, 7.04167f, 27.9583f, 28.2083f, 28.4583f, 28.7083f, 29.625f, 29.875f, 30.125f, 30.375f, 31.2917f, 31.5417f, 31.7917f, 32.0417f, 52.9583f, 53.2083f, 53.4583f, 53.7083f, 54.625f, 54.875f, 55.125f, 55.375f, 56.2917f, 56.5417f, 56.7917f, 57.0417f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 7.39583f, 7.39583f, 7.42708f, 7.64583f, 9.0625f, 9.0625f, 9.09375f, 9.3125f, 10.7292f, 10.7292f, 10.7604f, 10.9792f, 32.3958f, 32.3958f, 32.4271f, 32.6458f, 34.0625f, 34.0625f, 34.0938f, 34.3125f, 35.7292f, 35.7292f, 35.7604f, 35.9792f, 57.3958f, 57.3958f, 57.4271f, 57.6458f, 59.0625f, 59.0625f, 59.0938f, 59.3125f, 60.7292f, 60.7292f, 60.7604f, 60.9792f, 4.27083f, 4.52083f, 4.77083f, 5.02083f, 5.9375f, 6.1875f, 6.4375f, 6.6875f, 7.60417f, 7.85417f, 8.10417f, 8.35417f, 29.2708f, 29.5208f, 29.7708f, 30.0208f, 30.9375f, 31.1875f, 31.4375f, 31.6875f, 32.6042f, 32.8542f, 33.1042f, 33.3542f, 54.2708f, 54.5208f, 54.7708f, 55.0208f, 55.9375f, 56.1875f, 56.4375f, 56.6875f, 57.6042f, 57.8542f, 58.1042f, 58.3542f, 6.77083f, 6.77083f, 6.77083f, 6.80208f, 8.4375f, 8.4375f, 8.4375f, 8.46875f, 10.1042f, 10.1042f, 10.1042f, 10.1354f, 31.7708f, 31.7708f, 31.7708f, 31.8021f, 33.4375f, 33.4375f, 33.4375f, 33.4688f, 35.1042f, 35.1042f, 35.1042f, 35.1354f, 56.7708f, 56.7708f, 56.7708f, 56.8021f, 58.4375f, 58.4375f, 58.4375f, 58.4688f, 60.1042f, 60.1042f, 60.1042f, 60.1354f}); - test.Run(OpTester::ExpectResult::kExpectFailure, "Invalid mode"); + // TRT EP segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectFailure, "Invalid mode", ExcludeTrtOnA100()); } TEST(RoiAlignTest, AvgModeNegativeSamplingRatio) { @@ -738,7 +740,8 @@ TEST(RoiAlignTest, AvgModeNegativeSamplingRatio) { test.AddInput("batch_indices", {5}, {0, 0, 0, 0, 0}); 
test.AddOutput("Y", {5, 3, 3, 4}, {2.95833f, 3.20833f, 3.45833f, 3.70833f, 4.625f, 4.875f, 5.125f, 5.375f, 6.29167f, 6.54167f, 6.79167f, 7.04167f, 27.9583f, 28.2083f, 28.4583f, 28.7083f, 29.625f, 29.875f, 30.125f, 30.375f, 31.2917f, 31.5417f, 31.7917f, 32.0417f, 52.9583f, 53.2083f, 53.4583f, 53.7083f, 54.625f, 54.875f, 55.125f, 55.375f, 56.2917f, 56.5417f, 56.7917f, 57.0417f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 0.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 25.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 50.f, 7.39583f, 7.39583f, 7.42708f, 7.64583f, 9.0625f, 9.0625f, 9.09375f, 9.3125f, 10.7292f, 10.7292f, 10.7604f, 10.9792f, 32.3958f, 32.3958f, 32.4271f, 32.6458f, 34.0625f, 34.0625f, 34.0938f, 34.3125f, 35.7292f, 35.7292f, 35.7604f, 35.9792f, 57.3958f, 57.3958f, 57.4271f, 57.6458f, 59.0625f, 59.0625f, 59.0938f, 59.3125f, 60.7292f, 60.7292f, 60.7604f, 60.9792f, 4.27083f, 4.52083f, 4.77083f, 5.02083f, 5.9375f, 6.1875f, 6.4375f, 6.6875f, 7.60417f, 7.85417f, 8.10417f, 8.35417f, 29.2708f, 29.5208f, 29.7708f, 30.0208f, 30.9375f, 31.1875f, 31.4375f, 31.6875f, 32.6042f, 32.8542f, 33.1042f, 33.3542f, 54.2708f, 54.5208f, 54.7708f, 55.0208f, 55.9375f, 56.1875f, 56.4375f, 56.6875f, 57.6042f, 57.8542f, 58.1042f, 58.3542f, 6.77083f, 6.77083f, 6.77083f, 6.80208f, 8.4375f, 8.4375f, 8.4375f, 8.46875f, 10.1042f, 10.1042f, 10.1042f, 10.1354f, 31.7708f, 31.7708f, 31.7708f, 31.8021f, 33.4375f, 33.4375f, 33.4375f, 33.4688f, 35.1042f, 35.1042f, 35.1042f, 35.1354f, 56.7708f, 56.7708f, 56.7708f, 56.8021f, 58.4375f, 58.4375f, 58.4375f, 58.4688f, 60.1042f, 60.1042f, 60.1042f, 60.1354f}); - test.Run(OpTester::ExpectResult::kExpectFailure, "Sampling ratio should be >=0"); + // TRT EP segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectFailure, "Sampling ratio should be >=0", ExcludeTrtOnA100()); } TEST(RoiAlignTest, AvgModeNegativeInvalidNumRoiDims) { diff --git 
a/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc b/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc index a2ffbdcc0bdf1..55c247e4c2fea 100644 --- a/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/onehot_op_test.cc @@ -3,6 +3,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" +#include "test/common/trt_op_test_utils.h" using namespace std; @@ -36,7 +37,8 @@ TEST(OneHotOpTest, DefaultAxis_float_float_float /*indices, output, depth*/) { 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.}); - test.Run(); + // TRT EP segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int64_int32_float /*indices, output, depth*/) { @@ -51,7 +53,7 @@ TEST(OneHotOpTest, DefaultAxis_int64_int32_float /*indices, output, depth*/) { 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int64_float_int64 /*indices, output, depth*/) { @@ -81,7 +83,7 @@ TEST(OneHotOpTest, DefaultAxis_int32_float_float /*indices, output, depth*/) { 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 1.0f, 0.0f, 0.0f, 0.0f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int32_float_int32 /*indices, output, depth*/) { @@ -231,7 +233,7 @@ TEST(OneHotOpTest, DefaultAxis_float_float_float_NonZeroOffValue /*indices, outp 2., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 2., 3., 2., 2., 2.}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } 
TEST(OneHotOpTest, DefaultAxis_int64_int32_float_NonZeroOffValue /*indices, output, depth*/) { @@ -246,7 +248,7 @@ TEST(OneHotOpTest, DefaultAxis_int64_int32_float_NonZeroOffValue /*indices, outp 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int64_float_int64_NonZeroOffValue /*indices, output, depth*/) { @@ -276,7 +278,7 @@ TEST(OneHotOpTest, DefaultAxis_int32_float_float_NonZeroOffValue /*indices, outp 2.0f, 2.0f, 3.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 3.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 2.0f, 3.0f, 2.0f, 2.0f, 2.0f}); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(OneHotOpTest, DefaultAxis_int32_float_int32_NonZeroOffValue /*indices, output, depth*/) { diff --git a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc index 062f25b989a70..496f2213e9d32 100644 --- a/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/resize_op_test.cc @@ -5,9 +5,11 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" +#include "test/common/trt_op_test_utils.h" namespace onnxruntime { namespace test { + TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_tf_crop_and_resize) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { @@ -243,7 +245,10 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear) { std::vector Y = {2.66666651f, 4.3333331f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); // QNN: result diff + // QNN: result diff + // TRT: Segmentation fault in A100 
+ std::unordered_set excluded_providers({kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear) { @@ -267,8 +272,9 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear) { test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); // CUDA: result mismatch due to not implementing NHWC support // ROCm: results mismatch - test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); + // TRT: Segmentation fault in A100 + std::unordered_set excluded_providers({kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); } TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_uint8) { @@ -315,7 +321,7 @@ TEST(ResizeOpTest, NhwcResizeOpLinearDownSampleTest_4DBilinear_int8) { std::vector Y = {0, 0}; test.AddOutput("Y", {N, static_cast(H * scales[1]), static_cast(W * scales[2]), C}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } // Since NNAPI(TFLite) only using the scale calculate using the input/output size @@ -347,7 +353,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear1) { std::vector Y = {3.5f, 5.5f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -405,7 +411,7 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_align_corners) { std::vector Y = {1.0f, 4.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -608,7 +614,7 @@ 
TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_asymmetric_scales) { 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -725,7 +731,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_align_corners) { 4.0f, 4.5714290f, 5.142857f, 5.714286f, 6.285714f, 6.8571430f, 7.428571f, 8.0f}; test.AddOutput("Y", {static_cast(H * scales[0]), static_cast(W * scales[1])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_3DTrilinear_pytorch_half_pixel) { @@ -819,7 +825,7 @@ TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest) { 7.0f, 11.0f}; test.AddOutput("Y", {N, C, H, W}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -845,7 +851,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest) { std::vector Y = {1.0f, 3.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Opset12) { @@ -867,7 +873,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Opset12) { std::vector Y = {1.0f, 3.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_WithSizes) { @@ -920,7 +926,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_tf_half_pixel) { 14.0f, 16.0f}; test.AddOutput("Y", {N, C, sizes[2], sizes[3]}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, 
ResizeOpNearestDownSampleTest_tf_crop_and_resize_with_extrapolation) { @@ -1000,7 +1006,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSampleTest) { 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestUpSampleTest_WithSizes_CeilMode) { @@ -1093,7 +1099,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Floor_Align_Corners) { 13.0f, 13.0f, 13.0f, 14.0f, 14.0f, 15.0f, 15.0f, 16.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearest_OneToOneMappingBetweenInputAndOutputDataDims) { @@ -1197,7 +1203,7 @@ TEST(ResizeOpTest, ResizeOpNearestUpSample_Nearest2xOptimization_Scales) { 3.0f, 3.0f, 4.0f, 4.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); }; run_test(false); @@ -1262,7 +1268,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest) { 11.9165f, 13.2266f, 14.5278f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_exclude_outside) { @@ -1292,7 +1298,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_exclude_outside) { 11.949f, 13.2503f, 14.5942f}; test.AddOutput("Y", {static_cast(H * scales[0]), static_cast(W * scales[1])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_coeff) { @@ -1319,7 +1325,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_coeff) { 11.8701f, 13.168f, 14.4912f}; test.AddOutput("Y", {N, C, 
static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_with_roi) { @@ -1373,7 +1379,7 @@ TEST(ResizeOpTest, ResizeOpCubicDownSampleTest_asymmetric) { 11.375f, 12.6719f, 13.9688f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicUpSampleTest) { @@ -1405,7 +1411,7 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest) { 13.375f, 13.7813f, 14.375f, 14.875f, 15.375f, 15.9688f, 16.375f, 16.4688f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_MultiChannel) { @@ -1486,7 +1492,7 @@ TEST(ResizeOpTest, ResizeOpCubicUpSampleTest_tf_half_pixel_for_nn) { 13.332f, 13.8086f, 14.4375f, 14.8438f, 15.4727f, 15.9492f, 16.2461f, 16.1758f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) { @@ -1512,7 +1518,10 @@ TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_4DBilinear_Ver10) { std::vector Y = {1.0f, 2.66666651f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); // QNN: result diff + // QNN: result diff + // TRT: segmentation fault in A100 + std::unordered_set excluded_providers({kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); } TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) { @@ -1538,7 +1547,7 @@ 
TEST(ResizeOpTest, ResizeOpLinearDownSampleTest_2DBilinear_Ver10) { std::vector Y = {1.0f, 2.66666651f}; test.AddOutput("Y", {static_cast(H * scales[0]), static_cast(W * scales[1])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) { @@ -1574,7 +1583,10 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_4DBilinear_Ver10) { 7.0f, 8.0f, 9.0f, 10.0f, 11.0f, 11.0f, 11.0f, 11.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kQnnExecutionProvider}); // QNN: result diff + // QNN: result diff + // TRT: segmentation fault in A100 + std::unordered_set excluded_providers({kQnnExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100(excluded_providers)); } TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) { @@ -1602,7 +1614,7 @@ TEST(ResizeOpTest, ResizeOpLinearUpSampleTest_2DBilinear_Ver10) { 4.0f, 5.0f, 6.0f, 7.0f, 8.0f, 8.0f, 8.0f, 8.0f}; test.AddOutput("Y", {static_cast(H * scales[0]), static_cast(W * scales[1])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest_Ver10) { @@ -1627,7 +1639,7 @@ TEST(ResizeOpTest, ResizeOpLinearScalesNoOpTest_Ver10) { 7.0f, 11.0f}; test.AddOutput("Y", {N, C, H, W}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Ver10) { @@ -1647,7 +1659,7 @@ TEST(ResizeOpTest, ResizeOpNearestDownSampleTest_Ver10) { std::vector Y = {1.0f, 3.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpNearestUpSampleTest_Ver10) { @@ -1668,10 +1680,10 @@ 
TEST(ResizeOpTest, ResizeOpNearestUpSampleTest_Ver10) { 3.0f, 3.0f, 3.0f, 4.0f, 4.0f, 4.0f}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } -TEST(UpsampleOpTest, ResizeOpNearestNoScaleTest_Ver10) { +TEST(ResizeOpTest, ResizeOpNearestNoScaleTest_Ver10) { OpTester test("Resize", 10); std::vector scales{1.0f, 1.0f, 1.0f, 1.0f}; @@ -1686,7 +1698,7 @@ TEST(UpsampleOpTest, ResizeOpNearestNoScaleTest_Ver10) { std::vector Y = {1.0f, 2.0f, 3.0f, 4.0f}; test.AddOutput("Y", {N, C, H, W}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOp_MissingRoiAndMissingScalesOptionalInputs) { @@ -1737,7 +1749,7 @@ void ResizeOpTypeCheck_Ver_10() { 3, 3, 3, 4, 4, 4}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpTypeCheck_Ver_10) { @@ -1768,7 +1780,7 @@ void ResizeOpTypeCheck_Ver_11_13_18(int opset_version) { 3, 3, 3, 4, 4, 4}; test.AddOutput("Y", {N, C, static_cast(H * scales[2]), static_cast(W * scales[3])}, Y); - test.Run(); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(ResizeOpTest, ResizeOpTypeCheck_Ver11) { diff --git a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc index 188532cfa350a..3ac8053aef95e 100644 --- a/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/upsample_op_test.cc @@ -4,6 +4,7 @@ #include "gtest/gtest.h" #include "test/providers/provider_test_utils.h" #include "test/util/include/default_providers.h" +#include "test/common/trt_op_test_utils.h" namespace onnxruntime { namespace test { @@ -939,7 +940,9 @@ TEST(UpsampleOpTest, 
UpsampleOpNearest2XTest_opset9) { 7, 7, 9, 9}; test.AddOutput("Y", {N, C, (int64_t)(H * scales[2]), (int64_t)(W * scales[3])}, Y); - test.Run(); + + // TRT: segmentation fault in A100 + test.Run(OpTester::ExpectResult::kExpectSuccess, "", ExcludeTrtOnA100()); } TEST(UpsampleOpTest, NhwcUpsampleOpNearest2XTest_opset9) { From 01c7aaf6aa75c88a0fd7e9aacc13ebeb958674aa Mon Sep 17 00:00:00 2001 From: Yulong Wang <7679871+fs-eire@users.noreply.github.com> Date: Tue, 19 Mar 2024 12:55:00 -0700 Subject: [PATCH 30/55] [js/webgpu] allow setting env.webgpu.adapter (#19940) ### Description Allow user to set `env.webgpu.adapter` before creating the first inference session. Feature request: https://github.com/microsoft/onnxruntime/pull/19857#issuecomment-1999984753 @xenova --- js/common/lib/env.ts | 10 +++++--- js/web/lib/wasm/jsep/backend-webgpu.ts | 6 +++-- js/web/lib/wasm/wasm-core-impl.ts | 35 ++++++++++++++++++-------- 3 files changed, 35 insertions(+), 16 deletions(-) diff --git a/js/common/lib/env.ts b/js/common/lib/env.ts index b139c719e863f..c8df1613b3268 100644 --- a/js/common/lib/env.ts +++ b/js/common/lib/env.ts @@ -166,16 +166,20 @@ export declare namespace Env { */ forceFallbackAdapter?: boolean; /** - * Get the adapter for WebGPU. + * Set or get the adapter for WebGPU. * - * This property is only available after the first WebGPU inference session is created. + * Setting this property only has effect before the first WebGPU inference session is created. The value will be + * used as the GPU adapter for the underlying WebGPU backend to create GPU device. + * + * If this property is not set, it will be available to get after the first WebGPU inference session is created. The + * value will be the GPU adapter that created by the underlying WebGPU backend. * * When use with TypeScript, the type of this property is `GPUAdapter` defined in "@webgpu/types". * Use `const adapter = env.webgpu.adapter as GPUAdapter;` in TypeScript to access this property with correct type. 
* * see comments on {@link Tensor.GpuBufferType} */ - readonly adapter: unknown; + adapter: unknown; /** * Get the device for WebGPU. * diff --git a/js/web/lib/wasm/jsep/backend-webgpu.ts b/js/web/lib/wasm/jsep/backend-webgpu.ts index d92b8ac68dbe7..b36dc73330d46 100644 --- a/js/web/lib/wasm/jsep/backend-webgpu.ts +++ b/js/web/lib/wasm/jsep/backend-webgpu.ts @@ -252,8 +252,10 @@ export class WebGpuBackend { } }; - Object.defineProperty(this.env.webgpu, 'device', {value: this.device}); - Object.defineProperty(this.env.webgpu, 'adapter', {value: adapter}); + Object.defineProperty( + this.env.webgpu, 'device', {value: this.device, writable: false, enumerable: true, configurable: false}); + Object.defineProperty( + this.env.webgpu, 'adapter', {value: adapter, writable: false, enumerable: true, configurable: false}); // init queryType, which is necessary for InferenceSession.create this.setQueryType(); diff --git a/js/web/lib/wasm/wasm-core-impl.ts b/js/web/lib/wasm/wasm-core-impl.ts index 7019758be0efd..9b27051f1b9fe 100644 --- a/js/web/lib/wasm/wasm-core-impl.ts +++ b/js/web/lib/wasm/wasm-core-impl.ts @@ -93,18 +93,31 @@ export const initEp = async(env: Env, epName: string): Promise => { if (typeof navigator === 'undefined' || !navigator.gpu) { throw new Error('WebGPU is not supported in current environment'); } - const powerPreference = env.webgpu?.powerPreference; - if (powerPreference !== undefined && powerPreference !== 'low-power' && powerPreference !== 'high-performance') { - throw new Error(`Invalid powerPreference setting: "${powerPreference}"`); - } - const forceFallbackAdapter = env.webgpu?.forceFallbackAdapter; - if (forceFallbackAdapter !== undefined && typeof forceFallbackAdapter !== 'boolean') { - throw new Error(`Invalid forceFallbackAdapter setting: "${forceFallbackAdapter}"`); - } - const adapter = await navigator.gpu.requestAdapter({powerPreference, forceFallbackAdapter}); + + let adapter = env.webgpu.adapter as GPUAdapter | null; if (!adapter) { - 
throw new Error( - 'Failed to get GPU adapter. You may need to enable flag "--enable-unsafe-webgpu" if you are using Chrome.'); + // if adapter is not set, request a new adapter. + const powerPreference = env.webgpu.powerPreference; + if (powerPreference !== undefined && powerPreference !== 'low-power' && + powerPreference !== 'high-performance') { + throw new Error(`Invalid powerPreference setting: "${powerPreference}"`); + } + const forceFallbackAdapter = env.webgpu.forceFallbackAdapter; + if (forceFallbackAdapter !== undefined && typeof forceFallbackAdapter !== 'boolean') { + throw new Error(`Invalid forceFallbackAdapter setting: "${forceFallbackAdapter}"`); + } + adapter = await navigator.gpu.requestAdapter({powerPreference, forceFallbackAdapter}); + if (!adapter) { + throw new Error( + 'Failed to get GPU adapter. ' + + 'You may need to enable flag "--enable-unsafe-webgpu" if you are using Chrome.'); + } + } else { + // if adapter is set, validate it. + if (typeof adapter.limits !== 'object' || typeof adapter.features !== 'object' || + typeof adapter.requestDevice !== 'function') { + throw new Error('Invalid GPU adapter set in `env.webgpu.adapter`. It must be a GPUAdapter object.'); + } } if (!env.wasm.simd) { From 18a7f34ba052d183a254dcdcc9a939790e8c73e0 Mon Sep 17 00:00:00 2001 From: Adrian Lizarraga Date: Tue, 19 Mar 2024 13:48:04 -0700 Subject: [PATCH 31/55] [NhwcTransformerTests] Fix linker error due to explicit template instantiation of ModelBuilder methods (#19980) Currently, the nhwc_transformer_test.cc compilation unit defines explicit FP16 versions of `ModelTestBuilder::MakeInput` and `ModelTestBuilder::MakeInitializer` outside of the ModelTestBuilder class's header file. These explicit template instantiations cause linker errors when other compilation units also instantiate these functions due to duplicate definitions. 
Additionally, the versions defined in nhwc_transformer_test.cc do not really conform to the expected behavior in the original ModelTestBuilder class, which is to make random input/initializer values. Instead, the versions in nhwc_transformer_test.cc create a range of values. The solution is to edit nhwc_transformer_test.cc to use stand-alone static functions that do not change the ModelTestBuilder class. **Note**: This linker error cannot currently be replicated in our CIs because it requires a QNN-HTP-enabled Windows ARM64 environment with `MLAS_F16VEC_INTRINSICS_SUPPORTED` defined. I can replicate on a local build. The linker error/conflict happens with this new FP16 QNN test: https://github.com/microsoft/onnxruntime/blob/d4c8bc359e321cdabdd87b70b392dd0e7a14502e/onnxruntime/test/providers/qnn/clip_op_test.cc#L186 --- .../test/optimizer/nhwc_transformer_test.cc | 34 +++++++++---------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/onnxruntime/test/optimizer/nhwc_transformer_test.cc b/onnxruntime/test/optimizer/nhwc_transformer_test.cc index c254d340cdcb8..e6f0a259805e5 100644 --- a/onnxruntime/test/optimizer/nhwc_transformer_test.cc +++ b/onnxruntime/test/optimizer/nhwc_transformer_test.cc @@ -518,7 +518,7 @@ TEST(NhwcTransformerTests, ConvMixTensorRanks) { #ifdef MLAS_F16VEC_INTRINSICS_SUPPORTED -std::vector randomfp16(const std::vector& shape, MLFloat16 min, MLFloat16 max) { +static std::vector ARangeOfFP16Values(const std::vector& shape, MLFloat16 min, MLFloat16 max) { std::vector val(detail::SizeFromDims(shape)); float start = min.ToFloat(); float end = max.ToFloat(); @@ -534,22 +534,22 @@ std::vector randomfp16(const std::vector& shape, MLFloat16 m return val; } -template <> -NodeArg* ModelTestBuilder::MakeInput(const std::vector& shape, MLFloat16 min, MLFloat16 max) { - return MakeInput(shape, randomfp16(shape, min, max)); +static NodeArg* MakeInputARangeFP16(ModelTestBuilder& builder, const std::vector& shape, + MLFloat16 min,
MLFloat16 max) { + return builder.MakeInput(shape, ARangeOfFP16Values(shape, min, max)); } -template <> -NodeArg* ModelTestBuilder::MakeInitializer(const std::vector& shape, MLFloat16 min, MLFloat16 max) { - return MakeInitializer(shape, randomfp16(shape, min, max)); +static NodeArg* MakeInitializerARangeFP16(ModelTestBuilder& builder, const std::vector& shape, + MLFloat16 min, MLFloat16 max) { + return builder.MakeInitializer(shape, ARangeOfFP16Values(shape, min, max)); } TEST(NhwcTransformerTests, ConvFp16) { auto test_case = [&](const std::vector& input_shape, const std::vector& weights_shape) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput(input_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* input_arg = MakeInputARangeFP16(builder, input_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); auto* output_arg = builder.MakeOutput(); - auto* weight_arg = builder.MakeInitializer(weights_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* weight_arg = MakeInitializerARangeFP16(builder, weights_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); builder.AddConvNode(input_arg, weight_arg, output_arg); }; @@ -575,10 +575,10 @@ TEST(NhwcTransformerTests, ConvFp16) { TEST(NhwcTransformerTests, ConvMaxPoolFp16) { auto test_case = [&](const std::vector& input_shape, const std::vector& weights_shape) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput(input_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* input_arg = MakeInputARangeFP16(builder, input_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); auto* conv_output_arg = builder.MakeIntermediate(); auto* output_arg = builder.MakeOutput(); - auto* conv_weight_arg = builder.MakeInitializer(weights_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* conv_weight_arg = MakeInitializerARangeFP16(builder, weights_shape, MLFloat16(-1.5f), MLFloat16(1.5f)); builder.AddConvNode(input_arg, conv_weight_arg, conv_output_arg); Node& pool_node = 
builder.AddNode("MaxPool", {conv_output_arg}, {output_arg}); @@ -609,13 +609,13 @@ TEST(NhwcTransformerTests, ConvMaxPoolFp16) { TEST(NhwcTransformerTests, ConvGlobalAveragePoolFp16) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput({1, 23, 13, 13}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* input_arg = MakeInputARangeFP16(builder, {1, 23, 13, 13}, MLFloat16(-1.5f), MLFloat16(1.5f)); auto* conv1_output_arg = builder.MakeIntermediate(); auto* conv2_output_arg = builder.MakeIntermediate(); auto* gavgpool1_output_arg = builder.MakeIntermediate(); auto* output_arg = builder.MakeOutput(); - auto* conv1_weight_arg = builder.MakeInitializer({30, 23, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); - auto* conv2_weight_arg = builder.MakeInitializer({16, 30, 1, 1}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* conv1_weight_arg = MakeInitializerARangeFP16(builder, {30, 23, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* conv2_weight_arg = MakeInitializerARangeFP16(builder, {16, 30, 1, 1}, MLFloat16(-1.5f), MLFloat16(1.5f)); Node& conv1_node = builder.AddConvNode(input_arg, conv1_weight_arg, conv1_output_arg); conv1_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); @@ -640,13 +640,13 @@ TEST(NhwcTransformerTests, ConvGlobalAveragePoolFp16) { TEST(NhwcTransformerTests, ConvAveragePoolFp16) { auto build_test_case = [&](ModelTestBuilder& builder) { - auto* input_arg = builder.MakeInput({1, 23, 13, 13}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* input_arg = MakeInputARangeFP16(builder, {1, 23, 13, 13}, MLFloat16(-1.5f), MLFloat16(1.5f)); auto* conv1_output_arg = builder.MakeIntermediate(); auto* conv2_output_arg = builder.MakeIntermediate(); auto* avgpool1_output_arg = builder.MakeIntermediate(); auto* output_arg = builder.MakeOutput(); - auto* conv1_weight_arg = builder.MakeInitializer({30, 23, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); - auto* conv2_weight_arg = builder.MakeInitializer({16, 30, 3, 3}, MLFloat16(-1.5f), 
MLFloat16(1.5f)); + auto* conv1_weight_arg = MakeInitializerARangeFP16(builder, {30, 23, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); + auto* conv2_weight_arg = MakeInitializerARangeFP16(builder, {16, 30, 3, 3}, MLFloat16(-1.5f), MLFloat16(1.5f)); Node& conv1_node = builder.AddConvNode(input_arg, conv1_weight_arg, conv1_output_arg); conv1_node.AddAttribute("pads", std::vector{1, 1, 1, 1}); From cd6ec50b50f25ff46e71978db53050fedeceee86 Mon Sep 17 00:00:00 2001 From: Hariharan Seshadri Date: Tue, 19 Mar 2024 14:54:58 -0700 Subject: [PATCH 32/55] Switch a portion of CI/packaging jobs to MacOS12 (#19908) --- onnxruntime/test/framework/inference_session_test.cc | 6 ++++++ .../github/azure-pipelines/mac-coreml-ci-pipeline.yml | 4 +++- .../ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml | 4 ++-- .../github/azure-pipelines/mac-ios-packaging-pipeline.yml | 2 +- .../templates/mac-cpu-packaging-pipeline.yml | 2 +- .../azure-pipelines/templates/mac-cpu-packing-jobs.yml | 4 +++- .../templates/stages/mac-ios-packaging-build-stage.yml | 4 ++-- 7 files changed, 18 insertions(+), 8 deletions(-) diff --git a/onnxruntime/test/framework/inference_session_test.cc b/onnxruntime/test/framework/inference_session_test.cc index 60effda9ec772..d0520ebbcba5a 100644 --- a/onnxruntime/test/framework/inference_session_test.cc +++ b/onnxruntime/test/framework/inference_session_test.cc @@ -2944,6 +2944,11 @@ TEST(InferenceSessionTests, GlobalThreadPoolWithDenormalAsZero) { } // test inter thread pool with setting denormal as zero +#if !defined(__APPLE__) +// TODO (hasesh): Debug this test failure on MacOS 12 with XCode 14.2 +// It seemingly passes on MacOS 13 with XCode 15.x but we had to drop down to Mac OS 12 +// because at the time of writing this, Mac OS 13 images were making CI/Packaging pipelines +// very unstable. 
TEST(InferenceSessionTests, InterThreadPoolWithDenormalAsZero) { if constexpr (!SessionOptions::DEFAULT_USE_PER_SESSION_THREADS) { GTEST_SKIP() << "Skipping the test"; @@ -3001,6 +3006,7 @@ TEST(InferenceSessionTests, InterThreadPoolWithDenormalAsZero) { VerifyThreadPoolWithDenormalAsZero(session2.GetIntraOpThreadPoolToUse(), false); VerifyThreadPoolWithDenormalAsZero(session2.GetInterOpThreadPoolToUse(), false); } +#endif } // namespace test } // namespace onnxruntime diff --git a/tools/ci_build/github/azure-pipelines/mac-coreml-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-coreml-ci-pipeline.yml index a3f56f5c448a9..f0a35d809c700 100644 --- a/tools/ci_build/github/azure-pipelines/mac-coreml-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-coreml-ci-pipeline.yml @@ -32,7 +32,7 @@ jobs: workspace: clean: all pool: - vmImage: 'macOS-13' + vmImage: 'macOS-latest' variables: MACOSX_DEPLOYMENT_TARGET: '11.0' TODAY: $[format('{0:dd}{0:MM}{0:yyyy}', pipeline.startTime)] @@ -43,6 +43,8 @@ jobs: displayName: Install coreutils and ninja - template: templates/use-xcode-version.yml + parameters: + xcodeVersion: 14.2 - template: templates/mac-build-step-with-cache.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml index a1ca68c8279e7..255531681b039 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-ci-pipeline.yml @@ -30,7 +30,7 @@ pr: jobs: - job: iOS_CI_on_Mac pool: - vmImage: 'macOS-13' + vmImage: 'macOS-latest' variables: PROTO_CACHE_DIR: $(Pipeline.Workspace)/proto_ccache ORT_CACHE_DIR: $(Pipeline.Workspace)/ort_ccache @@ -39,7 +39,7 @@ jobs: steps: - template: templates/use-xcode-version.yml parameters: - xcodeVersion: 14.3 + xcodeVersion: 14.2 - template: templates/mac-build-step-with-cache.yml parameters: WithCache: true diff --git 
a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml index 5fd15b64e03b6..881023e1c1186 100644 --- a/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/mac-ios-packaging-pipeline.yml @@ -53,7 +53,7 @@ stages: displayName: "Set common variables" pool: - vmImage: "macOS-13" + vmImage: "macOS-latest" timeoutInMinutes: 5 diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml index 080079388a76c..945fbb7c4a094 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packaging-pipeline.yml @@ -71,7 +71,7 @@ stages: ${{ if eq(parameters.DoESRP, true)}}: vmImage: 'macOS-12' ${{ else }}: - vmImage: 'macOS-13' + vmImage: 'macOS-latest' steps: - checkout: none - template: flex-downloadPipelineArtifact.yml diff --git a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml index fd2113502478a..9e192716c3ffd 100644 --- a/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml +++ b/tools/ci_build/github/azure-pipelines/templates/mac-cpu-packing-jobs.yml @@ -37,7 +37,7 @@ jobs: PROTO_CACHE_DIR: $(Pipeline.Workspace)/ccache_proto ORT_CACHE_DIR: $(Pipeline.Workspace)/ccache_ort pool: - vmImage: 'macOS-13' + vmImage: 'macOS-latest' timeoutInMinutes: 300 steps: - checkout: self @@ -55,6 +55,8 @@ jobs: - template: set-version-number-variables-step.yml - template: use-xcode-version.yml + parameters: + xcodeVersion: 14.2 - template: mac-build-step-with-cache.yml parameters: diff --git a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml 
b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml index ed32c5d0e15be..b1cdb498bb4ae 100644 --- a/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml +++ b/tools/ci_build/github/azure-pipelines/templates/stages/mac-ios-packaging-build-stage.yml @@ -16,10 +16,10 @@ stages: displayName: "Build iOS package for variant: ${{ parameters.packageVariant}}" pool: - vmImage: "macOS-13" + vmImage: "macOS-latest" variables: - xcodeVersion: "14.3" + xcodeVersion: "14.2" ortPodVersion: $[stageDependencies.IosPackaging_SetCommonVariables.j.outputs['SetCommonVariables.ORT_POD_VERSION']] ${{ if eq(parameters.packageVariant, 'Mobile') }}: From 597e828aaea52d0bcf995858ee29ff0f41488c20 Mon Sep 17 00:00:00 2001 From: Tianlei Wu Date: Tue, 19 Mar 2024 15:50:13 -0700 Subject: [PATCH 33/55] Adjust test tolerance (#19947) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description Improve the precision of tests. Changes include: (1) Update checkers.cc to use consistent default tolerance. (2) Allow different default tolerances for different providers at runtime (Previously, threshold of a test is decided during compiling). (3) Explicitly set absolute and relative error tolerances for tests that failed to pass new default threshold. 
#### Default Thresholds Change Note that the formula of testing is `abs(expected - value) < absolute + relative * expected` Default test thresholds when both absolute and relative tolerance are not set: type | provider | absolute (before) | absolute (after) | relative (before) | relative (after) -- | -- | -- | -- | -- | -- double | CPU | 0.001 | 0.00001 | 0 | 0.00001 double | CUDA | 0.005 | 0.00001 | 0 | 0.00001 double | TRT | 0.005 | 0.00001 | 0 | 0.00001 double | ROCM | 0.005 | 0.00001 | 0 | 0.00001 double | DML | 0.005 | 0.00001 | 0 | 0.00001   |   |   |   |   |   float | CPU | 0.0001 | 0.00001 | 0 | 0.0001 float | CUDA | 0.005 | 0.00001 | 0 | 0.0001 float | TRT | 0.005 | 0.00001 | 0 | 0.0001 float | ROCM | 0.005 | 0.00001 | 0 | 0.0001 float | DML | 0.005 | 0.00001 | 0 | 0.0001 float | Training* | 0.005 | 0.001 | 0 | 0.0001   |   |   |   |   |   half | CPU | 0.001 | 0.0025 | 0 | 0.001 half | CUDA | 0.005 | 0.0025 | 0 | 0.001 half | TRT | 0.005 | 0.0025 | 0 | 0.001 half | ROCM | 0.005 | 0.0025 | 0 | 0.001 half | DML | 0.02 | 0.005 | 0 | 0.001 half | Training* | 0.005 | 0.005 | 0 | 0.001   |   |   |   |   |   bfloat16 | CPU | 0.0001 | 0.02 | 0 | 0.01 bfloat16 | CUDA | 0.0001 | 0.02 | 0.05 | 0.01 bfloat16 | TRT | 0.0001 | 0.02 | 0.05 | 0.01 bfloat16 | ROCM | 0.0001 | 0.02 | 0.05 | 0.01 bfloat16 | DML | 0.0001 | 0.02 | 0.05 | 0.01 bfloat16 | Training* | 0.0001 | 0.02 | 0.05 | 0.01 *Training mean a build flag ENABLE_TRAINING_CORE is defined. The provider can be any one. #### Threshold for provider Previously, the threshold might change according to build flags: ``` #if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) constexpr float threshold = 0.005f; #else constexpr float threshold = 0.0001f; #endif ``` For a cpu only build, the threshold is 0.0001. For a cuda build, the threshold for CPU provider (some tests in cuda build actually run with CPU provider) is changed to 0.005. 
After this change, the threshold only depends on data type and provider used in the test. It will not change by build flags for non-training builds. Default thresholds for training might be different from inference (please refer to the above table). There are a few factors there: Training has gradient outputs; TF32 is not disabled in training; Some training tests have iterations, and error might accumulate. How to set different thresholds based on these factors could be a future task. --- .../test/contrib_ops/attention_op_test.cc | 9 ++ .../contrib_ops/decoder_attention_op_test.cc | 7 +- ...oder_masked_multihead_attention_op_test.cc | 6 +- onnxruntime/test/contrib_ops/fft_op_test.cc | 2 + .../test/contrib_ops/gemm_fastgelu_op_test.cc | 6 +- .../test/contrib_ops/gridsample_test.cc | 1 + .../test/contrib_ops/layer_norm_op_test.cc | 6 + .../matmul_integer_to_float_test.cc | 2 +- onnxruntime/test/contrib_ops/moe_test.cc | 2 + .../packed_multihead_attention_op_test.cc | 2 + .../contrib_ops/quantize_attention_op_test.cc | 2 + onnxruntime/test/providers/base_tester.cc | 14 +++ onnxruntime/test/providers/base_tester.h | 11 ++ onnxruntime/test/providers/checkers.cc | 117 ++++++++++-------- .../cpu/activation/activation_op_test.h | 5 + .../cpu/math/element_wise_ops_test.cc | 9 +- .../providers/cpu/math/logsoftmax_test.cc | 9 +- .../providers/cpu/nn/batch_norm_op_test.cc | 9 +- .../test/providers/cpu/nn/pool_op_test.cc | 1 + .../cpu/object_detection/roialign_test.cc | 2 + .../cpu/rnn/deep_cpu_lstm_op_test.cc | 2 + .../providers/cpu/tensor/affine_grid_test.cc | 17 +++ .../mean_variance_normalization_test.cc | 5 + .../test/gradient/optimizer_ops_test.cc | 19 +++ .../cpu/nn/batchnorm_internal_test.cc | 2 + .../cuda/batch_norm_internal_test.cc | 1 + 26 files changed, 204 insertions(+), 64 deletions(-) diff --git a/onnxruntime/test/contrib_ops/attention_op_test.cc b/onnxruntime/test/contrib_ops/attention_op_test.cc index 7fe70fd2d6f09..a8e2fccdd0462 100644 ---
a/onnxruntime/test/contrib_ops/attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/attention_op_test.cc @@ -227,6 +227,12 @@ static void RunAttentionTest( tester.AddOptionalInputEdge(); } + if (use_float16) { + tester.SetOutputTolerance(0.005f); + } else { + tester.SetOutputTolerance(0.001f, 0.001f); + } + if (enable_cuda) { std::vector> execution_providers; execution_providers.push_back(DefaultCudaExecutionProvider()); @@ -254,6 +260,9 @@ static void RunAttentionTest( if (enable_dml) { std::vector> execution_providers; execution_providers.push_back(DefaultDmlExecutionProvider()); + if (use_float16) { + tester.SetOutputTolerance(0.02f); + } tester.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } } diff --git a/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc index 88a2bdf6a4849..8a37ef921fd2b 100644 --- a/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/decoder_attention_op_test.cc @@ -31,10 +31,8 @@ static void RunAttentionTest( const std::vector* new_value_cache = nullptr, const std::vector* key_cache = nullptr, const std::vector* value_cache = nullptr, - const std::initializer_list* key_padding_mask_data = nullptr, - bool use_float16 = false) { - int min_cuda_architecture = use_float16 ? 
530 : 0; - bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); + const std::initializer_list* key_padding_mask_data = nullptr) { + bool enable_cuda = HasCudaEnvironment(0); bool enable_rocm = (nullptr != DefaultRocmExecutionProvider().get()); bool enable_cpu = false; @@ -99,6 +97,7 @@ static void RunAttentionTest( tester.AddOutput("new_key_cache", output_cache_dims, *new_key_cache); tester.AddOutput("new_value_cache", output_cache_dims, *new_value_cache); } + tester.SetOutputTolerance(0.001f, 0.001f); std::vector> execution_providers; if (enable_cuda) { diff --git a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc index acaae2dcd9712..17c9e8592f64e 100644 --- a/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/decoder_masked_multihead_attention_op_test.cc @@ -754,9 +754,10 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp32) { // Output(s) tester.AddOutput("output", input_dims, output); - tester.AddOutput("present", past_dims, present); + tester.SetOutputTolerance(0.001f, 0.001f); + // Run - Regular kernel execution path { std::vector> execution_providers; @@ -897,9 +898,10 @@ TEST(DecoderMaskedSelfAttentionTest, Test_fp16) { // Output(s) tester.AddOutput("output", input_dims, output); - tester.AddOutput("present", past_dims, present); + tester.SetOutputTolerance(0.005f); + // Run - Regular kernel execution path { std::vector> execution_providers; diff --git a/onnxruntime/test/contrib_ops/fft_op_test.cc b/onnxruntime/test/contrib_ops/fft_op_test.cc index 56a6466c760f6..7a6b6cca6425a 100644 --- a/onnxruntime/test/contrib_ops/fft_op_test.cc +++ b/onnxruntime/test/contrib_ops/fft_op_test.cc @@ -25,6 +25,7 @@ TEST(ContribOpTest, Rfft) { // Target values conputed using PyTorch torch.fft.rfft(X, dim=-1, norm="backward") test.AddInput("X", {4, 4}, {0.8129f, 1.3108f, -0.8790f, -1.2046f, 0.1661f, -0.9831f, 
0.5879f, 0.4918f, 1.2506f, 0.7244f, -2.6260f, -1.1268f, -1.6885f, 1.0439f, -0.2595f, 1.8780f}); test.AddOutput("Y", {4, 3, 2}, {0.0400f, 0.0000f, 1.6919f, -2.5154f, -0.1722f, 0.0000f, 0.2627f, 0.0000f, -0.4218f, 1.4748f, 1.2454f, 0.0000f, -1.7779f, 0.0000f, 3.8766f, -1.8512f, -0.9730f, 0.0000f, 0.9740f, 0.0000f, -1.4290f, 0.8341f, -4.8699f, 0.0000f}); + test.SetOutputTolerance(0.0001f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } @@ -45,6 +46,7 @@ TEST(ContribOpTest, Irfft) { test.AddAttribute("normalized", static_cast(0)); test.AddInput("X", {4, 3, 2}, {0.0400f, 0.0000f, 1.6919f, -2.5154f, -0.1722f, 0.0000f, 0.2627f, 0.0000f, -0.4218f, 1.4748f, 1.2454f, 0.0000f, -1.7779f, 0.0000f, 3.8766f, -1.8512f, -0.9730f, 0.0000f, 0.9740f, 0.0000f, -1.4290f, 0.8341f, -4.8699f, 0.0000f}); test.AddOutput("Y", {4, 4}, {0.8129f, 1.3108f, -0.8790f, -1.2046f, 0.1661f, -0.9831f, 0.5879f, 0.4918f, 1.2506f, 0.7244f, -2.6260f, -1.1268f, -1.6885f, 1.0439f, -0.2595f, 1.8780f}); + test.SetOutputTolerance(0.0001f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); } } // namespace test diff --git a/onnxruntime/test/contrib_ops/gemm_fastgelu_op_test.cc b/onnxruntime/test/contrib_ops/gemm_fastgelu_op_test.cc index a24f3b6b441e1..d9d2681dd3b3f 100644 --- a/onnxruntime/test/contrib_ops/gemm_fastgelu_op_test.cc +++ b/onnxruntime/test/contrib_ops/gemm_fastgelu_op_test.cc @@ -50,6 +50,8 @@ static void RunGemmFastGeluGpuTest(const std::vector& input_data, const s tester.AddOutput("Y", output_dims, output_data); } + tester.SetOutputTolerance(use_float16 ? 
0.005f : 0.0025f); + tester.Config(run_with_tunable_op) .RunWithConfig(); } @@ -154,7 +156,7 @@ TEST(GemmFastGeluTest, GemmFastGeluWithoutBiasFloat16) { RunGemmFastGeluGpuTest(input_data, weight_data, bias_data, output_data, input_dims, weight_dims, bias_dims, output_dims, - false); + false, true); } TEST(GemmFastGeluTest, GemmFastGeluWithBiasFloat16) { @@ -189,7 +191,7 @@ TEST(GemmFastGeluTest, GemmFastGeluWithBiasFloat16) { RunGemmFastGeluGpuTest(input_data, weight_data, bias_data, output_data, input_dims, weight_dims, bias_dims, output_dims, - true); + true, true); } TEST(GemmFastGeluTest, GemmFastGeluWithBias_bfloat16) { diff --git a/onnxruntime/test/contrib_ops/gridsample_test.cc b/onnxruntime/test/contrib_ops/gridsample_test.cc index 46ed04301a9e8..d970178e29ab8 100644 --- a/onnxruntime/test/contrib_ops/gridsample_test.cc +++ b/onnxruntime/test/contrib_ops/gridsample_test.cc @@ -126,6 +126,7 @@ TEST(GridsampleContribOpTest, gridsample_mode_bicubic) { 0.5000f, 0.5000f, 1.0000f, 1.0000f}); test.AddAttribute("mode", "bicubic"); test.AddOutput("Y", {1, 1, 2, 4}, {-0.1406f, 0.3828f, 1.7556f, 2.9688f, 2.9688f, 1.7556f, 5.1445f, 1.3906f}); + test.SetOutputTolerance(0.0001f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); } diff --git a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc index 98fb62e435f31..655c4951f262d 100644 --- a/onnxruntime/test/contrib_ops/layer_norm_op_test.cc +++ b/onnxruntime/test/contrib_ops/layer_norm_op_test.cc @@ -160,6 +160,7 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias) { test.AddInput("gamma", {2}, {-0.6953f, 5.1824f}); test.AddInput("bias", {2}, {0.6435f, -0.3964f}); test.AddOutput("output", dims, {-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -172,6 +173,8 @@ TEST(LayerNormTest, LayerNorm_Scale_Bias_Float16Input) { test.AddInput("gamma", {2}, {-0.6953f, 5.1824f}); 
test.AddInput("bias", {2}, {0.6435f, -0.3964f}); test.AddOutput("output", dims, {-0.0516f, -5.5776f, -0.0518f, -5.5788f, -0.0518f, -5.5788f}); + test.SetOutputTolerance(0.0001f); + // TRT, DNNL, OpenVINO and NNAPI, CoreML don't support this combination of datatypes test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kDnnlExecutionProvider, kQnnExecutionProvider, @@ -228,6 +231,9 @@ TEST(LayerNormTest, LayerNorm17_double) { test.AddInput("x", dims, {1.0, 2.0, 3.0, 4.0, 5.0, 6.0}); test.AddInput("gamma", {3}, {1.0, 1.0, 1.0}); test.AddOutput("output", dims, {-1.2247, 0.0, 1.2247, -1.2247, 0.0, 1.2247}); + + test.SetOutputTolerance(0.0001f); + // DNNL does not support double test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kDnnlExecutionProvider}); } diff --git a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc index 72a5ba4dcefbf..8d7629b5fda1c 100644 --- a/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc +++ b/onnxruntime/test/contrib_ops/matmul_integer_to_float_test.cc @@ -127,7 +127,7 @@ void TestMatMulIntegerToFloat(bool is_matrix_b_constant, if (std::is_same_v) { test.AddOutput("Y", {M, N}, Y_data); - test.SetOutputAbsErr("Y", 0.0001f); + test.SetOutputAbsErr("Y", 0.001f); test.SetOutputRelErr("Y", 0.02f); } else { test.AddOutput("Y", {M, N}, ToFloat16(Y_data)); diff --git a/onnxruntime/test/contrib_ops/moe_test.cc b/onnxruntime/test/contrib_ops/moe_test.cc index ebb0261deefa5..e88ef7794cd07 100644 --- a/onnxruntime/test/contrib_ops/moe_test.cc +++ b/onnxruntime/test/contrib_ops/moe_test.cc @@ -47,6 +47,7 @@ static void RunMoETest( tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, ToFloat16(fc1_experts_bias)); tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, ToFloat16(fc2_experts_bias)); tester.AddOutput("output", output_dims, ToFloat16(output_data)); + tester.SetOutputTolerance(0.005f); } else { tester.AddInput("input", 
input_dims, input); tester.AddInput("router_probs", router_probs_dims, router_probs); @@ -55,6 +56,7 @@ static void RunMoETest( tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, fc1_experts_bias); tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, fc2_experts_bias); tester.AddOutput("output", output_dims, output_data); + tester.SetOutputTolerance(0.001f); } std::vector> execution_providers; diff --git a/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc b/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc index 22253955566f2..5f811c8cf35f6 100644 --- a/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/packed_multihead_attention_op_test.cc @@ -107,6 +107,7 @@ static void RunPackedMultiHeadAttentionTest( } tester.AddOutput("output", output_dims, ToFloat16(output_data)); + tester.SetOutputTolerance(0.005f); } else { if (is_packed_qkv) { tester.AddInput("query", packed_qkv_dims, query_data); @@ -131,6 +132,7 @@ static void RunPackedMultiHeadAttentionTest( } tester.AddOutput("output", output_dims, output_data); + tester.SetOutputTolerance(0.001f, 0.001f); } std::vector> execution_providers; diff --git a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc index fd222583ac67f..54dd831fe2fc2 100644 --- a/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc +++ b/onnxruntime/test/contrib_ops/quantize_attention_op_test.cc @@ -90,11 +90,13 @@ void RunQAttention(const std::vector& input_data, tester.AddInput("input_scale", {1}, ToFloat16({input_quant_params.scale})); tester.AddInput("weight_scale", {1}, ToFloat16({weight_quant_params.scale})); tester.AddOutput("output", output_dims, ToFloat16(output_data)); + tester.SetOutputTolerance(0.01f); } else { tester.AddInput("bias", bias_dims, bias_data); tester.AddInput("input_scale", {1}, {input_quant_params.scale}); tester.AddInput("weight_scale", {1}, 
{weight_quant_params.scale}); tester.AddOutput("output", output_dims, output_data); + tester.SetOutputTolerance(0.005f); } if (mask_index_data.size() > 0) { diff --git a/onnxruntime/test/providers/base_tester.cc b/onnxruntime/test/providers/base_tester.cc index e94f8c2673be3..8d84c689cd23e 100644 --- a/onnxruntime/test/providers/base_tester.cc +++ b/onnxruntime/test/providers/base_tester.cc @@ -120,6 +120,20 @@ void BaseTester::SetOutputRelErr(const char* name, float v) { it->validation_params.relative_error = optional(v); } +void BaseTester::SetOutputTolerance(float abs_error, float rel_error) { + for (auto& output : output_data_) { + if (output.def.Exists()) { + if (abs_error >= 0.0f) { + output.validation_params.absolute_error = optional(abs_error); + } + + if (rel_error >= 0.0f) { + output.validation_params.relative_error = optional(rel_error); + } + } + } +} + std::vector BaseTester::GetDimsForProto(gsl::span dims) { std::vector dims_for_proto{dims.begin(), dims.end()}; if (add_symbolic_dim_to_tensor_data_ >= 0 && diff --git a/onnxruntime/test/providers/base_tester.h b/onnxruntime/test/providers/base_tester.h index 5607e58315a12..c276ae494df43 100644 --- a/onnxruntime/test/providers/base_tester.h +++ b/onnxruntime/test/providers/base_tester.h @@ -519,9 +519,20 @@ class BaseTester { custom_session_registries_.push_back(registry); } + // For floating types (double/float/half/bfloat16), tolerance is similar to numpy.isclose: + // absolute(expected_value - actual_value) <= abs_error + rel_error * absolute(expected_value) + // For integer types, tolerance parameters are ignored except the following cases: + // For uint8, tolerance is only applied to NNAPI/XNNPACK/DML providers. + // For int8, only abs_error is used, and rel_error is ignored. See checkers.cc for detail. + // If abs_error or rel_error is not set, a default value is used (search DefaultTolerance for detail). 
void SetOutputAbsErr(const char* name, float v); void SetOutputRelErr(const char* name, float v); + // Set absolute and relative tolerance for all existed outputs. + // Negative value will be ignored. + // Note that it will not set tolerance for new outputs added after this call. + void SetOutputTolerance(float abs_error, float rel_error = -1.0f); + // Number of times to call InferenceSession::Run. The same feeds are used each time. // e.g. used to verify the generator ops behave as expected void SetNumRunCalls(int n) { diff --git a/onnxruntime/test/providers/checkers.cc b/onnxruntime/test/providers/checkers.cc index c97e6d9de4911..47c18c478dd9c 100644 --- a/onnxruntime/test/providers/checkers.cc +++ b/onnxruntime/test/providers/checkers.cc @@ -20,46 +20,87 @@ struct DefaultTolerance; template <> struct DefaultTolerance { - static constexpr float absolute = 1e-6f; + static constexpr float absolute = 1e-5f; static constexpr float relative = 1e-5f; + + // Allow to have different default absolute tolerance for different providers. 
+ static float get_absolute(const std::string& /*provider_type*/) { + return absolute; + } }; template <> struct DefaultTolerance { +#if defined(ENABLE_TRAINING) + static constexpr float absolute = 1e-3f; +#else static constexpr float absolute = 1e-5f; +#endif + static constexpr float relative = 1e-4f; + + static float get_absolute(const std::string& /*provider_type*/) { + return absolute; + } }; template <> struct DefaultTolerance { - // The thresholds are estimated with PyTorch script like the following: +#if defined(ENABLE_TRAINING) + static constexpr float absolute = 0.005f; +#else + // The thresholds for inference are estimated with PyTorch script like the following: // x = torch.rand(1000, 1000) // absolute = ((x + 1e-6).to(torch.float16) - x).abs().max() * 10 // x[abs(x) < absolute] = absolute // relative = ((x - x.to(torch.float16)) / x).abs().max() * 2 static constexpr float absolute = 0.0025f; +#endif + static constexpr float relative = 0.001f; + + static float get_absolute(const std::string& provider_type) { + if (provider_type == kDmlExecutionProvider) { + return 0.005f; + } + return absolute; + } }; template <> struct DefaultTolerance { + // The thresholds for inference are estimated with PyTorch script like the following: + // x = torch.rand(1000, 1000) + // absolute = ((x + 1e-6).to(torch.bfloat16) - x).abs().max() * 10 + // x[abs(x) < absolute] = absolute + // relative = ((x - x.to(torch.bfloat16)) / x).abs().max() * 2 static constexpr float absolute = 0.02f; static constexpr float relative = 0.01f; + + static float get_absolute(const std::string& /*provider_type*/) { + return absolute; + } +}; + +struct ToleranceParams { + float absolute; + float relative; }; template -T get_tolerance(float absolute, float relative, T expected_value) { +ToleranceParams get_tolerance_params(const ValidateOutputParams& params, const std::string& provider_type) { + ToleranceParams new_params; + new_params.absolute = params.absolute_error.has_value() ? 
*(params.absolute_error) : DefaultTolerance::get_absolute(provider_type); + new_params.relative = params.relative_error.has_value() ? *(params.relative_error) : DefaultTolerance::relative; + return new_params; +} + +template +T get_tolerance(const ToleranceParams& params, T expected_value) { static_assert(std::is_floating_point::value, "T must be a floating point type"); // The formula is similar to numpy.isclose: https://numpy.org/doc/stable/reference/generated/numpy.isclose.html - return static_cast(absolute) + static_cast(relative) * std::abs(expected_value); -} - -template // D is the original data type -T get_tolerance(const ValidateOutputParams& params, T expected_value) { - float absolute = (params.absolute_error.has_value() ? *(params.absolute_error) : DefaultTolerance::absolute); - float relative = (params.relative_error.has_value() ? *(params.relative_error) : DefaultTolerance::relative); - return get_tolerance(absolute, relative, expected_value); + return static_cast(params.absolute) + static_cast(params.relative) * std::abs(expected_value); } template @@ -201,7 +242,10 @@ struct TensorCheck { cur_actual = actual.template Data(); } - const bool has_abs_err = params.absolute_error.has_value(); + // When absolute error is less than 1 for int8, it has same effect as no tolerance. + const bool has_abs_err = params.absolute_error.has_value() && *(params.absolute_error) >= 1.0f; + + // TODO: the relative error is not used for int8 yet. 
if (has_abs_err) { double threshold = *(params.absolute_error); @@ -221,11 +265,9 @@ struct TensorCheck { void operator()(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, - const std::string& /*provider_type*/) const { + const std::string& provider_type) const { auto size = actual.Shape().Size(); - const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); - // deal with rare cases in which order of output data from a kernel MAY be // undefined Tensor expected_sorted, actual_sorted; @@ -240,10 +282,7 @@ struct TensorCheck { cur_actual = actual.Data(); } - double threshold = 0.001; -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - threshold = 0.005; -#endif + auto tolerance_params = get_tolerance_params(params, provider_type); for (int64_t i = 0; i < size; ++i) { // NOTE: Check isnan first to work around MSVC linker bug when /LTCG:incremental is specified. @@ -253,7 +292,7 @@ struct TensorCheck { } else if (std::isinf(cur_expected[i])) { // Test infinity for equality EXPECT_EQ(cur_expected[i], cur_actual[i]) << "Expected infinity. i:" << i; } else { - double tolerance = has_tolerance ? 
get_tolerance(params, cur_expected[i]) : threshold; + double tolerance = get_tolerance(tolerance_params, cur_expected[i]); EXPECT_NEAR(cur_expected[i], cur_actual[i], tolerance) << "i:" << i; } } @@ -264,9 +303,7 @@ template void InternalNumericalCheck(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, - const std::string& /*provider_type*/) { - const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); - + const std::string& provider_type) { // deal with rare cases in which order of output data from a kernel MAY be // undefined Tensor expected_sorted, actual_sorted; @@ -282,11 +319,7 @@ void InternalNumericalCheck(const Tensor& expected, cur_actual = actual.Data(); } -#if defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) - constexpr float threshold = 0.005f; -#else - constexpr float threshold = 0.0001f; -#endif + auto tolerance_params = get_tolerance_params(params, provider_type); for (int64_t i = 0; i < size; ++i) { // NOTE: Check isnan first to work around MSVC linker bug when /LTCG:incremental is specified. @@ -296,7 +329,7 @@ void InternalNumericalCheck(const Tensor& expected, } else if (std::isinf(cur_expected[i])) { // Test infinity for equality EXPECT_EQ(cur_expected[i], cur_actual[i]) << "Expected infinity. i:" << i; } else { - T tolerance = has_tolerance ? 
get_tolerance(params, cur_expected[i]) : threshold; + T tolerance = get_tolerance(tolerance_params, cur_expected[i]); EXPECT_NEAR(cur_expected[i], cur_actual[i], tolerance) << "i:" << i; } } @@ -317,7 +350,7 @@ struct TensorCheck { void operator()(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, - const std::string& /*provider_type*/) const { + const std::string& provider_type) const { auto* cur_expected = expected.Data(); auto* cur_actual = actual.Data(); auto size = actual.Shape().Size(); @@ -333,21 +366,15 @@ struct TensorCheck { sort_expected_and_actual_buffers(f_expected, f_actual); } - const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); + auto tolerance_params = get_tolerance_params(params, provider_type); - float threshold = 0.001f; -#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING_CORE) || defined(USE_CUDA) || defined(USE_ROCM) - threshold = 0.005f; -#elif defined(USE_DML) - threshold = 0.02f; -#endif for (int64_t i = 0; i < size; ++i) { if (std::isnan(f_expected[i])) { EXPECT_TRUE(std::isnan(f_expected[i])) << "Expected NaN. i:" << i; } else if (std::isinf(f_expected[i])) { // Test infinity for equality EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. i:" << i; } else { - float tolerance = has_tolerance ? 
get_tolerance(params, f_expected[i]) : threshold; + float tolerance = get_tolerance(tolerance_params, f_expected[i]); EXPECT_NEAR(f_expected[i], f_actual[i], tolerance) << "i:" << i; } } @@ -359,7 +386,7 @@ struct TensorCheck { void operator()(const Tensor& expected, const Tensor& actual, const ValidateOutputParams& params, - const std::string& /*provider_type*/) const { + const std::string& provider_type) const { auto* cur_expected = expected.Data(); auto* cur_actual = actual.Data(); auto size = actual.Shape().Size(); @@ -375,13 +402,7 @@ struct TensorCheck { sort_expected_and_actual_buffers(f_expected, f_actual); } - const bool has_tolerance = params.absolute_error.has_value() || params.relative_error.has_value(); - - float abs_threshold = 0.0001f; - float rel_threshold = 0.001f; -#if defined(USE_TENSORRT) || defined(ENABLE_TRAINING_CORE) || defined(USE_CUDA) || defined(USE_ROCM) || defined(USE_DML) || defined(USE_DNNL) - rel_threshold = 0.05f; // expect at least 95% close -#endif + auto tolerance_params = get_tolerance_params(params, provider_type); for (int64_t i = 0; i < size; ++i) { if (std::isnan(f_expected[i])) { @@ -389,9 +410,7 @@ struct TensorCheck { } else if (std::isinf(f_expected[i])) { // Test infinity for equality EXPECT_EQ(f_expected[i], f_actual[i]) << "Expected infinity. i:" << i; } else { - float tolerance = has_tolerance - ? 
get_tolerance(params, f_expected[i]) - : get_tolerance(abs_threshold, rel_threshold, f_expected[i]); + float tolerance = get_tolerance(tolerance_params, f_expected[i]); EXPECT_NEAR(f_expected[i], f_actual[i], tolerance) << "i:" << i; } } diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.h b/onnxruntime/test/providers/cpu/activation/activation_op_test.h index 984b8f4437a3b..9a74d763a13e3 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.h +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.h @@ -69,6 +69,11 @@ inline void TestActivationOp(const char* szOp, const std::vector> test.SetOutputRelErr("Y", .000001f); } #endif + + if (strcmp(szOp, "QuickGelu") == 0) { + test.SetOutputTolerance(0.0001f); + } + test.Run(OpTester::ExpectResult::kExpectSuccess, "", excluded_providers); } } diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index 0e99b2306873e..c73dfcbce1b53 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -2632,7 +2632,7 @@ TEST(MathOpTest, Mean_8) { #endif template -void TrigFloatTest(OpTester& test, std::initializer_list input) { +void TrigFloatTest(OpTester& test, std::initializer_list input, float abs_error = -1.0f) { std::vector dims{static_cast(input.size())}; std::vector output; @@ -2642,6 +2642,10 @@ void TrigFloatTest(OpTester& test, std::initializer_list input) { test.AddInput("X", dims, input); test.AddOutput("Y", dims, output); + if (abs_error >= 0.0f) { + test.SetOutputTolerance(abs_error); + } + test.Run(); } @@ -2719,7 +2723,8 @@ TEST(MathOpTest, Tan) { TEST(MathOpTest, Asin) { OpTester test("Asin"); - TrigFloatTest<::asinf>(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}); + float abs_error = DefaultDmlExecutionProvider().get() != nullptr ? 
0.0001f : -1.0f; + TrigFloatTest<::asinf>(test, {-1.0f, -0.5f, 0.0f, 0.5f, 1.0f}, abs_error); } TEST(MathOpTest, Acos) { diff --git a/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc b/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc index 273503e7bf6af..f057e4a071bd9 100644 --- a/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc +++ b/onnxruntime/test/providers/cpu/math/logsoftmax_test.cc @@ -15,7 +15,8 @@ static void RunTest(const std::vector& x_vals, int64_t axis = 1, bool is_tensorrt_supported = true, OpTester::ExpectResult expect_result = OpTester::ExpectResult::kExpectSuccess, - const std::string& error_msg = "") { + const std::string& error_msg = "", + float tolerance = 0.0f) { OpTester tester("LogSoftmax", opset); if (opset < 13) { @@ -31,6 +32,10 @@ static void RunTest(const std::vector& x_vals, tester.AddInput("X", dimensions, x_vals); tester.AddOutput("Y", dimensions, expected_vals); + if (tolerance != 0.0f) { + tester.SetOutputAbsErr("Y", tolerance); + } + std::unordered_set excluded_providers; if (!is_tensorrt_supported) { excluded_providers.insert(kTensorrtExecutionProvider); @@ -62,7 +67,7 @@ TEST(LogSoftmaxOperator, LargeNumber) { -3.4401896f, -2.4401896f, -1.44018972f, -0.44018969f}; std::vector dimensions = {2, 4}; - RunTest(x_vals, expected_vals, dimensions); + RunTest(x_vals, expected_vals, dimensions, 7, 1, true, OpTester::ExpectResult::kExpectSuccess, "", 0.0005f); } // np.random.seed(123) # Use a seed so we can replicate the input and expected values here and in python diff --git a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc index 3d30fc62a945d..d91a1de3faa6e 100644 --- a/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/batch_norm_op_test.cc @@ -905,14 +905,16 @@ TEST(BatchNormTest, ForwardTrainingTestWithSavedOutputsOpset9) { test.AddInput("var", channel_dims, {1.0f, 2.0f}); test.AddOutput("Y", 
input_output_dims, {0.0131f, 0.5210f, 1.7244f, 0.1387f, -0.2708f, -0.1191f, 1.2089f, -0.0922f, -0.9548f, -1.5203f, 0.9077f, -0.8298f, 0.5796f, -0.4501f, -2.0921f, 1.2358f}); - test.AddOutput("running_mean", channel_dims, {-0.1754f, 0.303106f}); test.AddOutput("running_var", channel_dims, {0.696052f, 1.41316f}); + // mean and variance of X across channel dimension // With Opset9 we output saved_inv_std instead of saved_var to match CUDA EP test.AddOutput("saved_mean", channel_dims, {-0.306f, 0.114562f}); test.AddOutput("saved_inv_std", channel_dims, {1.2288f, 0.861317f}); + test.SetOutputTolerance(0.0001f); + // exclude CUDA Execution Provider due to flakiness // exclude TRT and OpenVINO for same reasons as seen in TestBatchNorm() test.Run(OpTester::ExpectResult::kExpectSuccess, "", @@ -938,10 +940,11 @@ TEST(BatchNormTest, ForwardTrainingTestOpset14) { test.AddInput("var", channel_dims, {1.0f, 2.0f}); test.AddOutput("Y", input_output_dims, {0.0131f, 0.5210f, 1.7244f, 0.1387f, -0.2708f, -0.1191f, 1.2089f, -0.0922f, -0.9548f, -1.5203f, 0.9077f, -0.8298f, 0.5796f, -0.4501f, -2.0921f, 1.2358f}); - test.AddOutput("running_mean", channel_dims, {-0.1754f, 0.303106f}); test.AddOutput("running_var", channel_dims, {0.696052f, 1.41316f}); + test.SetOutputTolerance(0.0001f); + // exclude CUDA Execution Provider due to flakiness // exclude TRT and OpenVINO for same reasons as seen in TestBatchNorm() test.Run(OpTester::ExpectResult::kExpectSuccess, "", @@ -970,6 +973,8 @@ TEST(BatchNormTest, ForwardTrainingTestOpset15) { test.AddOutput("running_mean", channel_dims, {-0.1754f, 0.303106f}); test.AddOutput("running_var", channel_dims, {0.696052f, 1.41316f}); + test.SetOutputTolerance(0.0001f); + // Same exclusions as the opset 14 test test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kRocmExecutionProvider, diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index 
e24cda17166ed..f98b18ddb17eb 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -888,6 +888,7 @@ TEST(PoolTest, AveragePool_IncludePadPixel) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); + test.SetOutputTolerance(0.0001f); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } diff --git a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc index 0bff46edccc12..58a616717316e 100644 --- a/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc +++ b/onnxruntime/test/providers/cpu/object_detection/roialign_test.cc @@ -464,6 +464,7 @@ static void BasicTest() { 0.3661f, 0.2349f, }); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -690,6 +691,7 @@ TEST(RoiAlignTest, MaxModePositive) { });*/ test.Run(); } + TEST(RoiAlignTest, AvgModeNegativeInvalidMode) { // TODO: Unskip when fixed #41968513 if (DefaultDmlExecutionProvider().get() != nullptr) { diff --git a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc index 7e81fc80ddf85..e73a1b492cc05 100644 --- a/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc +++ b/onnxruntime/test/providers/cpu/rnn/deep_cpu_lstm_op_test.cc @@ -143,6 +143,8 @@ static void RunLstmTest(const std::vector& X_data, test.AddOptionalOutputEdge(); } + test.SetOutputTolerance(0.0001f); + // TensorRT failed on LSTM tests test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } diff --git a/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc b/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc index e37e784f28930..1ffe6c73d4fa4 100644 --- a/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/affine_grid_test.cc @@ -13,6 
+13,7 @@ TEST(AffineGridTest, 2d) { test.AddInput("size", {4}, {1, 1, 2, 3}); test.AddOutput("grid", {1, 2, 3, 2}, {-0.6667f, -0.5000f, 0.0000f, -0.5000f, 0.6667f, -0.5000f, -0.6667f, 0.5000f, 0.0000f, 0.5000f, 0.6667f, 0.5000f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -24,6 +25,7 @@ TEST(AffineGridTest, test_2d_0) { test.AddInput("theta", {1, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); test.AddInput("size", {4}, {1, 1, 3, 2}); test.AddOutput("grid", {1, 3, 2, 2}, {-0.3228f, -0.9151f, 1.1544f, -0.7414f, -0.4386f, -0.5868f, 1.0386f, -0.4132f, -0.5544f, -0.2586f, 0.9228f, -0.0849f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -33,6 +35,7 @@ TEST(AffineGridTest, test_2d_1) { test.AddInput("theta", {2, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f, 1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); test.AddInput("size", {4}, {2, 10, 2, 3}); test.AddOutput("grid", {2, 2, 3, 2}, {-0.5980f, -0.8620f, 0.3868f, -0.7462f, 1.3716f, -0.6304f, -0.7716f, -0.3696f, 0.2132f, -0.2538f, 1.1980f, -0.1380f, -0.5980f, -0.8620f, 0.3868f, -0.7462f, 1.3716f, -0.6304f, -0.7716f, -0.3696f, 0.2132f, -0.2538f, 1.1980f, -0.1380f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -42,6 +45,7 @@ TEST(AffineGridTest, test_2d_2) { test.AddInput("theta", {1, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); test.AddInput("size", {4}, {1, 1, 3, 2}); test.AddOutput("grid", {1, 3, 2, 2}, {-0.6726f, -2.7663f, 0.8274f, -1.9003f, -1.2500f, -0.9330f, 0.2500f, -0.0670f, -1.8274f, 0.9003f, -0.3274f, 1.7663f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -51,6 +55,7 @@ TEST(AffineGridTest, test_2d_3) { test.AddInput("theta", {2, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f, 1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); test.AddInput("size", {4}, {2, 10, 2, 3}); test.AddOutput("grid", {2, 2, 3, 2}, 
{-1.0670f, -2.4524f, -0.0670f, -1.8750f, 0.9330f, -1.2976f, -1.9330f, 0.2976f, -0.9330f, 0.8750f, 0.0670f, 1.4524f, -1.0670f, -2.4524f, -0.0670f, -1.8750f, 0.9330f, -1.2976f, -1.9330f, 0.2976f, -0.9330f, 0.8750f, 0.0670f, 1.4524f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -60,6 +65,7 @@ TEST(AffineGridTest, test_2d_4) { test.AddInput("theta", {1, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); test.AddInput("size", {4}, {1, 1, 3, 2}); test.AddOutput("grid", {1, 3, 2, 2}, {-1.0036f, -1.1661f, 1.9509f, -0.8188f, -1.1772f, -0.6736f, 1.7772f, -0.3264f, -1.3509f, -0.1812f, 1.6036f, 0.1661f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -69,6 +75,7 @@ TEST(AffineGridTest, test_2d_5) { test.AddInput("theta", {2, 2, 3}, {1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f, 1.477212f, -0.173648f, 0.300000f, 0.173648f, 0.492404f, -0.500000f}); test.AddInput("size", {4}, {2, 10, 2, 3}); test.AddOutput("grid", {2, 2, 3, 2}, {-1.0036f, -1.1661f, 0.4736f, -0.9924f, 1.9509f, -0.8188f, -1.3509f, -0.1812f, 0.1264f, -0.0076f, 1.6036f, 0.1661f, -1.0036f, -1.1661f, 0.4736f, -0.9924f, 1.9509f, -0.8188f, -1.3509f, -0.1812f, 0.1264f, -0.0076f, 1.6036f, 0.1661f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -78,6 +85,7 @@ TEST(AffineGridTest, test_2d_6) { test.AddInput("theta", {1, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); test.AddInput("size", {4}, {1, 1, 3, 2}); test.AddOutput("grid", {1, 3, 2, 2}, {-1.1340f, -4.1160f, 1.8660f, -2.3840f, -2.0000f, -1.3660f, 1.0000f, 0.3660f, -2.8660f, 1.3840f, 0.1340f, 3.1160f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -87,6 +95,7 @@ TEST(AffineGridTest, test_2d_7) { test.AddInput("theta", {2, 2, 3}, {1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f, 1.500000f, -0.866025f, -0.500000f, 0.866025f, 2.750000f, -0.500000f}); test.AddInput("size", {4}, {2, 10, 2, 3}); test.AddOutput("grid", {2, 2, 3, 2}, {-1.1340f, 
-4.1160f, 0.3660f, -3.2500f, 1.8660f, -2.3840f, -2.8660f, 1.3840f, -1.3660f, 2.2500f, 0.1340f, 3.1160f, -1.1340f, -4.1160f, 0.3660f, -3.2500f, 1.8660f, -2.3840f, -2.8660f, 1.3840f, -1.3660f, 2.2500f, 0.1340f, 3.1160f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -96,6 +105,7 @@ TEST(AffineGridTest, test_3d_0) { test.AddInput("theta", {1, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); test.AddInput("size", {5}, {1, 1, 3, 2, 2}); test.AddOutput("grid", {1, 3, 2, 2, 3}, {-0.7468f, -1.3266f, 1.5323f, 0.6627f, -1.2078f, 1.3639f, -0.7468f, 0.6430f, 1.6191f, 0.6627f, 0.7618f, 1.4507f, -0.4048f, -1.5442f, 1.8408f, 1.0048f, -1.4254f, 1.6724f, -0.4048f, 0.4254f, 1.9276f, 1.0048f, 0.5442f, 1.7592f, -0.0627f, -1.7618f, 2.1493f, 1.3468f, -1.6430f, 1.9809f, -0.0627f, 0.2078f, 2.2361f, 1.3468f, 0.3266f, 2.0677f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -105,6 +115,7 @@ TEST(AffineGridTest, test_3d_1) { test.AddInput("theta", {2, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f, 1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); test.AddInput("size", {5}, {2, 10, 2, 2, 3}); test.AddOutput("grid", {2, 2, 2, 3, 3}, {-0.8962f, -1.4008f, 1.6375f, 0.0435f, -1.3216f, 1.5252f, 0.9832f, -1.2424f, 1.4130f, -0.8962f, 0.5688f, 1.7243f, 0.0435f, 0.6480f, 1.6121f, 0.9832f, 0.7272f, 1.4998f, -0.3832f, -1.7272f, 2.1002f, 0.5565f, -1.6480f, 1.9879f, 1.4962f, -1.5688f, 1.8757f, -0.3832f, 0.2424f, 2.1870f, 0.5565f, 0.3216f, 2.0748f, 1.4962f, 0.4008f, 1.9625f, -0.8962f, -1.4008f, 1.6375f, 0.0435f, -1.3216f, 1.5252f, 0.9832f, -1.2424f, 1.4130f, -0.8962f, 0.5688f, 1.7243f, 0.0435f, 0.6480f, 1.6121f, 0.9832f, 0.7272f, 1.4998f, -0.3832f, -1.7272f, 2.1002f, 0.5565f, -1.6480f, 1.9879f, 1.4962f, -1.5688f, 
1.8757f, -0.3832f, 0.2424f, 2.1870f, 0.5565f, 0.3216f, 2.0748f, 1.4962f, 0.4008f, 1.9625f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -114,6 +125,7 @@ TEST(AffineGridTest, test_3d_2) { test.AddInput("theta", {1, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); test.AddInput("size", {5}, {1, 1, 3, 2, 2}); test.AddOutput("grid", {1, 3, 2, 2, 3}, {-0.5299f, 0.8995f, -4.3568f, -0.2701f, -0.3995f, -2.9818f, -0.5299f, 2.3995f, 0.4064f, -0.2701f, 1.1005f, 1.7814f, -0.6299f, -0.6005f, -2.7691f, -0.3701f, -1.8995f, -1.3941f, -0.6299f, 0.8995f, 1.9941f, -0.3701f, -0.3995f, 3.3691f, -0.7299f, -2.1005f, -1.1814f, -0.4701f, -3.3995f, 0.1936f, -0.7299f, -0.6005f, 3.5818f, -0.4701f, -1.8995f, 4.9568f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -123,6 +135,7 @@ TEST(AffineGridTest, test_3d_3) { test.AddInput("theta", {2, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f, 0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); test.AddInput("size", {5}, {2, 10, 2, 2, 3}); test.AddOutput("grid", {2, 2, 2, 3, 3}, {-0.5982f, 0.7410f, -4.1890f, -0.4250f, -0.1250f, -3.2724f, -0.2518f, -0.9910f, -2.3557f, -0.5982f, 2.2410f, 0.5741f, -0.4250f, 1.3750f, 1.4908f, -0.2518f, 0.5090f, 2.4075f, -0.7482f, -1.5090f, -1.8075f, -0.5750f, -2.3750f, -0.8908f, -0.4018f, -3.2410f, 0.0259f, -0.7482f, -0.0090f, 2.9557f, -0.5750f, -0.8750f, 3.8724f, -0.4018f, -1.7410f, 4.7890f, -0.5982f, 0.7410f, -4.1890f, -0.4250f, -0.1250f, -3.2724f, -0.2518f, -0.9910f, -2.3557f, -0.5982f, 2.2410f, 0.5741f, -0.4250f, 1.3750f, 1.4908f, -0.2518f, 0.5090f, 2.4075f, -0.7482f, -1.5090f, -1.8075f, -0.5750f, -2.3750f, -0.8908f, -0.4018f, -3.2410f, 0.0259f, -0.7482f, -0.0090f, 2.9557f, -0.5750f, -0.8750f, 3.8724f, -0.4018f, -1.7410f, 
4.7890f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -132,6 +145,7 @@ TEST(AffineGridTest, test_3d_4) { test.AddInput("theta", {1, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); test.AddInput("size", {5}, {1, 1, 3, 2, 2}); test.AddOutput("grid", {1, 3, 2, 2, 3}, {-1.6226f, -2.2620f, 1.4189f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, 1.1965f, 1.9147f, 1.2557f, -1.1095f, -2.5884f, 1.8816f, 1.7095f, -2.3508f, 1.5448f, -1.1095f, 1.3508f, 2.0552f, 1.7095f, 1.5884f, 1.7184f, -0.5965f, -2.9147f, 2.3443f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 2.2226f, 1.2620f, 2.1811f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -141,6 +155,7 @@ TEST(AffineGridTest, test_3d_5) { test.AddInput("theta", {2, 3, 4}, {1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f, 1.409539f, 0.000000f, 0.513030f, 0.300000f, 0.118782f, 1.969615f, -0.326352f, -0.500000f, -0.168412f, 0.086824f, 0.462708f, 1.800000f}); test.AddInput("size", {5}, {2, 10, 2, 2, 3}); test.AddOutput("grid", {2, 2, 2, 3, 3}, {-1.6226f, -2.2620f, 1.4189f, -0.2130f, -2.1433f, 1.2505f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, -0.2130f, 1.7960f, 1.4241f, 1.1965f, 1.9147f, 1.2557f, -0.5965f, -2.9147f, 2.3443f, 0.8130f, -2.7960f, 2.1759f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 0.8130f, 1.1433f, 2.3495f, 2.2226f, 1.2620f, 2.1811f, -1.6226f, -2.2620f, 1.4189f, -0.2130f, -2.1433f, 1.2505f, 1.1965f, -2.0245f, 1.0821f, -1.6226f, 1.6772f, 1.5925f, -0.2130f, 1.7960f, 1.4241f, 1.1965f, 1.9147f, 1.2557f, -0.5965f, -2.9147f, 2.3443f, 0.8130f, -2.7960f, 2.1759f, 2.2226f, -2.6772f, 2.0075f, -0.5965f, 1.0245f, 2.5179f, 0.8130f, 1.1433f, 2.3495f, 2.2226f, 1.2620f, 2.1811f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -150,6 +165,7 @@ TEST(AffineGridTest, test_3d_6) { 
test.AddInput("theta", {1, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); test.AddInput("size", {5}, {1, 1, 3, 2, 2}); test.AddOutput("grid", {1, 3, 2, 2, 3}, {-0.6098f, 1.5490f, -8.2197f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.0902f, 1.9510f, 4.0566f, -0.7598f, -0.7010f, -5.8381f, -0.2402f, -3.2990f, -3.0881f, -0.7598f, 2.2990f, 3.6881f, -0.2402f, -0.2990f, 6.4381f, -0.9098f, -2.9510f, -3.4566f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.3902f, -2.5490f, 8.8197f}); + test.SetOutputTolerance(0.0001f); test.Run(); } @@ -159,6 +175,7 @@ TEST(AffineGridTest, test_3d_7) { test.AddInput("theta", {2, 3, 4}, {0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f, 0.259808f, 0.000000f, -0.150000f, -0.500000f, -1.299038f, 1.500000f, -2.250000f, -0.500000f, 1.375000f, 4.763140f, 2.381570f, 0.300000f}); test.AddInput("size", {5}, {2, 10, 2, 2, 3}); test.AddOutput("grid", {2, 2, 2, 3, 3}, {-0.6098f, 1.5490f, -8.2197f, -0.3500f, 0.2500f, -6.8447f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.3500f, 3.2500f, 2.6816f, -0.0902f, 1.9510f, 4.0566f, -0.9098f, -2.9510f, -3.4566f, -0.6500f, -4.2500f, -2.0816f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.6500f, -1.2500f, 7.4447f, -0.3902f, -2.5490f, 8.8197f, -0.6098f, 1.5490f, -8.2197f, -0.3500f, 0.2500f, -6.8447f, -0.0902f, -1.0490f, -5.4697f, -0.6098f, 4.5490f, 1.3066f, -0.3500f, 3.2500f, 2.6816f, -0.0902f, 1.9510f, 4.0566f, -0.9098f, -2.9510f, -3.4566f, -0.6500f, -4.2500f, -2.0816f, -0.3902f, -5.5490f, -0.7066f, -0.9098f, 0.0490f, 6.0697f, -0.6500f, -1.2500f, 7.4447f, -0.3902f, -2.5490f, 8.8197f}); + test.SetOutputTolerance(0.0001f); test.Run(); } } // namespace test diff --git a/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc 
b/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc index b6720ae2a9a7d..8dcb15cbc6926 100644 --- a/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc +++ b/onnxruntime/test/providers/cpu/tensor/mean_variance_normalization_test.cc @@ -5,6 +5,7 @@ #include "test/common/tensor_op_test_utils.h" #include "test/providers/provider_test_utils.h" +#include "test/util/include/default_providers.h" namespace onnxruntime::test { @@ -155,6 +156,10 @@ TEST(MeanVarianceNormalizationTest, AxesSubsets5D) { test.AddInput("input", shape, X.data(), X.size()); test.AddOutput("output", shape, Y.data(), Y.size()); + if (DefaultDmlExecutionProvider().get() != nullptr) { + test.SetOutputTolerance(0.001f); + } + test.Run(); }; diff --git a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc index bfb59f1525e47..18c1364f5d1f6 100644 --- a/orttraining/orttraining/test/gradient/optimizer_ops_test.cc +++ b/orttraining/orttraining/test/gradient/optimizer_ops_test.cc @@ -144,6 +144,8 @@ TEST(OptimizerTest, AdamBiasCorrection) { test.AddOutput("Moment_2_Out", {3}, {1.7400e-04f, 8.9966e-04f, 1.5102e-03f}); test.AddOutput("W_Out", {3}, {-1.4634f, -0.6416f, -1.2121f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(1)); test.AddAttribute("weight_decay_mode", static_cast(0)); @@ -167,6 +169,8 @@ TEST(OptimizerTest, AdamWeightDecayMode0NoBiasCorrection) { test.AddOutput("W_Out", {3}, {-3.6210f, -2.8075f, -3.3723f}); test.AddOutput("G_Out", {3}, {-3.1576f, -3.1658f, -3.1601f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(0)); test.AddAttribute("lambda", 0.01f); test.AddAttribute("weight_decay_mode", static_cast(0)); @@ -191,6 +195,8 @@ TEST(OptimizerTest, AdamWeightDecayMode0WithBiasCorrection) { test.AddOutput("W_Out", {3}, {-1.4587f, -0.6452f, -1.2099f}); test.AddOutput("G_Out", {3}, {-0.9954f, 
-1.0036f, -0.9979f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(1)); test.AddAttribute("lambda", 0.01f); test.AddAttribute("weight_decay_mode", static_cast(0)); @@ -214,6 +220,8 @@ TEST(OptimizerTest, AdamWeightDecayMode1NoBiasCorrection) { test.AddOutput("Moment_2_Out", {3}, {1.7400e-04f, 8.9966e-04f, 1.5102e-03f}); test.AddOutput("W_Out", {3}, {-3.5894f, -2.7758f, -3.3406f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(0)); test.AddAttribute("lambda", 0.01f); test.AddAttribute("weight_decay_mode", static_cast(1)); @@ -237,6 +245,8 @@ TEST(OptimizerTest, AdamWeightDecayMode1WithBiasCorrection) { test.AddOutput("Moment_2_Out", {3}, {1.7400e-04f, 8.9966e-04f, 1.5102e-03f}); test.AddOutput("W_Out", {3}, {-1.4488f, -0.6352f, -1.1999f}); + test.SetOutputTolerance(0.0001f); + test.AddAttribute("do_bias_correction", static_cast(1)); test.AddAttribute("lambda", 0.01f); test.AddAttribute("weight_decay_mode", static_cast(1)); @@ -368,6 +378,11 @@ TEST(OptimizerTest, AdamOptimizerMixPrecision_FP16Weight_ClipNorm_Test) { test.AddOptionalOutputEdge(); test.AddOutput("FP16_W_Out", {3}, w_new_half); + test.SetOutputAbsErr("Moment_1_Out", 0.005f); + test.SetOutputAbsErr("Moment_2_Out", 0.005f); + test.SetOutputAbsErr("W_Out", 0.001f); + test.SetOutputAbsErr("FP16_W_Out", 0.005f); + test.AddAttribute("do_bias_correction", static_cast(0)); test.AddAttribute("weight_decay_mode", static_cast(0)); test.AddAttribute("max_norm_clip", 0.001f); @@ -617,6 +632,8 @@ void run_lamb_test_with_baseline( test.AddOptionalOutputEdge(); } + test.SetOutputTolerance(0.005f); + test.Run(); } @@ -737,6 +754,8 @@ void run_multi_tensor_lamb_test_with_baseline( test.AddAttribute("ratio_min", ratio_min); test.AddAttribute("ratio_max", ratio_max); + test.SetOutputTolerance(0.005f); + test.Run(); } diff --git a/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc 
b/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc index e9795a24681cb..e89883bfd4d94 100644 --- a/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc +++ b/orttraining/orttraining/test/training_ops/cpu/nn/batchnorm_internal_test.cc @@ -37,6 +37,8 @@ TEST(BatchNormInternalTest, ForwardTrainingTest) { test.AddOutput("saved_mean", channel_dims, {-0.306f, 0.114562f}); test.AddOutput("saved_inv_std", channel_dims, {1.2288f, 0.861317f}); + test.SetOutputTolerance(0.0001f); + std::vector> execution_providers; execution_providers.emplace_back(DefaultCpuExecutionProvider()); diff --git a/orttraining/orttraining/test/training_ops/cuda/batch_norm_internal_test.cc b/orttraining/orttraining/test/training_ops/cuda/batch_norm_internal_test.cc index 6335a666e0381..d842d4f1ea736 100644 --- a/orttraining/orttraining/test/training_ops/cuda/batch_norm_internal_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/batch_norm_internal_test.cc @@ -68,6 +68,7 @@ static void TestBatchNormInternal(bool test_double = false, bool T_is_half = fal test.AddOutput("running_var", channel_dims, running_var_double); test.AddOutput("saved_mean", channel_dims, saved_mean_double); test.AddOutput("saved_inv_std", channel_dims, saved_inv_std_double); + test.SetOutputTolerance(0.0001f); } else { if (T_is_half) { std::vector X_half(X.size()); From c45cff60cfd10e6c35dbcff3d6dc7e4da16bace2 Mon Sep 17 00:00:00 2001 From: Guenther Schmuelling Date: Tue, 19 Mar 2024 16:15:49 -0700 Subject: [PATCH 34/55] [js/webgpu] fix maxpool / fp16 (#19981) --- js/web/lib/wasm/jsep/webgpu/ops/pool.ts | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts index 4e933573b9137..5521650e8ded4 100644 --- a/js/web/lib/wasm/jsep/webgpu/ops/pool.ts +++ b/js/web/lib/wasm/jsep/webgpu/ops/pool.ts @@ -381,8 +381,9 @@ const createMaxPoolProgramInfo = programUniforms }), 
getShaderSource: shaderHelper => generatePoolingCode( - shaderHelper, x, input.dims.length, outputShape.length, adjustedAttributes, op1, op2, -1e5, uniforms, - hasPads, pwStartEndNotZero, phStartEndNotZero), + shaderHelper, x, input.dims.length, outputShape.length, adjustedAttributes, op1, op2, + (input.dataType === DataType.float16) ? -65504 : -1e5, uniforms, hasPads, pwStartEndNotZero, + phStartEndNotZero), }; }; From 6fe02068af2ec9e7b0f49214e3ca84ed1d7cf6df Mon Sep 17 00:00:00 2001 From: Abhishek Jindal Date: Tue, 19 Mar 2024 17:00:44 -0700 Subject: [PATCH 35/55] Add const cast for DLManagedTensor (#19982) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ### Description Add Const Cast for DLManagedTensor as PyTorch has changed it's [code](https://github.com/pytorch/pytorch/pull/121102) which creates incompatibility. ### Motivation and Context Fix the below error while configuring ORT-training with nightly PyTorch ``` aten_op_executor.cc:60:40: error: invalid conversion from ‘const DLManagedTensor*’ to ‘DLManagedTensor*’ [-fpermissive] 60 | at::Tensor tensor = at::fromDLPack(dlpack); | ^~~~~~ | | | const DLManagedTensor* ``` --- .../torch_cpp_extensions/aten_op_executor/aten_op_executor.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc index e8be98cbfc0e4..4148e63d58619 100644 --- a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc +++ b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc @@ -57,7 +57,7 @@ struct ATenOperator { c10::IValue i_value; // Create the torch tensor from this DLPack no matter we need it or not below, // so that the dlpack's deleter will be triggered when torch tensor is out of scope. 
- at::Tensor tensor = at::fromDLPack(dlpack); + at::Tensor tensor = at::fromDLPack(const_cast(dlpack)); switch (elem_kinds[index]) { case c10::TypeKind::TensorType: { i_value = is_optional ? c10::IValue(c10::optional(tensor)) : c10::IValue(tensor); From 3dfe4a5e6d5075a976516a9c43fe5f92da8614a9 Mon Sep 17 00:00:00 2001 From: mindest <30493312+mindest@users.noreply.github.com> Date: Wed, 20 Mar 2024 08:35:18 +0800 Subject: [PATCH 36/55] [ROCm] Remove MPI dependency and collectives to use NCCL (#19830) ### Description * Remove MPI dependency to use NCCL AllReduce, etc. * Exclude unsupported collectives in hipify --- cmake/onnxruntime_rocm_hipify.cmake | 24 +++++-------------- .../contrib_ops/rocm/rocm_contrib_kernels.cc | 4 ++-- .../cuda/communication/nccl_service.cc | 2 ++ .../linux-migraphx-ci-pipeline.yml | 1 + 4 files changed, 11 insertions(+), 20 deletions(-) diff --git a/cmake/onnxruntime_rocm_hipify.cmake b/cmake/onnxruntime_rocm_hipify.cmake index 6f54943f09afe..cadb06bb38707 100644 --- a/cmake/onnxruntime_rocm_hipify.cmake +++ b/cmake/onnxruntime_rocm_hipify.cmake @@ -94,30 +94,18 @@ set(contrib_ops_excluded_files "bert/group_query_attention.cc" "bert/group_query_attention_impl.h" "bert/group_query_attention_impl.cu" + "collective/distributed_*" + "collective/shard*" ) -if (NOT onnxruntime_ENABLE_ATEN) - list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc") -endif() if (NOT onnxruntime_USE_NCCL) # Those are string patterns to exclude. Do NOT use stars such as # collective/*.cc or *.h. 
list(APPEND contrib_ops_excluded_files "collective/nccl_kernels.cc") - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.h") - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.cc") - list(APPEND contrib_ops_excluded_files "collective/sharding.cc") - list(APPEND contrib_ops_excluded_files "collective/sharding_spec.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_matmul.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_slice.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_reshape.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_expand.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_reduce.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_unsqueeze.cc") - list(APPEND contrib_ops_excluded_files "collective/distributed_squeeze.cc") -else() - # moe not supported for ROCm EP - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.h") - list(APPEND contrib_ops_excluded_files "collective/sharded_moe.cc") +endif() + +if (NOT onnxruntime_ENABLE_ATEN) + list(APPEND contrib_ops_excluded_files "aten_ops/aten_op.cc") endif() set(provider_excluded_files diff --git a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc index 382a3951f3a83..e19a976f3141c 100644 --- a/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc +++ b/onnxruntime/contrib_ops/rocm/rocm_contrib_kernels.cc @@ -151,7 +151,7 @@ class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kPytorchAtenDomain class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, ShrunkenGather); #endif -#if defined(USE_MPI) && defined(ORT_USE_NCCL) +#ifdef ORT_USE_NCCL class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllReduce); class ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllGather); class 
ONNX_OPERATOR_KERNEL_CLASS_NAME(kRocmExecutionProvider, kMSDomain, 1, AllToAll); @@ -311,7 +311,7 @@ Status RegisterRocmContribKernels(KernelRegistry& kernel_registry) { BuildKernelCreateInfo, #endif -#if defined(USE_MPI) && defined(ORT_USE_NCCL) +#ifdef ORT_USE_NCCL BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, diff --git a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc index f604e4c4aaf3e..c642a87e22de6 100644 --- a/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc +++ b/orttraining/orttraining/training_ops/cuda/communication/nccl_service.cc @@ -233,6 +233,7 @@ void NcclService::Initialize() { // CPUs // Other devices +#ifdef USE_MPI const int mpi_rank = onnxruntime::training::MPIContext::GetInstance().GetWorldRank(); const int mpi_local_rank = onnxruntime::training::MPIContext::GetInstance().GetLocalRank(); const int mpi_size = onnxruntime::training::MPIContext::GetInstance().GetWorldSize(); @@ -248,6 +249,7 @@ void NcclService::Initialize() { if (mpi_rank == 0) NCCL_CALL_THROW(ncclGetUniqueId(&id)); MPI_CHECK(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); NCCL_CALL_THROW(ncclCommInitRank(&comm_, mpi_size, id, mpi_rank)); +#endif // USE_MPI } void NcclService::Launch() { diff --git a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml index 9cf7a3fb42397..8b58d958ba899 100644 --- a/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-migraphx-ci-pipeline.yml @@ -109,6 +109,7 @@ jobs: --rocm_version=$(RocmVersion) \ --rocm_home /opt/rocm \ --nccl_home /opt/rocm \ + --enable_nccl \ --update \ --build_dir /build \ --build \ From 6ff31e06d5757779b9c8d53e9d02a3b62b3e3438 Mon Sep 17 00:00:00 2001 From: Ye Wang <52801275+wangyems@users.noreply.github.com> 
Date: Tue, 19 Mar 2024 21:28:15 -0700 Subject: [PATCH 37/55] [MoE] Add TP and Mixtral MoE (#19945) ### Description 1.Support Tensor Parallelism in ShardedMoE. 2.Make necessary code changes to support Mixtral MoE. 3.Fix a bug related to using IOBinding in test script. 4.Fix the input size limitation ### Motivation and Context --- docs/ContribOperators.md | 16 +- docs/OperatorKernels.md | 2 +- .../cuda/collective/sharded_moe.cc | 113 ++++-- .../contrib_ops/cuda/collective/sharded_moe.h | 1 + .../cuda/moe/ft_moe/epilogue_helpers.h | 33 +- .../cuda/moe/ft_moe/moe_gemm_kernels.h | 9 +- .../moe/ft_moe/moe_gemm_kernels_template.h | 48 ++- .../contrib_ops/cuda/moe/ft_moe/moe_kernel.cu | 304 +++++++++++---- .../contrib_ops/cuda/moe/ft_moe/moe_kernel.h | 22 +- onnxruntime/contrib_ops/cuda/moe/moe.cc | 58 ++- onnxruntime/contrib_ops/cuda/moe/moe_base.h | 50 ++- .../core/graph/contrib_ops/collective_defs.cc | 32 +- .../core/graph/contrib_ops/contrib_defs.cc | 11 +- .../core/providers/cuda/cu_inc/common.cuh | 4 +- onnxruntime/test/contrib_ops/moe_test.cc | 177 ++++++++- .../sharded_moe/test_sharded_moe.py | 260 ++++++++++--- .../transformers/test_parity_mixtral_moe.py | 365 ++++++++++++++++++ .../python/transformers/test_parity_moe.py | 13 +- 18 files changed, 1272 insertions(+), 246 deletions(-) create mode 100644 onnxruntime/test/python/transformers/test_parity_mixtral_moe.py diff --git a/docs/ContribOperators.md b/docs/ContribOperators.md index 5f0100fad95a2..32a4ca16b7824 100644 --- a/docs/ContribOperators.md +++ b/docs/ContribOperators.md @@ -2931,8 +2931,8 @@ This version of the operator has been available since version 1 of the 'com.micr ### **com.microsoft.MoE** Mixture of experts. Examples: Switch transformer(https://arxiv.org/pdf/2101.03961.pdf) use top 1, - GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, and Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) - usually uses top 32 experts. 
+ GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) + usually uses top 32 experts and Mixtral(https://huggingface.co/blog/mixtral). #### Version @@ -2946,9 +2946,11 @@ This version of the operator has been available since version 1 of the 'com.micr
Activation function to use. Choose from relu, gelu, silu and identity. Default is relu
k : int
Number of top experts to select from expert pool
+
normalize_routing_weights : int
+
Whether to normalize routing weights
-#### Inputs (4 - 6) +#### Inputs (5 - 8)
input : T
@@ -2957,12 +2959,16 @@ This version of the operator has been available since version 1 of the 'com.micr
2D input tensor with shape (num_rows, num_experts)
fc1_experts_weights : T
3D input tensor with shape (num_experts, hidden_size, inter_size)
-
fc2_experts_weights : T
-
3D input tensor with shape (num_experts, inter_size, hidden_size)
fc1_experts_bias (optional) : T
2D optional input tensor with shape (num_experts, inter_size)
+
fc2_experts_weights : T
+
3D input tensor with shape (num_experts, inter_size, hidden_size)
fc2_experts_bias (optional) : T
2D optional input tensor with shape (num_experts, hidden_size)
+
fc3_experts_weights (optional) : T
+
3D optional input tensor with shape (num_experts, hidden_size, inter_size)
+
fc3_experts_bias (optional) : T
+
2D optional input tensor with shape (num_experts, inter_size)
#### Outputs diff --git a/docs/OperatorKernels.md b/docs/OperatorKernels.md index eddc3b7873d80..bca8e17b3dfd4 100644 --- a/docs/OperatorKernels.md +++ b/docs/OperatorKernels.md @@ -861,7 +861,7 @@ Do not modify directly.* |LongformerAttention|*in* input:**T**
*in* weight:**T**
*in* bias:**T**
*in* mask:**T**
*in* global_weight:**T**
*in* global_bias:**T**
*in* global:**G**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |MatMulBnb4|*in* A:**T1**
*in* B:**T2**
*in* absmax:**T1**
*out* Y:**T1**|1+|**T1** = tensor(bfloat16), tensor(float), tensor(float16)
**T2** = tensor(uint8)| |MatMulNBits|*in* A:**T1**
*in* B:**T2**
*in* scales:**T1**
*in* zero_points:**T3**
*in* g_idx:**T4**
*out* Y:**T1**|1+|**T1** = tensor(float), tensor(float16)
**T2** = tensor(uint8)| -|MoE|*in* input:**T**
*in* router_probs:**T**
*in* fc1_experts_weights:**T**
*in* fc2_experts_weights:**T**
*in* fc1_experts_bias:**T**
*in* fc2_experts_bias:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| +|MoE|*in* input:**T**
*in* router_probs:**T**
*in* fc1_experts_weights:**T**
*in* fc1_experts_bias:**T**
*in* fc2_experts_weights:**T**
*in* fc2_experts_bias:**T**
*in* fc3_experts_weights:**T**
*in* fc3_experts_bias:**T**
*out* output:**T**|1+|**T** = tensor(float), tensor(float16)| |MultiHeadAttention|*in* query:**T**
*in* key:**T**
*in* value:**T**
*in* bias:**T**
*in* key_padding_mask:**M**
*in* relative_position_bias:**T**
*in* past_key:**T**
*in* past_value:**T**
*out* output:**T**
*out* present_key:**T**
*out* present_value:**T**|1+|**T** = tensor(float), tensor(float16)| |NGramRepeatBlock|*in* input_ids:**Tid**
*in* scores:**T**
*out* scores_out:**T**|1+|**T** = tensor(float)
**Tid** = tensor(int64)| |NhwcConv|*in* X:**T**
*in* W:**T**
*in* B:**T**
*out* Y:**T**|1+|**T** = tensor(float), tensor(float16)| diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc index 40a667ffd5d83..2efc37cf98010 100644 --- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc +++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.cc @@ -1,6 +1,8 @@ // Copyright (c) Microsoft Corporation. All rights reserved. // Licensed under the MIT License. +#include + #include "core/common/safeint.h" #include "core/providers/cuda/cuda_common.h" #include "contrib_ops/cuda/bert/transformer_cuda_common.h" @@ -35,6 +37,7 @@ using namespace ONNX_NAMESPACE; template ShardedMoE::ShardedMoE(const OpKernelInfo& op_kernel_info) : NcclKernel(op_kernel_info), MoEBase(op_kernel_info) { + ORT_ENFORCE(op_kernel_info.GetAttr("tensor_shards", &tensor_shards_).IsOK()); ORT_ENFORCE(op_kernel_info.GetAttr("local_experts_start_index", &local_experts_start_index_).IsOK()); rank_to_experts_start_index_.resize(nccl_->Size()); // Initialize rank_to_experts_start_index_[0] to a value to convey that it is not initialized. @@ -55,27 +58,36 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const { // Create a {Rank, ExpertsStartIndex} map on Host. 
AutoDestoryCudaEvent cuda_event; cudaEvent_t& copy_event = cuda_event.Get(); - ORT_RETURN_IF_ERROR(SynchronizeExpertsStartIndex(allocator, context, copy_event)); const Tensor* input = context->Input(0); const Tensor* router_probs = context->Input(1); const Tensor* fc1_experts_weights = context->Input(2); - const Tensor* fc2_experts_weights = context->Input(3); - const Tensor* fc1_experts_bias_optional = context->Input(4); + const Tensor* fc1_experts_bias_optional = context->Input(3); + const Tensor* fc2_experts_weights = context->Input(4); const Tensor* fc2_experts_bias_optional = context->Input(5); + const Tensor* fc3_experts_weights_optional = context->Input(6); + const Tensor* fc3_experts_bias_optional = context->Input(7); + + MoEParameters moe_params(tensor_shards_); + ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc1_experts_bias_optional, + fc2_experts_weights, fc2_experts_bias_optional, fc3_experts_weights_optional, + fc3_experts_bias_optional)); - MoEParameters moe_params; - ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc2_experts_weights, - fc1_experts_bias_optional, fc2_experts_bias_optional)); ORT_RETURN_IF_NOT(moe_params.num_experts % nccl_->Size() == 0, "num_experts should be divisible by world_size"); - ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm); + if (moe_params.parallel_type == MoEParallelType::EP || moe_params.parallel_type == MoEParallelType::EPAndTP) { + ORT_RETURN_IF_ERROR(SynchronizeExpertsStartIndex(allocator, context, copy_event)); + } + + ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm, + fc3_experts_weights_optional != nullptr, + normalize_routing_weights_); size_t ws_size = - moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), - static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), - static_cast(k_)); + moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), 
static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), + static_cast(moe_params.num_experts), static_cast(k_)); size_t fc2_output_size = k_ * moe_params.num_rows * moe_params.hidden_size * sizeof(CudaT); size_t expert_scales_size = k_ * moe_params.num_rows * sizeof(CudaT); @@ -93,19 +105,25 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr expert_for_source_row = IAllocator::MakeUniquePtr(allocator, expert_for_source_row_size, false, stream); - // fc1_scales and fc2_scales are used in quantized MoE - const CudaT* fc1_scales_ptr = nullptr; - const CudaT* fc2_scales_ptr = nullptr; + const CudaT* fc_scales_ptr = nullptr; moe_runner.run_moe_fc(reinterpret_cast(input->template Data()), reinterpret_cast(router_probs->template Data()), reinterpret_cast(fc1_experts_weights->template Data()), - std::move(fc1_scales_ptr), + std::move(fc_scales_ptr), fc1_experts_bias_optional == nullptr ? nullptr : reinterpret_cast(fc1_experts_bias_optional->template Data()), - activation_type_, reinterpret_cast(fc2_experts_weights->template Data()), - std::move(fc2_scales_ptr), static_cast(moe_params.num_rows), + activation_type_, + fc3_experts_weights_optional == nullptr + ? nullptr + : reinterpret_cast(fc3_experts_weights_optional->template Data()), + std::move(fc_scales_ptr), + fc3_experts_bias_optional == nullptr + ? 
nullptr + : reinterpret_cast(fc3_experts_bias_optional->template Data()), + reinterpret_cast(fc2_experts_weights->template Data()), + std::move(fc_scales_ptr), static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), static_cast(moe_params.local_num_experts), static_cast(local_experts_start_index_), @@ -116,31 +134,54 @@ Status ShardedMoE::ComputeInternal(OpKernelContext* context) const { Tensor* output = context->Output(0, input->Shape()); - size_t stride_count = moe_params.hidden_size; - size_t stride_bytes = stride_count * sizeof(CudaT); - int64_t total_past_rows = 0; - int64_t total_covered_rows = 0; - if (copy_event != nullptr) { - CUDA_RETURN_IF_ERROR(cudaEventSynchronize(copy_event)); + if (moe_params.parallel_type == MoEParallelType::None) { + fc2_output_bc = std::move(fc2_output); } - NCCL_RETURN_IF_ERROR(ncclGroupStart()); - for (int rank = 0; rank < nccl_->Size(); ++rank) { - int64_t experts_start_index = rank_to_experts_start_index_[rank]; - moe_runner.get_total_rows_info(experts_start_index, - moe_params.local_num_experts, - total_past_rows, - total_covered_rows); - const char* src = reinterpret_cast(fc2_output.get()) + total_past_rows * stride_bytes; - char* dst = reinterpret_cast(fc2_output_bc.get()) + total_past_rows * stride_bytes; - NCCL_RETURN_IF_ERROR(ncclBroadcast(src, - dst, - total_covered_rows * stride_count, + + if (moe_params.parallel_type == MoEParallelType::EPAndTP) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Expert and Tensor Parallelism is not supported yet"); + } + + if (moe_params.parallel_type == MoEParallelType::TP) { + ORT_ENFORCE(moe_params.tensor_shards == nccl_->Size()); + NCCL_RETURN_IF_ERROR(ncclGroupStart()); + NCCL_RETURN_IF_ERROR(ncclAllReduce(reinterpret_cast(fc2_output.get()), + reinterpret_cast(fc2_output_bc.get()), + fc2_output_size / sizeof(CudaT), GetNcclDataType(input->DataType()), - rank, + ncclSum, nccl_->Comm(), 
Stream(context))); + NCCL_RETURN_IF_ERROR(ncclGroupEnd()); + } + + if (moe_params.parallel_type == MoEParallelType::EP) { + size_t stride_count = moe_params.hidden_size; + size_t stride_bytes = stride_count * sizeof(CudaT); + int64_t total_past_rows = 0; + int64_t total_covered_rows = 0; + if (copy_event != nullptr) { + CUDA_RETURN_IF_ERROR(cudaEventSynchronize(copy_event)); + } + NCCL_RETURN_IF_ERROR(ncclGroupStart()); + for (int rank = 0; rank < nccl_->Size(); ++rank) { + int64_t experts_start_index = rank_to_experts_start_index_[rank]; + moe_runner.get_total_rows_info(experts_start_index, + moe_params.local_num_experts, + total_past_rows, + total_covered_rows); + const char* src = reinterpret_cast(fc2_output.get()) + total_past_rows * stride_bytes; + char* dst = reinterpret_cast(fc2_output_bc.get()) + total_past_rows * stride_bytes; + NCCL_RETURN_IF_ERROR(ncclBroadcast(src, + dst, + total_covered_rows * stride_count, + GetNcclDataType(input->DataType()), + rank, + nccl_->Comm(), + Stream(context))); + } + NCCL_RETURN_IF_ERROR(ncclGroupEnd()); } - NCCL_RETURN_IF_ERROR(ncclGroupEnd()); ort_fastertransformer::finalize_moe_routing_kernelLauncher( reinterpret_cast(fc2_output_bc.get()), reinterpret_cast(output->template MutableData()), diff --git a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h index 5ea4ae59c4020..827283a794dd6 100644 --- a/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h +++ b/onnxruntime/contrib_ops/cuda/collective/sharded_moe.h @@ -26,6 +26,7 @@ class ShardedMoE final : public NcclKernel, public MoEBase { Status SynchronizeExpertsStartIndex(AllocatorPtr& alloc, OpKernelContext* ctx, cudaEvent_t& cuda_event) const; int64_t local_experts_start_index_; + int64_t tensor_shards_; std::vector rank_to_experts_start_index_; }; diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h index 
78d206bf1d9bc..b18a70e899d1c 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/epilogue_helpers.h @@ -83,10 +83,16 @@ namespace ort_fastertransformer { struct EpilogueOpBiasSilu {}; +struct EpilogueOpNoBiasSilu {}; + struct EpilogueOpBiasReLU {}; +struct EpilogueOpNoBiasReLU {}; + struct EpilogueOpBiasFtGelu {}; +struct EpilogueOpNoBiasFtGelu {}; + struct EpilogueOpBias {}; struct EpilogueOpNoBias {}; @@ -101,6 +107,13 @@ struct Epilogue; }; +template +struct Epilogue { + using Op = cutlass::epilogue::thread::LinearCombinationSilu; +}; + template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombinationRelu; }; +template +struct Epilogue { + using Op = cutlass::epilogue::thread::LinearCombinationRelu; +}; + template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombinationGeneric< @@ -116,6 +136,14 @@ struct Epilogue; }; +template +struct Epilogue { + using Op = cutlass::epilogue::thread::LinearCombinationGeneric< + cutlass::epilogue::thread::GELU_taylor, ElementType, ElementsPerVectorAccess, ElementAccumulator, + ElementAccumulator, cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling, + cutlass::FloatRoundStyle::round_to_nearest, true>; +}; + template struct Epilogue { using Op = cutlass::epilogue::thread::LinearCombination struct Epilogue { using Op = - cutlass::epilogue::thread::LinearCombination; + cutlass::epilogue::thread::LinearCombination< + ElementType, ElementsPerVectorAccess, ElementAccumulator, + ElementAccumulator, cutlass::epilogue::thread::ScaleType::OnlyAlphaScaling>; }; } // namespace ort_fastertransformer diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h index 60608f462fde5..e0f91ab806c85 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels.h @@ -42,8 +42,13 @@ class MoeGemmRunner { 
int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, ActivationType activation_type, cudaStream_t stream); - void moe_gemm(const T* A, const WeightType* B, const T* weight_scales, T* C, int64_t* total_rows_before_expert, - int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, cudaStream_t stream); + void moe_gemm_act(const T* A, const WeightType* B, const T* weight_scales, T* C, int64_t* total_rows_before_expert, + int64_t total_rows, int64_t gemm_n, int64_t gemm_k, int num_experts, + ActivationType activation_type, cudaStream_t stream); + + void moe_gemm(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, + int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, int64_t gemm_k, + int num_experts, cudaStream_t stream); private: template diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h index a3dcf0da16b98..2a15fdfd1cc1a 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_gemm_kernels_template.h @@ -311,8 +311,8 @@ void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weig template ::value>::type* = nullptr> void dispatch_moe_gemm_to_cutlass(const T* A, const WeightType* B, const T* weight_scales, const T* biases, T* C, - int64_t* total_rows_before_expert, int64_t /*total_rows*/, int64_t gemm_n, int64_t gemm_k, - int num_experts, CutlassGemmConfig gemm_config, int /*sm_version*/, + int64_t* total_rows_before_expert, int64_t /*total_rows*/, int64_t gemm_n, + int64_t gemm_k, int num_experts, CutlassGemmConfig gemm_config, int /*sm_version*/, int multi_processor_count, cudaStream_t stream, int* occupancy = nullptr) { switch (gemm_config.tile_config) { case CutlassTileConfig::CtaShape128x128x8_WarpShape64x64x8: @@ -429,11 +429,47 @@ void 
MoeGemmRunner::moe_gemm_bias_act(const T* A, const WeightTyp } template -void MoeGemmRunner::moe_gemm(const T* A, const WeightType* B, const T* weight_scales, T* C, - int64_t* total_rows_before_expert, int64_t total_rows, int64_t gemm_n, - int64_t gemm_k, int num_experts, cudaStream_t stream) { - run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, +void MoeGemmRunner::moe_gemm_act(const T* A, const WeightType* B, const T* weight_scales, + T* C, int64_t* total_rows_before_expert, int64_t total_rows, + int64_t gemm_n, int64_t gemm_k, int num_experts, + ActivationType activation_type, cudaStream_t stream) { + switch (activation_type) { + case ActivationType::Relu: + run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, stream); + break; + case ActivationType::Gelu: + run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, stream); + break; + case ActivationType::Silu: + run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, + gemm_k, num_experts, stream); + break; + case ActivationType::Identity: + run_gemm(A, B, weight_scales, nullptr, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, + num_experts, stream); + break; + case ActivationType::InvalidType: + ORT_THROW("[FT Error][MoE Runner] Invalid activation type for MoE GEMM"); + break; + default: { + ORT_THROW("[FT Error][MoE Runner] Invalid activation type for MoE GEMM"); + } + } +} + +template +void MoeGemmRunner::moe_gemm(const T* A, const WeightType* B, const T* weight_scales, const T* biases, + T* C, int64_t* total_rows_before_expert, int64_t total_rows, + int64_t gemm_n, int64_t gemm_k, int num_experts, cudaStream_t stream) { + if (biases != nullptr) { + run_gemm(A, B, weight_scales, biases, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, num_experts, stream); + } else { + run_gemm(A, B, weight_scales, 
nullptr, C, total_rows_before_expert, total_rows, gemm_n, gemm_k, + num_experts, stream); + } } } // namespace ort_fastertransformer diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu index a5b47bcddefbc..5e6e484567988 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.cu @@ -30,7 +30,6 @@ #include "cutlass/array.h" #include "cutlass/numeric_conversion.h" -#include "cutlass/numeric_types.h" #ifdef __GNUC__ #pragma GCC diagnostic pop @@ -49,15 +48,14 @@ #endif namespace ort_fastertransformer { - static constexpr int WARP_SIZE = 32; // ====================== Softmax things =============================== // We have our own implementation of softmax here so we can support transposing the output // in the softmax kernel when we extend this module to support expert-choice routing. template -__launch_bounds__(TPB) __global__ - void moe_softmax(const T* input, const bool* finished, T* output, const int num_cols) { +__launch_bounds__(TPB) __global__ void moe_softmax(const T* input, const bool* finished, T* output, + const int num_cols) { using BlockReduce = cub::BlockReduce; __shared__ typename BlockReduce::TempStorage tmpStorage; @@ -108,14 +106,15 @@ __launch_bounds__(TPB) __global__ #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 template -__launch_bounds__(TPB) __global__ void moe_top_k(const T*, const bool*, T*, int*, int*, int, const int) { +__launch_bounds__(TPB) __global__ void moe_top_k(const T*, const bool*, T*, int*, int*, int, int, bool) { // Does not support pre-Kepler architectures ; } #else template __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax, const bool* finished, T* output, - int* indices, int* source_rows, int num_experts, int k) { + int* indices, int* source_rows, int num_experts, int k, + bool normalize_routing_weights) { using cub_kvp = cub::KeyValuePair; using BlockReduce 
= cub::BlockReduce; __shared__ typename BlockReduce::TempStorage tmpStorage; @@ -128,6 +127,7 @@ __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax, const bool should_process_row = finished ? !finished[block_row] : true; const int thread_read_offset = blockIdx.x * num_experts; + float output_row_sum = 0.f; for (int k_idx = 0; k_idx < k; ++k_idx) { thread_kvp.key = 0; thread_kvp.value = T(-1.f); // This is OK because inputs are probabilities @@ -155,6 +155,13 @@ __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax, output[idx] = result_kvp.value; indices[idx] = should_process_row ? result_kvp.key : num_experts; source_rows[idx] = k_idx * num_rows + block_row; + + if (normalize_routing_weights && k_idx == k - 1) { +#pragma unroll + for (int ki = 0; ki < k; ++ki) { + output[idx - ki] = T(static_cast(output[idx - ki]) / output_row_sum); + } + } } __syncthreads(); } @@ -178,7 +185,7 @@ __launch_bounds__(TPB) __global__ void moe_top_k(const T* inputs_after_softmax, template __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ void topk_gating_softmax(const T* input, const bool* finished, T* output, int num_rows, int* indices, - int* source_rows, int k) { + int* source_rows, int k, bool normalize_routing_weights) { // We begin by enforcing compile time assertions and setting up compile time constants. static_assert(VPT == (VPT & -VPT), "VPT must be power of 2"); static_assert(NUM_EXPERTS == (NUM_EXPERTS & -NUM_EXPERTS), "NUM_EXPERTS must be power of 2"); @@ -296,6 +303,7 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ int start_col = first_elt_read_by_thread; static constexpr int COLS_PER_GROUP_LDG = ELTS_PER_LDG * THREADS_PER_ROW; + float output_row_sum = 0.f; for (int k_idx = 0; k_idx < k; ++k_idx) { // First, each thread does the local argmax float max_val = row_chunk[0]; @@ -336,8 +344,16 @@ __launch_bounds__(WARPS_PER_CTA* WARP_SIZE) __global__ // single) thread per row of the input/output matrices. 
const int idx = k * thread_row + k_idx; output[idx] = T(max_val); + output_row_sum = output_row_sum + static_cast(max_val); indices[idx] = should_process_row ? expert : NUM_EXPERTS; source_rows[idx] = k_idx * num_rows + thread_row; + + if (normalize_routing_weights && k_idx == k - 1) { +#pragma unroll + for (int ki = 0; ki < k; ++ki) { + output[idx - ki] = T(static_cast(output[idx - ki]) / output_row_sum); + } + } } // Finally, we clear the value in the thread with the current max if there is another iteration to run. @@ -370,7 +386,8 @@ struct TopkConstants { template void topk_gating_softmax_launcher_helper(const T* input, const bool* finished, T* output, int* indices, int* source_row, - int num_rows, int /*num_experts*/, int k, cudaStream_t stream) { + int num_rows, int /*num_experts*/, int k, bool normalize_routing_weights, + cudaStream_t stream) { static constexpr unsigned long MAX_BYTES_PER_LDG = 16; static constexpr int BYTES_PER_LDG = std::min((int)MAX_BYTES_PER_LDG, (int)sizeof(T) * EXPERTS); @@ -382,61 +399,63 @@ void topk_gating_softmax_launcher_helper(const T* input, const bool* finished, T dim3 block_dim(WARP_SIZE, WARPS_PER_TB); topk_gating_softmax - <<>>(input, finished, output, num_rows, indices, source_row, k); + <<>>(input, finished, output, num_rows, indices, source_row, k, + normalize_routing_weights); } template void topk_gating_softmax_kernelLauncher(const T* input, const bool* finished, T* output, T* softmax_temp_output, int* indices, int* source_row, int num_rows, int num_experts, - int k, cudaStream_t stream) { + int k, bool normalize_routing_weights, cudaStream_t stream) { static constexpr int WARPS_PER_TB = 4; switch (num_experts) { case 2: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 4: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, 
stream); + num_experts, k, normalize_routing_weights, stream); break; } case 8: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 16: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 32: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 64: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 128: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } case 256: { topk_gating_softmax_launcher_helper(input, finished, output, indices, source_row, num_rows, - num_experts, k, stream); + num_experts, k, normalize_routing_weights, stream); break; } default: { static constexpr int TPB = 256; moe_softmax<<>>(input, finished, softmax_temp_output, num_experts); moe_top_k - <<>>(softmax_temp_output, finished, output, indices, source_row, num_experts, k); + <<>>(softmax_temp_output, finished, output, indices, source_row, num_experts, k, + normalize_routing_weights); } } } @@ -521,25 +540,31 @@ __global__ void dispatch_activations_kernel(int64_t* total_rows_before_expert, i } template -CutlassMoeFCRunner::CutlassMoeFCRunner(int sm_version) { - total_past_rows_ = 0; - total_covered_rows_ = 0; +CutlassMoeFCRunner::CutlassMoeFCRunner(int sm_version, + bool has_fc3, + bool normalize_routing_weights) + : has_fc3_(has_fc3), + total_past_rows_(0), + total_covered_rows_(0), + 
normalize_routing_weights_(normalize_routing_weights) { moe_gemm_runner_.initialize(sm_version); } template -size_t CutlassMoeFCRunner::getWorkspaceSize(int num_rows, const int hidden_size, - const int inter_size, int num_experts, - int k) { - const int buf_size = static_cast(pad_to_multiple_of_16(k * num_rows * hidden_size)); - const int interbuf_size = static_cast(pad_to_multiple_of_16(k * num_rows * inter_size)); - const int padded_experts = static_cast(pad_to_multiple_of_16(num_experts)); - const int num_moe_inputs = static_cast(pad_to_multiple_of_16(k * num_rows)); - int num_softmax_outs = 0; +size_t CutlassMoeFCRunner::getWorkspaceSize(size_t num_rows, const size_t hidden_size, + const size_t inter_size, size_t num_experts, + size_t k) { + total_covered_rows_ = k * num_rows; + + const size_t buf_size = pad_to_multiple_of_16(k * num_rows * hidden_size); + const size_t interbuf_size = pad_to_multiple_of_16(k * num_rows * inter_size); + const size_t padded_experts = pad_to_multiple_of_16(num_experts); + const size_t num_moe_inputs = pad_to_multiple_of_16(k * num_rows); + size_t num_softmax_outs = 0; const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); if (!is_pow_2 || num_experts > 256) { - num_softmax_outs = static_cast(pad_to_multiple_of_16(num_rows * num_experts)); + num_softmax_outs = pad_to_multiple_of_16(num_rows * num_experts); } // softmax output, permuted_rows and permuted_experts have moved to outside of moe kernel, allocate them @@ -548,13 +573,13 @@ size_t CutlassMoeFCRunner::getWorkspaceSize(int num_rows, total_ws_bytes += buf_size * sizeof(T); // permuted_data total_ws_bytes += padded_experts * sizeof(int64_t); // Hold total_rows_before_expert_ total_ws_bytes += num_softmax_outs * sizeof(T); - const int bytes_for_fc1_result = interbuf_size * sizeof(T); - const int sorter_ws_size_bytes = static_cast(pad_to_multiple_of_16(sorter_.getWorkspaceSize(num_rows))); - sorter_.update_num_experts(num_experts); + const size_t 
bytes_for_fc1_result = has_fc3_ ? 2 * interbuf_size * sizeof(T) : interbuf_size * sizeof(T); + const size_t sorter_ws_size_bytes = pad_to_multiple_of_16(sorter_.getWorkspaceSize(num_rows)); + sorter_.update_num_experts(static_cast(num_experts)); - int bytes_for_intermediate_and_sorting = bytes_for_fc1_result; + size_t bytes_for_intermediate_and_sorting = bytes_for_fc1_result; if (sorter_ws_size_bytes > bytes_for_fc1_result) { - int remaining_bytes = static_cast(pad_to_multiple_of_16(sorter_ws_size_bytes - bytes_for_fc1_result)); + size_t remaining_bytes = pad_to_multiple_of_16(sorter_ws_size_bytes - bytes_for_fc1_result); bytes_for_intermediate_and_sorting += remaining_bytes; } @@ -563,13 +588,13 @@ size_t CutlassMoeFCRunner::getWorkspaceSize(int num_rows, } template -void CutlassMoeFCRunner::configure_ws_ptrs(char* ws_ptr, int num_rows, - const int hidden_size, const int inter_size, - int num_experts, int k) { - const int buf_size = static_cast(pad_to_multiple_of_16(k * num_rows * hidden_size)); - const int interbuf_size = static_cast(pad_to_multiple_of_16(k * num_rows * inter_size)); - const int padded_experts = static_cast(pad_to_multiple_of_16(num_experts)); - const int num_moe_inputs = static_cast(pad_to_multiple_of_16(k * num_rows)); +void CutlassMoeFCRunner::configure_ws_ptrs(char* ws_ptr, size_t num_rows, + const size_t hidden_size, const size_t inter_size, + size_t num_experts, size_t k) { + const size_t buf_size = pad_to_multiple_of_16(k * num_rows * hidden_size); + const size_t interbuf_size = pad_to_multiple_of_16(k * num_rows * inter_size); + const size_t padded_experts = pad_to_multiple_of_16(num_experts); + const size_t num_moe_inputs = pad_to_multiple_of_16(k * num_rows); source_rows_ = (int*)ws_ptr; permuted_rows_ = source_rows_ + num_moe_inputs; @@ -578,28 +603,130 @@ void CutlassMoeFCRunner::configure_ws_ptrs(char* ws_ptr, total_rows_before_expert_ = (int64_t*)(permuted_data_ + buf_size); - fc1_result_ = (T*)(total_rows_before_expert_ + 
padded_experts); + if (has_fc3_) { + fc3_result_ = reinterpret_cast(total_rows_before_expert_ + padded_experts); + fc1_result_ = reinterpret_cast(fc3_result_ + interbuf_size); + } else { + fc1_result_ = reinterpret_cast(total_rows_before_expert_ + padded_experts); + } const bool is_pow_2 = (num_experts != 0) && ((num_experts & (num_experts - 1)) == 0); if (!is_pow_2 || num_experts > 256) { - softmax_out_ = (T*)(fc1_result_ + interbuf_size); + softmax_out_ = reinterpret_cast(fc1_result_ + interbuf_size); } else { softmax_out_ = nullptr; } } +namespace { + +struct __align__(8) Half4 { + half2 x; + half2 y; +}; + +// TODO(wy): move to common header +template +struct T4; +template <> +struct T4 { + using Type = float4; +}; +template <> +struct T4 { + using Type = Half4; +}; + +template +struct T2; +template <> +struct T2 { + using Type = float2; +}; +template <> +struct T2 { + using Type = half2; +}; + +inline __device__ float2 operator*(const float2 a, const float2 b) { + return make_float2(a.x * b.x, a.y * b.y); +} + +inline __device__ float4 operator*(const float4 a, const float4 b) { + return make_float4(a.x * b.x, a.y * b.y, a.z * b.z, a.w * b.w); +} + +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 +inline __device__ half operator*(const half a, const half b) { + return __float2half(__half2float(a) * __half2float(b)); +} + +inline __device__ half2 operator*(const half2 a, const half2 b) { + return make_half2(a.x * b.x, a.y * b.y); +} +#endif + +inline __device__ Half4 operator*(const Half4 a, const Half4 b) { +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ < 530 + Half4 result; + result.x = a.x * b.x; + result.y = a.y * b.y; + return result; +#else + return Half4{__hmul2(a.x, b.x), __hmul2(a.y, b.y)}; +#endif +} + +} // anonymous namespace + +template +__global__ void elementWiseMulKernel(T* output, T const* input, size_t inter_size) { + int const tid = threadIdx.x; + int const token = blockIdx.x; + + output = output + token * inter_size; + input = input + token 
* inter_size; + for (int i = tid; i < inter_size; i += blockDim.x) { + T fc1_value = input[i]; + output[i] = fc1_value * output[i]; + } +} + +template +void elementWiseMul(T* output, T const* input, int inter_size, int num_tokens, cudaStream_t stream) { + int const blocks = num_tokens; + + if (inter_size & 3 == 0) { + using vec_type = typename T4::Type; + int const threads = std::min(inter_size / 4, 1024); + elementWiseMulKernel<<>>(reinterpret_cast(output), + reinterpret_cast(input), + inter_size / 4); + } else if (inter_size & 1 == 0) { + using vec_type = typename T2::Type; + int const threads = std::min(inter_size / 2, 1024); + elementWiseMulKernel<<>>(reinterpret_cast(output), + reinterpret_cast(input), + inter_size / 2); + } else { + int const threads = std::min(inter_size, 1024); + elementWiseMulKernel<<>>(output, input, inter_size); + } +} + template void CutlassMoeFCRunner::run_moe_fc( const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, - const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc2_expert_weights, - const T* fc2_scales, int num_rows, const int hidden_size, const int inter_size, int num_experts, - int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, - const bool* finished, int active_rows, T* expert_scales, int* expanded_source_row_to_expanded_dest_row, - int* expert_for_source_row, cudaStream_t stream) { + const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc3_expert_weights, + const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, + int num_rows, const int hidden_size, const int inter_size, int num_experts, int local_num_experts, + int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, const bool* finished, int active_rows, + T* expert_scales, int* expanded_source_row_to_expanded_dest_row, int* 
expert_for_source_row, + cudaStream_t stream) { static constexpr bool scales_required = std::is_same::value || std::is_same::value; - if constexpr (scales_required) { + if (scales_required) { if (fc1_scales == nullptr) { ORT_THROW("[FT Error][Run MoE FC] Scales expected but scale for first matmul is a null pointer"); } else if (fc2_scales == nullptr) { @@ -613,9 +740,10 @@ void CutlassMoeFCRunner::run_moe_fc( } } - configure_ws_ptrs(workspace_ptr, num_rows, hidden_size, inter_size, num_experts, k); + configure_ws_ptrs(workspace_ptr, static_cast(num_rows), static_cast(hidden_size), + static_cast(inter_size), static_cast(num_experts), static_cast(k)); topk_gating_softmax_kernelLauncher(gating_output, finished, expert_scales, softmax_out_, expert_for_source_row, - source_rows_, num_rows, num_experts, k, stream); + source_rows_, num_rows, num_experts, k, normalize_routing_weights_, stream); const int sorter_ws_size_bytes = static_cast(pad_to_multiple_of_16(sorter_.getWorkspaceSize(k * num_rows))); sorter_.run((void*)fc1_result_, sorter_ws_size_bytes, expert_for_source_row, permuted_experts_, source_rows_, @@ -634,15 +762,48 @@ void CutlassMoeFCRunner::run_moe_fc( } // expanded_active_expert_rows is not used - moe_gemm_runner_.moe_gemm_bias_act(permuted_data_ + total_past_rows_ * hidden_size, - fc1_expert_weights, fc1_scales, fc1_expert_biases, - fc1_result_ + total_past_rows_ * inter_size, - total_rows_before_expert_ + local_experts_start_index, - expanded_active_expert_rows, inter_size, hidden_size, - local_num_experts, fc1_activation_type, stream); + if (fc1_expert_biases != nullptr) { + moe_gemm_runner_.moe_gemm_bias_act(permuted_data_ + total_past_rows_ * hidden_size, + fc1_expert_weights, fc1_scales, fc1_expert_biases, + fc1_result_ + total_past_rows_ * inter_size, + total_rows_before_expert_ + local_experts_start_index, + expanded_active_expert_rows, inter_size, hidden_size, + local_num_experts, fc1_activation_type, stream); + } else { + 
moe_gemm_runner_.moe_gemm_act(permuted_data_ + total_past_rows_ * hidden_size, + fc1_expert_weights, fc1_scales, + fc1_result_ + total_past_rows_ * inter_size, + total_rows_before_expert_ + local_experts_start_index, + expanded_active_expert_rows, inter_size, hidden_size, + local_num_experts, fc1_activation_type, stream); + } + + if (has_fc3_) { + if (scales_required) { + if (fc3_scales == nullptr) { + ORT_THROW("[FT Error][Run MoE FC] Scales expected but scale for third matmul is a null pointer"); + } + } else { + if (fc3_scales != nullptr) { + ORT_THROW("[FT Error][Run MoE FC] Scales are ignored for fp32/fp16/bf16 but received scale for FC3"); + } + } + if (fc3_expert_weights == nullptr) { + ORT_THROW("[FT Error][Run MoE FC] FC3 weights are null"); + } + moe_gemm_runner_.moe_gemm(permuted_data_ + total_past_rows_ * hidden_size, + fc3_expert_weights, fc3_scales, fc3_expert_biases, + fc3_result_ + total_past_rows_ * inter_size, + total_rows_before_expert_ + local_experts_start_index, + expanded_active_expert_rows, inter_size, hidden_size, + local_num_experts, stream); + + elementWiseMul(fc1_result_ + total_past_rows_ * inter_size, fc3_result_ + total_past_rows_ * inter_size, + static_cast(inter_size), static_cast(total_covered_rows_), stream); + } moe_gemm_runner_.moe_gemm(fc1_result_ + total_past_rows_ * inter_size, - fc2_expert_weights, fc2_scales, + fc2_expert_weights, fc2_scales, nullptr, fc2_result + total_past_rows_ * hidden_size, total_rows_before_expert_ + local_experts_start_index, expanded_active_expert_rows, hidden_size, inter_size, local_num_experts, stream); @@ -651,14 +812,16 @@ void CutlassMoeFCRunner::run_moe_fc( template void CutlassMoeFCRunner::run_moe_fc( const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, - const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc2_expert_weights, - const T* fc2_scales, int num_rows, const int hidden_size, const int inter_size, 
int num_experts, - int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, T* expert_scales, + const T* fc1_expert_biases, ActivationType fc1_activation_type, const WeightType* fc3_expert_weights, + const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, + int num_rows, const int hidden_size, const int inter_size, int num_experts, int local_num_experts, + int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, T* expert_scales, int* expanded_source_row_to_expanded_dest_row, int* expert_for_source_row, cudaStream_t stream) { run_moe_fc(input_activations, gating_output, fc1_expert_weights, fc1_scales, fc1_expert_biases, fc1_activation_type, - fc2_expert_weights, fc2_scales, num_rows, hidden_size, inter_size, num_experts, local_num_experts, - local_experts_start_index, k, workspace_ptr, fc2_result, nullptr, num_rows, expert_scales, - expanded_source_row_to_expanded_dest_row, expert_for_source_row, stream); + fc3_expert_weights, fc3_scales, fc3_expert_biases, fc2_expert_weights, fc2_scales, num_rows, hidden_size, + inter_size, num_experts, local_num_experts, local_experts_start_index, k, workspace_ptr, fc2_result, + nullptr, num_rows, expert_scales, expanded_source_row_to_expanded_dest_row, expert_for_source_row, + stream); } template @@ -811,9 +974,10 @@ __global__ void finalize_moe_routing_kernel(const T* expanded_permuted_rows, T* const T* expanded_permuted_rows_row_ptr = expanded_permuted_rows + expanded_permuted_row * cols; const int expert_idx = expert_for_source_row[k_offset]; - const T* bias_ptr = bias + expert_idx * cols; + const T* bias_ptr = bias ? bias + expert_idx * cols : nullptr; - thread_output = thread_output + row_scale * (expanded_permuted_rows_row_ptr[tid] + bias_ptr[tid]); + thread_output = thread_output + row_scale * (expanded_permuted_rows_row_ptr[tid] + + (bias_ptr ? 
bias_ptr[tid] : T(0))); } reduced_row_ptr[tid] = thread_output; } @@ -866,9 +1030,9 @@ void finalize_moe_routing_kernelLauncher(const T* expanded_permuted_rows, T* red // ========================= TopK Softmax specializations =========================== template void topk_gating_softmax_kernelLauncher(const float*, const bool*, float*, float*, int*, int*, int, - int, int, cudaStream_t); + int, int, bool, cudaStream_t); template void topk_gating_softmax_kernelLauncher(const half*, const bool*, half*, half*, int*, int*, int, - int, int, cudaStream_t); + int, int, bool, cudaStream_t); // ==================== Variable batched GEMM specializations ================================== template class CutlassMoeFCRunner; diff --git a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h index 5cc2a3f79f003..5eef6f95f4820 100644 --- a/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h +++ b/onnxruntime/contrib_ops/cuda/moe/ft_moe/moe_kernel.h @@ -24,6 +24,8 @@ #include "core/common/common.h" #include "contrib_ops/cuda/bert/transformer_cuda_common.h" +#include "cutlass/numeric_types.h" + using namespace onnxruntime; namespace ort_fastertransformer { @@ -107,12 +109,13 @@ template class CutlassMoeFCRunner { public: - CutlassMoeFCRunner(int sm_version); + CutlassMoeFCRunner(int sm_version, bool has_fc3, bool normalize_routing_weights); - size_t getWorkspaceSize(int num_rows, int hidden_size, int inter_size, int num_experts, int k); + size_t getWorkspaceSize(size_t num_rows, size_t hidden_size, size_t inter_size, size_t num_experts, size_t k); void run_moe_fc(const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, const T* fc1_expert_biases, ActivationType fc1_activation_type, + const WeightType* fc3_expert_weights, const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, int num_rows, int hidden_size, int inter_size, 
int num_experts, int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, T* expert_scales, int* expanded_source_row_to_expanded_dest_row, @@ -120,6 +123,7 @@ class CutlassMoeFCRunner { void run_moe_fc(const T* input_activations, const T* gating_output, const WeightType* fc1_expert_weights, const T* fc1_scales, const T* fc1_expert_biases, ActivationType fc1_activation_type, + const WeightType* fc3_expert_weights, const T* fc3_scales, const T* fc3_expert_biases, const WeightType* fc2_expert_weights, const T* fc2_scales, int num_rows, int hidden_size, int inter_size, int num_experts, int local_num_experts, int local_experts_start_index, int k, char* workspace_ptr, T* fc2_result, const bool* finished, int active_rows, T* expert_scales, @@ -135,7 +139,8 @@ class CutlassMoeFCRunner { int64_t& total_covered_rows); private: - void configure_ws_ptrs(char* ws_ptr, int num_rows, int hidden_size, int inter_size, int num_experts, int k); + void configure_ws_ptrs(char* ws_ptr, size_t num_rows, size_t hidden_size, size_t inter_size, size_t num_experts, + size_t k); private: CubKeyValueSorter sorter_; @@ -152,12 +157,17 @@ class CutlassMoeFCRunner { int64_t* total_rows_before_expert_; T* fc1_result_; + T* fc3_result_; + + bool has_fc3_; + bool normalize_routing_weights_; // Cuda events contrib::cuda::AutoDestoryCudaEvent cuda_event_; int64_t total_past_rows_; int64_t total_covered_rows_; + // TODO: use pinned memory std::vector total_rows_before_expert_host_; }; @@ -165,11 +175,11 @@ class CutlassMoeFCRunner { template class CutlassMoeFCRunner::value>> { public: - CutlassMoeFCRunner(int sm_version); + CutlassMoeFCRunner(int sm_version, bool has_fc3, bool normalize_routing_weights); - size_t getWorkspaceSize(int num_rows, int hidden_size, int inter_size, int num_experts, int k) { + size_t getWorkspaceSize(size_t num_rows, size_t hidden_size, size_t inter_size, size_t num_experts, size_t k) { return 0; } }; -} // namespace 
ort_fastertransformer \ No newline at end of file +} // namespace ort_fastertransformer diff --git a/onnxruntime/contrib_ops/cuda/moe/moe.cc b/onnxruntime/contrib_ops/cuda/moe/moe.cc index 3f26a274109ad..b13aab959fc48 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe.cc +++ b/onnxruntime/contrib_ops/cuda/moe/moe.cc @@ -39,13 +39,16 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { const Tensor* input = context->Input(0); const Tensor* router_probs = context->Input(1); const Tensor* fc1_experts_weights = context->Input(2); - const Tensor* fc2_experts_weights = context->Input(3); - const Tensor* fc1_experts_bias_optional = context->Input(4); + const Tensor* fc1_experts_bias_optional = context->Input(3); + const Tensor* fc2_experts_weights = context->Input(4); const Tensor* fc2_experts_bias_optional = context->Input(5); + const Tensor* fc3_experts_weights_optional = context->Input(6); + const Tensor* fc3_experts_bias_optional = context->Input(7); MoEParameters moe_params; - ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc2_experts_weights, - fc1_experts_bias_optional, fc2_experts_bias_optional)); + ORT_RETURN_IF_ERROR(CheckInputs(moe_params, input, router_probs, fc1_experts_weights, fc1_experts_bias_optional, + fc2_experts_weights, fc2_experts_bias_optional, fc3_experts_weights_optional, + fc3_experts_bias_optional)); typedef typename ToCudaType::MappedType CudaT; auto stream = context->GetComputeStream(); @@ -53,12 +56,14 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { auto& device_prop = GetDeviceProp(); const int sm = device_prop.major * 10 + device_prop.minor; - ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm); + ort_fastertransformer::CutlassMoeFCRunner moe_runner(sm, + fc3_experts_weights_optional != nullptr, + normalize_routing_weights_); size_t ws_size = - moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), - 
static_cast(moe_params.inter_size), static_cast(moe_params.num_experts), - static_cast(k_)); + moe_runner.getWorkspaceSize(static_cast(moe_params.num_rows), static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), + static_cast(moe_params.num_experts), static_cast(k_)); size_t fc2_output_size = k_ * moe_params.num_rows * moe_params.hidden_size * sizeof(CudaT); size_t expert_scales_size = k_ * moe_params.num_rows * sizeof(CudaT); size_t expanded_source_row_to_expanded_dest_row_size = k_ * moe_params.num_rows * sizeof(int); @@ -77,26 +82,37 @@ Status MoE::ComputeInternal(OpKernelContext* context) const { IAllocatorUniquePtr expert_for_source_row = IAllocator::MakeUniquePtr(allocator, expert_for_source_row_size, false, stream); - // fc1_scales and fc2_scales are used in quantized MoE - const CudaT* fc1_scales_ptr = nullptr; - const CudaT* fc2_scales_ptr = nullptr; - + const CudaT* fc_scales_ptr = nullptr; moe_runner.run_moe_fc(reinterpret_cast(input->template Data()), reinterpret_cast(router_probs->template Data()), - reinterpret_cast(fc1_experts_weights->template Data()), - std::move(fc1_scales_ptr), + reinterpret_cast(fc1_experts_weights->DataRaw()), + fc_scales_ptr, fc1_experts_bias_optional == nullptr ? nullptr : reinterpret_cast(fc1_experts_bias_optional->template Data()), - activation_type_, reinterpret_cast(fc2_experts_weights->template Data()), - std::move(fc2_scales_ptr), static_cast(moe_params.num_rows), - static_cast(moe_params.hidden_size), static_cast(moe_params.inter_size), - static_cast(moe_params.num_experts), static_cast(moe_params.local_num_experts), - 0 /*local_experts_start_index_ used in sharded MoE*/, static_cast(k_), - reinterpret_cast(work_space.get()), reinterpret_cast(fc2_output.get()), + activation_type_, + fc3_experts_weights_optional == nullptr + ? nullptr + : reinterpret_cast(fc3_experts_weights_optional->DataRaw()), + fc_scales_ptr, + fc3_experts_bias_optional == nullptr + ? 
nullptr + : reinterpret_cast(fc3_experts_bias_optional->template Data()), + reinterpret_cast(fc2_experts_weights->DataRaw()), + fc_scales_ptr, + static_cast(moe_params.num_rows), + static_cast(moe_params.hidden_size), + static_cast(moe_params.inter_size), + static_cast(moe_params.num_experts), + static_cast(moe_params.local_num_experts), + 0 /*local_experts_start_index_ used in sharded MoE*/, + static_cast(k_), + reinterpret_cast(work_space.get()), + reinterpret_cast(fc2_output.get()), reinterpret_cast(expert_scales.get()), reinterpret_cast(expanded_source_row_to_expanded_dest_row.get()), - reinterpret_cast(expert_for_source_row.get()), Stream(context)); + reinterpret_cast(expert_for_source_row.get()), + Stream(context)); Tensor* output = context->Output(0, input->Shape()); diff --git a/onnxruntime/contrib_ops/cuda/moe/moe_base.h b/onnxruntime/contrib_ops/cuda/moe/moe_base.h index f55a7cde2e208..84a5e8c7c120d 100644 --- a/onnxruntime/contrib_ops/cuda/moe/moe_base.h +++ b/onnxruntime/contrib_ops/cuda/moe/moe_base.h @@ -13,16 +13,22 @@ namespace cuda { enum class MoEParallelType { None = 0, - ExpertSlicing = 1, + EP = 1, + TP = 2, + EPAndTP = 3, }; struct MoEParameters { + MoEParameters() {} + explicit MoEParameters(int64_t tensor_shards) : tensor_shards(tensor_shards) {} int64_t num_rows; int64_t num_experts; int64_t local_num_experts; int64_t hidden_size; int64_t inter_size; + MoEParallelType parallel_type; + int64_t tensor_shards{1}; }; class MoEBase { @@ -31,9 +37,11 @@ class MoEBase { const Tensor* input, const Tensor* router_probs, const Tensor* fc1_experts_weights, - const Tensor* fc2_experts_weights, const Tensor* fc1_experts_bias_optional, - const Tensor* fc2_experts_bias_optional) const { + const Tensor* fc2_experts_weights, + const Tensor* fc2_experts_bias_optional, + const Tensor* fc3_experts_weights_optional, + const Tensor* fc3_experts_bias_optional) const { const auto& input_dims = input->Shape().GetDims(); const auto& router_probs_dims = 
router_probs->Shape().GetDims(); const auto& fc1_experts_weights_dims = fc1_experts_weights->Shape().GetDims(); @@ -83,12 +91,6 @@ class MoEBase { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "router_probs_dims[0] must be equal to num_rows, got ", router_probs_dims[0], " and ", num_rows); } - if (fc1_experts_bias_optional != nullptr && fc2_experts_bias_optional == nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_bias is set but fc2_experts_bias is not set"); - } - if (fc1_experts_bias_optional == nullptr && fc2_experts_bias_optional != nullptr) { - return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "fc1_experts_bias is not set but fc2_experts_bias is set"); - } if (fc1_experts_bias_optional != nullptr && fc2_experts_bias_optional != nullptr) { const auto& fc1_experts_bias_dims = fc1_experts_bias_optional->Shape().GetDims(); const auto& fc2_experts_bias_dims = fc2_experts_bias_optional->Shape().GetDims(); @@ -126,15 +128,38 @@ class MoEBase { } } + if (fc3_experts_weights_optional != nullptr && + fc3_experts_weights_optional->Shape().GetDims() != fc1_experts_weights_dims) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "fc3_experts_weights_dims must be equal to fc1_experts_weights_dims, got ", + fc3_experts_weights_optional->Shape().GetDims(), " and ", fc1_experts_weights_dims); + } + + if (fc3_experts_bias_optional != nullptr && fc1_experts_bias_optional != nullptr && + fc3_experts_bias_optional->Shape().GetDims() != fc1_experts_bias_optional->Shape().GetDims()) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, + "fc3_experts_bias_dims must be equal to fc1_experts_bias_dims, got ", + fc3_experts_bias_optional->Shape().GetDims(), " and ", + fc1_experts_bias_optional->Shape().GetDims()); + } + parameters.num_rows = num_rows; parameters.num_experts = num_experts; parameters.local_num_experts = local_num_experts; parameters.hidden_size = hidden_size; parameters.inter_size = inter_size; if (num_experts == 
local_num_experts) { - parameters.parallel_type = MoEParallelType::None; + if (parameters.tensor_shards == 1) { + parameters.parallel_type = MoEParallelType::None; + } else { + parameters.parallel_type = MoEParallelType::TP; + } } else if (num_experts > local_num_experts) { - parameters.parallel_type = MoEParallelType::ExpertSlicing; + if (parameters.tensor_shards == 1) { + parameters.parallel_type = MoEParallelType::EP; + } else { + parameters.parallel_type = MoEParallelType::EPAndTP; + } } else { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "num_experts must be greater than or equal to local_num_experts, got ", @@ -161,8 +186,11 @@ class MoEBase { } else { ORT_THROW("Unsupported MoE activation type: ", activation_type_str); } + + normalize_routing_weights_ = op_kernel_info.GetAttrOrDefault("normalize_routing_weights", 0) == 1; } + bool normalize_routing_weights_; int64_t k_; ort_fastertransformer::ActivationType activation_type_; }; diff --git a/onnxruntime/core/graph/contrib_ops/collective_defs.cc b/onnxruntime/core/graph/contrib_ops/collective_defs.cc index 4aa43f5de1cd5..a0ca2e45f153a 100644 --- a/onnxruntime/core/graph/contrib_ops/collective_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/collective_defs.cc @@ -91,10 +91,18 @@ void RegisterCollectiveOps() { "Number of top experts to select from expert pool", AttributeProto::INT, static_cast(1)) + .Attr("normalize_routing_weights", + "Whether to normalize routing weights", + AttributeProto::INT, + static_cast(0)) .Attr("local_experts_start_index", "The start index of local experts", AttributeProto::INT, - static_cast(-1)) + static_cast(0)) + .Attr("tensor_shards", + "Tensor parallelism config. 
The number of shards for each expert weight and bias", + AttributeProto::INT, + static_cast(1)) .Input(0, "input", "2D input tensor with shape (num_rows, hidden_size) or " @@ -106,22 +114,32 @@ void RegisterCollectiveOps() { "T") .Input(2, "fc1_experts_weights", - "3D input tensor with shape (local_num_experts, hidden_size, inter_size)", + "3D input tensor with shape (local_num_experts, hidden_size, local_inter_size)", "T") .Input(3, - "fc2_experts_weights", - "3D input tensor with shape (local_num_experts, inter_size, hidden_size)", - "T") - .Input(4, "fc1_experts_bias", - "2D optional input tensor with shape (local_num_experts, inter_size)", + "2D optional input tensor with shape (local_num_experts, local_inter_size)", "T", OpSchema::Optional) + .Input(4, + "fc2_experts_weights", + "3D input tensor with shape (local_num_experts, local_inter_size, hidden_size)", + "T") .Input(5, "fc2_experts_bias", "2D optional input tensor with shape (num_experts, hidden_size)", "T", OpSchema::Optional) + .Input(6, + "fc3_experts_weights", + "3D optional input tensor with shape (local_num_experts, hidden_size, local_inter_size)", + "T", + OpSchema::Optional) + .Input(7, + "fc3_experts_bias", + "2D optional input tensor with shape (local_num_experts, local_inter_size)", + "T", + OpSchema::Optional) .Output(0, "output", "2D input tensor with shape (num_rows, hidden_size) or " diff --git a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc index 6709398c788f0..82cc16acad582 100644 --- a/onnxruntime/core/graph/contrib_ops/contrib_defs.cc +++ b/onnxruntime/core/graph/contrib_ops/contrib_defs.cc @@ -1382,8 +1382,8 @@ ONNX_MS_OPERATOR_SET_SCHEMA(Sampling, 1, constexpr const char* MoE_ver1_doc = R"DOC( Mixture of experts. 
Examples: Switch transformer(https://arxiv.org/pdf/2101.03961.pdf) use top 1, - GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, and Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) - usually uses top 32 experts. + GLaM(https://arxiv.org/abs/2112.06905) activates top 2 FFN, Vision MOE(https://arxiv.org/pdf/2106.05974.pdf) + usually uses top 32 experts and Mixtral(https://huggingface.co/blog/mixtral). )DOC"; ONNX_MS_OPERATOR_SET_SCHEMA(MoE, 1, @@ -1391,12 +1391,15 @@ ONNX_MS_OPERATOR_SET_SCHEMA(MoE, 1, .SetDoc(MoE_ver1_doc) .Attr("activation_type", "Activation function to use. Choose from relu, gelu, silu and identity. Default is relu", AttributeProto::STRING, std::string("relu")) .Attr("k", "Number of top experts to select from expert pool", AttributeProto::INT, static_cast(1)) + .Attr("normalize_routing_weights", "Whether to normalize routing weights", AttributeProto::INT, static_cast(0)) .Input(0, "input", "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)", "T") .Input(1, "router_probs", "2D input tensor with shape (num_rows, num_experts)", "T") .Input(2, "fc1_experts_weights", "3D input tensor with shape (num_experts, hidden_size, inter_size)", "T") - .Input(3, "fc2_experts_weights", "3D input tensor with shape (num_experts, inter_size, hidden_size)", "T") - .Input(4, "fc1_experts_bias", "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) + .Input(3, "fc1_experts_bias", "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) + .Input(4, "fc2_experts_weights", "3D input tensor with shape (num_experts, inter_size, hidden_size)", "T") .Input(5, "fc2_experts_bias", "2D optional input tensor with shape (num_experts, hidden_size)", "T", OpSchema::Optional) + .Input(6, "fc3_experts_weights", "3D optional input tensor with shape (num_experts, hidden_size, inter_size)", "T", OpSchema::Optional) + .Input(7, 
"fc3_experts_bias", "2D optional input tensor with shape (num_experts, inter_size)", "T", OpSchema::Optional) .Output(0, "output", "2D input tensor with shape (num_rows, hidden_size) or 3D input tensor with shape (batch_size, sequence_length, hidden_size)", "T") .TypeConstraint("T", {"tensor(float)", "tensor(float16)"}, "Constrain input and output types to float or float16 tensors.") .TypeAndShapeInferenceFunction(ONNX_NAMESPACE::propagateShapeAndTypeFromFirstInput)); diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index bed2f677166d6..1cd3532846114 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -543,7 +543,7 @@ struct _IsNan { template <> struct _IsNan { __device__ __inline__ bool operator()(half a) const { - return static_cast(*reinterpret_cast(&a) & ~MLFloat16::kSignMask) + return static_cast(*reinterpret_cast(&a) & ~MLFloat16::kSignMask) > MLFloat16::kPositiveInfinityBits; } }; @@ -551,7 +551,7 @@ struct _IsNan { template <> struct _IsNan { __device__ __inline__ bool operator()(BFloat16 a) const { - return static_cast(*reinterpret_cast(&a) & ~BFloat16::kSignMask) + return static_cast(*reinterpret_cast(&a) & ~BFloat16::kSignMask) > BFloat16::kPositiveInfinityBits; } }; diff --git a/onnxruntime/test/contrib_ops/moe_test.cc b/onnxruntime/test/contrib_ops/moe_test.cc index e88ef7794cd07..263ace25ddfe0 100644 --- a/onnxruntime/test/contrib_ops/moe_test.cc +++ b/onnxruntime/test/contrib_ops/moe_test.cc @@ -14,6 +14,7 @@ static void RunMoETest( const std::vector& router_probs, const std::vector& fc1_experts_weights, const std::vector& fc2_experts_weights, + const std::vector& fc3_experts_weights, const std::vector& fc1_experts_bias, const std::vector& fc2_experts_bias, const std::vector& output_data, @@ -22,19 +23,23 @@ static void RunMoETest( int hidden_size, int inter_size, std::string activation_type, + int 
normalize_routing_weights = 0, + int top_k = 1, bool use_float16 = false) { int min_cuda_architecture = use_float16 ? 530 : 0; bool enable_cuda = HasCudaEnvironment(min_cuda_architecture); if (enable_cuda) { OpTester tester("MoE", 1, onnxruntime::kMSDomain); - tester.AddAttribute("k", static_cast(1)); + tester.AddAttribute("k", static_cast(top_k)); tester.AddAttribute("activation_type", activation_type); + tester.AddAttribute("normalize_routing_weights", static_cast(normalize_routing_weights)); std::vector input_dims = {num_rows, hidden_size}; std::vector router_probs_dims = {num_rows, num_experts}; std::vector fc1_experts_weights_dims = {num_experts, hidden_size, inter_size}; std::vector fc2_experts_weights_dims = {num_experts, inter_size, hidden_size}; + std::vector fc3_experts_weights_dims = fc1_experts_weights_dims; std::vector fc1_experts_bias_dims = {num_experts, inter_size}; std::vector fc2_experts_bias_dims = {num_experts, hidden_size}; std::vector output_dims = {num_rows, hidden_size}; @@ -43,18 +48,40 @@ static void RunMoETest( tester.AddInput("input", input_dims, ToFloat16(input)); tester.AddInput("router_probs", router_probs_dims, ToFloat16(router_probs)); tester.AddInput("fc1_experts_weights", fc1_experts_weights_dims, ToFloat16(fc1_experts_weights)); + if (!fc1_experts_bias.empty()) { + tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, ToFloat16(fc1_experts_bias)); + } else { + tester.AddOptionalInputEdge(); + } tester.AddInput("fc2_experts_weights", fc2_experts_weights_dims, ToFloat16(fc2_experts_weights)); - tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, ToFloat16(fc1_experts_bias)); - tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, ToFloat16(fc2_experts_bias)); + if (!fc2_experts_bias.empty()) { + tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, ToFloat16(fc2_experts_bias)); + } else { + tester.AddOptionalInputEdge(); + } + if (!fc3_experts_weights.empty()) { + tester.AddInput("fc3_experts_weights", 
fc3_experts_weights_dims, ToFloat16(fc3_experts_weights)); + } tester.AddOutput("output", output_dims, ToFloat16(output_data)); tester.SetOutputTolerance(0.005f); } else { tester.AddInput("input", input_dims, input); tester.AddInput("router_probs", router_probs_dims, router_probs); tester.AddInput("fc1_experts_weights", fc1_experts_weights_dims, fc1_experts_weights); + if (!fc1_experts_bias.empty()) { + tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, fc1_experts_bias); + } else { + tester.AddOptionalInputEdge(); + } tester.AddInput("fc2_experts_weights", fc2_experts_weights_dims, fc2_experts_weights); - tester.AddInput("fc1_experts_bias", fc1_experts_bias_dims, fc1_experts_bias); - tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, fc2_experts_bias); + if (!fc2_experts_bias.empty()) { + tester.AddInput("fc2_experts_bias", fc2_experts_bias_dims, fc2_experts_bias); + } else { + tester.AddOptionalInputEdge(); + } + if (!fc3_experts_weights.empty()) { + tester.AddInput("fc3_experts_weights", fc3_experts_weights_dims, fc3_experts_weights); + } tester.AddOutput("output", output_dims, output_data); tester.SetOutputTolerance(0.001f); } @@ -233,6 +260,7 @@ TEST(MoETest, MoETest_Gelu) { router_probs, fc1_experts_weights, fc2_experts_weights, + {}, fc1_experts_bias, fc2_experts_bias, output, @@ -411,6 +439,7 @@ TEST(MoETest, MoETest_Relu) { router_probs, fc1_experts_weights, fc2_experts_weights, + {}, fc1_experts_bias, fc2_experts_bias, output, @@ -421,5 +450,143 @@ TEST(MoETest, MoETest_Relu) { "relu"); } +TEST(MoETest, MoETest_Mixtral) { + int num_rows = 6; + int num_experts = 8; + int hidden_size = 4; + int inter_size = 8; + + const std::vector input = { + 0.9212995f, 0.5282444f, -0.008228387f, -1.449332f, -0.6051824f, -0.17924511f, 0.1995587f, -1.2461947f, + 0.86708033f, 0.19191018f, 1.1600108f, -0.008815222f, 0.8504777f, -0.84964496f, -1.4019964f, 0.17225051f, + 0.35569248f, 1.2056456f, 1.3690308f, -0.69495815f, 1.4324434f, 0.22761835f, -1.1286871f, 
1.124213f}; + const std::vector router_probs = { + -0.09331456f, -0.47121337f, 0.07311103f, 0.47643483f, 0.21135253f, -0.72226393f, -0.048502743f, 0.39447474f, + -0.9014899f, -0.36629856f, -0.23088816f, -0.099606544f, -0.45191774f, -0.30394578f, 0.6266495f, 0.67937183f, + 0.27117345f, -0.36059442f, 0.81510246f, 0.61359257f, 0.07649982f, -0.44949868f, -0.54758865f, 0.4736983f, + 0.21584567f, 0.21296778f, 0.093342215f, -0.09353682f, 0.61422515f, 0.19574627f, 0.0063361377f, -0.2465148f, + 0.15675665f, -0.4546509f, 0.24447554f, 0.5921611f, -0.18192923f, -0.66116416f, -0.40265432f, 0.33475468f, + 1.2906091f, 0.4709078f, 0.16256471f, 0.19308007f, 0.97568524f, 0.25876164f, -0.7964541f, -1.0319631f}; + const std::vector fc1_experts_weights = { + 0.3860137f, 0.077925384f, 0.13434184f, 0.28902978f, 0.25391752f, -0.38351142f, 0.15813059f, 0.031481862f, + 0.083209574f, 0.4039817f, -0.13558972f, -0.21858627f, -0.30475253f, 0.41026944f, -0.008697987f, -0.3412701f, + -0.16235226f, 0.054659843f, 0.21042877f, 0.28863233f, -0.49495423f, 0.14401567f, 0.39130414f, 0.154176f, + 0.30897498f, -0.15768659f, 0.44641107f, 0.089463115f, -0.19318026f, 0.20710677f, -0.3552568f, -0.17219114f, + 0.41923493f, -0.4233985f, -0.41503525f, 0.19466156f, -0.08633667f, 0.45547962f, -0.054792404f, 0.26722562f, + -0.09923202f, 0.3460176f, -0.49708033f, -0.41033173f, 0.10443485f, -0.39646107f, -0.37424505f, 0.1757198f, + 0.43019837f, -0.13757241f, 0.14305532f, 0.37121457f, 0.2581259f, 0.12583363f, 0.45542932f, 0.16247797f, + 0.15579104f, -0.19166303f, -0.109221935f, -0.36702687f, 0.40365517f, -0.21506298f, -0.36697525f, -0.2703231f, + -0.49740213f, -0.3486371f, 0.24005288f, -0.0048963428f, 0.20468098f, -0.09111178f, -0.1485982f, -0.088219464f, + 0.33463532f, -0.49346995f, 0.42075223f, -0.38025302f, -0.245484f, -0.35191745f, 0.3086716f, -0.2423737f, + 0.37881732f, -0.40608948f, 0.26193494f, -0.4283861f, -0.10062629f, -0.32670784f, -0.16040438f, -0.15297079f, + 0.1822241f, 0.37285012f, 0.12654608f, 
-0.46767431f, -0.28775263f, 0.16585541f, -0.36678362f, -0.4759978f, + -0.34751755f, -0.3163945f, -0.3858195f, -0.38030273f, -0.06156373f, -0.04352224f, -0.4041785f, -0.335764f, + -0.10303855f, -0.4009425f, -0.1236487f, -0.40111196f, 0.23985302f, -0.118291676f, -0.26773083f, 0.121197104f, + 0.3702919f, -0.34168184f, 0.33743858f, 0.24873763f, -0.23140603f, -0.25351608f, 0.48291886f, 0.13780516f, + 0.25632292f, -0.49343884f, 0.08369112f, -0.37192065f, -0.05451995f, -0.44571918f, -0.24150735f, 0.27395487f, + -0.20423341f, -0.024149835f, 0.40208143f, -0.18211937f, -0.19767642f, -0.19397742f, -0.1510992f, 0.48074025f, + 0.18377024f, -0.18288034f, 0.08111167f, 0.12729281f, 0.27861303f, 0.0076527f, 0.36356348f, -0.24359548f, + -0.33313757f, -0.374829f, -0.08705664f, 0.23576546f, -0.39819986f, -0.09880793f, -0.012998581f, -0.36475456f, + -0.32685202f, 0.29657948f, -0.4631365f, -0.06320876f, 0.31600899f, 0.060619473f, 0.39029974f, 0.401151f, + 0.15562236f, 0.43565983f, -0.058149397f, 0.36150748f, 0.10750586f, -0.063970566f, -0.47026545f, -0.3035437f, + -0.38143605f, -0.4734699f, 0.31273925f, -0.43410504f, 0.07299572f, 0.47506f, 0.021913886f, -0.036100805f, + -0.31637233f, 0.37718338f, -0.046213806f, 0.19239199f, 0.13676548f, 0.33592474f, -0.34048676f, -0.11097133f, + -0.41569126f, -0.01680845f, 0.31357706f, 0.0943895f, -0.24053341f, -0.018784225f, 0.40659577f, 0.08897692f, + 0.3793823f, -0.3271106f, 0.067666054f, -0.12331611f, -0.010209799f, -0.48908865f, 0.19195485f, -0.45211792f, + 0.48282713f, 0.4363466f, -0.40184838f, -0.025082052f, -0.31057972f, 0.14850605f, 0.39756012f, -0.25782883f, + 0.3181312f, 0.17685872f, -0.16694272f, -0.41516554f, -0.062004805f, -0.33060408f, -0.13665432f, -0.43781847f, + -0.298562f, 0.013283849f, 0.48130906f, -0.27970356f, 0.20347959f, -0.24402553f, -0.20528454f, -0.114435256f, + 0.12556863f, -0.4344011f, 0.2868948f, 0.19894183f, -0.12849897f, -0.18726158f, -0.4850099f, -0.4352169f, + -0.40527463f, 0.13625044f, -0.49707252f, -0.45698053f, 
0.28196156f, 0.16826987f, -0.25944453f, 0.2801003f, + 0.21121234f, -0.04066527f, 0.45854944f, -0.17861038f, 0.18178529f, 0.17789757f, 0.34227383f, 0.26976448f, + 0.15789884f, 0.22840887f, 0.419321f, -0.14490443f, 0.39608955f, -0.4162954f, -0.47072983f, 0.41119635f}; + const std::vector fc2_experts_weights = { + 0.10833451f, 0.34020698f, -0.18258394f, -0.17842063f, -0.07365984f, -0.29177922f, -0.24102151f, 0.1077901f, + 0.2932343f, -0.35068116f, 0.1875877f, 0.07474385f, -0.20955177f, -0.27660736f, -0.14290786f, -0.09014153f, + -0.21085852f, -0.2378315f, 0.21457997f, 0.21074237f, -0.21087126f, 0.14320332f, -0.08389844f, 0.24034885f, + 0.31800103f, 0.12659892f, 0.20224877f, -0.2563875f, 0.11782206f, 0.29377612f, -0.27469966f, -0.18875091f, + 0.32136288f, 0.0788243f, -0.26413083f, 0.18453442f, 0.0776935f, -0.19561274f, 0.12608862f, 0.18579696f, + 0.045481127f, -0.17894714f, 0.27366453f, 0.13220324f, -0.3115706f, -0.016884197f, -0.3328494f, -0.062126897f, + 0.14841764f, 0.19741052f, 0.08211302f, -0.09362138f, -0.053040292f, -0.090344846f, 0.18264277f, 0.037823465f, + -0.16197139f, -0.20172869f, 0.064109616f, -0.062456656f, 0.30368346f, -0.12107184f, -0.12590908f, -0.10535928f, + 0.1978099f, 0.13119277f, 0.21948591f, -0.080250844f, -0.24614547f, 0.33202717f, 0.2645375f, -0.21193951f, + 0.17770219f, -0.04986229f, 0.33435768f, -0.0309231f, 0.16043694f, -0.0027341924f, -0.08339601f, -0.17402375f, + 0.2525901f, -0.0813988f, -0.2904943f, -0.14452116f, -0.27119386f, -0.2952116f, 0.0794895f, -0.11223866f, + 0.25427446f, 0.16967128f, 0.19531254f, -0.33598322f, -0.16714293f, -0.35097876f, -0.35189477f, 0.2900932f, + 0.26874313f, -0.1322388f, -0.330179f, 0.064027935f, 0.19688474f, -0.20129368f, 0.006225848f, 0.19252343f, + -0.35054854f, -0.31874785f, 0.32238203f, 0.29287276f, 0.03135616f, 0.015792634f, 0.20397249f, -0.3245995f, + 0.21416605f, 0.15667121f, -0.2058509f, 0.23639117f, -0.032677338f, 0.07826358f, -0.04589425f, -0.24935842f, + -0.20834164f, 0.069915086f, -0.26063374f, 
0.13239416f, 0.33705652f, -0.26813045f, -0.17056243f, 0.29919288f, + 0.27704936f, -0.096224755f, 0.13250813f, 0.26709175f, -0.26995474f, 0.3261805f, -0.18062393f, -0.04732303f, + -0.02733084f, 0.050550338f, -0.2937818f, -0.19453493f, -0.34864828f, -0.20862648f, -0.19311349f, 0.17665526f, + -0.2894185f, -0.020016002f, 0.3409702f, -0.18320526f, 0.068286195f, 0.08490415f, 0.30223787f, -0.2386011f, + 0.09405743f, 0.123811804f, 0.31660154f, -0.11290163f, 0.07494662f, -0.24999082f, 0.2075398f, 0.07419645f, + 0.3327035f, -0.09647329f, 0.24138254f, -0.32546985f, 0.033594366f, 0.16555631f, 0.33516192f, -0.32619375f, + 0.20476541f, -0.07724f, 0.018923176f, -0.21126744f, 0.2744358f, -0.23979841f, -0.30413106f, -0.3485449f, + 0.2854276f, 0.14391156f, -0.24802732f, -0.21701548f, -0.122100174f, 0.054206114f, -0.21961808f, 0.13481297f, + -0.07907457f, 0.15763119f, -0.31156835f, 0.29488218f, 0.17039073f, 0.35125035f, -0.17721775f, -0.10516899f, + 0.072144486f, -0.038529005f, -0.058253434f, 0.13062657f, -0.3312356f, -0.15963489f, -0.20129326f, 0.014987925f, + 0.30869225f, 0.283981f, -0.057181682f, 0.15174268f, 0.22181617f, -0.19763571f, 0.28675067f, 0.0003976555f, + -0.34610963f, 0.2931936f, -0.26233214f, 0.19563977f, -0.16886877f, 0.022812065f, 0.080249704f, -0.2798801f, + 0.11531327f, 0.07107194f, -0.34746924f, -0.051920194f, -0.07264093f, 0.27581826f, 0.18536879f, 0.15684144f, + -0.26691115f, -0.22811417f, -0.1498502f, -0.176639f, -0.25876564f, -0.16051741f, -0.0048792143f, -0.08490091f, + 0.18136817f, 0.24729891f, 0.32358363f, -0.09566104f, 0.3074607f, -0.24191524f, -0.21220984f, -0.23039621f, + 0.21154472f, -0.19495378f, 0.002779711f, -0.34692943f, 0.055384878f, 0.25809082f, 0.16814983f, 0.19935164f, + 0.11652225f, 0.1115539f, -0.24407779f, 0.09392998f, 0.33556697f, 0.11422251f, 0.34336287f, -0.33113837f}; + const std::vector fc3_experts_weights = { + 0.45783097f, -0.2863351f, 0.011728346f, -0.43760604f, 0.15407985f, 0.07818556f, 0.0013856292f, -0.34319758f, + -0.16871625f, 
0.12490183f, -0.34154075f, -0.31836903f, -0.46634215f, -0.43996066f, -0.1860516f, -0.2917009f, + -0.1772582f, -0.06599659f, -0.42419833f, 0.49980444f, -0.3283869f, -0.21543652f, -0.034647882f, -0.17114872f, + -0.4837973f, -0.362943f, -0.27533132f, 0.09443748f, -0.16642791f, -0.2993343f, -0.33881485f, -0.39464045f, + 0.31960344f, 0.007296145f, -0.45412838f, -0.024868786f, -0.16298121f, -0.44197202f, 0.07232875f, -0.32362783f, + 0.42969978f, -0.029854119f, -0.18451887f, -0.30145288f, 0.16885209f, -0.30068123f, -0.12948537f, 0.36494362f, + -0.049498677f, 0.12020564f, 0.42106473f, -0.30590254f, 0.31881082f, -0.078908324f, 0.20685762f, -0.22735089f, + -0.11194843f, 0.14011681f, 0.19477749f, -0.44788343f, 0.23084867f, 0.48367476f, -0.19044077f, -0.100233376f, + 0.4191656f, -0.4515314f, -0.3214385f, 0.016065598f, -0.4069137f, -0.17348295f, -0.43329984f, 0.33521235f, + -0.07843453f, -0.4865722f, -0.039011598f, -0.10605621f, 0.4192536f, 0.04063064f, 0.1984514f, 0.49294376f, + -0.056941032f, 0.18582922f, -0.16650558f, -0.17215621f, -0.20009357f, 0.46615022f, 0.47462142f, -0.0766145f, + -0.20405996f, -0.27452308f, -0.16176039f, -0.23940295f, 0.13248974f, 0.23036134f, 0.13154167f, 0.10377723f, + 0.0070211887f, 0.29162645f, 0.34465307f, -0.4058748f, -0.13989884f, -0.12305027f, -0.2541607f, 0.4767149f, + 0.4549045f, -0.108933926f, 0.2452516f, 0.054080307f, 0.33768386f, -0.45279485f, 0.1557768f, 0.17416143f, + -0.42602575f, -0.102350116f, 0.16022503f, 0.14813942f, 0.03982985f, -0.47012872f, -0.14555538f, 0.35645115f, + -0.1909796f, -0.20839584f, -0.28098184f, -0.23085594f, 0.022559166f, -0.23900753f, -0.19561106f, -0.24205637f, + 0.2573983f, -0.2947166f, 0.4568925f, 0.11514187f, 0.18671238f, -0.121082425f, 0.3909887f, -0.10985571f, + -0.19420451f, -0.3255307f, 0.4863913f, 0.007830441f, 0.4648854f, -0.24156213f, 0.22956276f, -0.09216207f, + -0.29428315f, 0.26062596f, 0.14955276f, -0.036366224f, -0.12957954f, 0.08501935f, -0.36796576f, 0.041123867f, + 0.06744653f, -0.0839923f, 
0.17207885f, 0.006872058f, -0.21135789f, 0.3732242f, -0.2683524f, -0.45898575f, + -0.14543939f, 0.30806476f, 0.08574325f, 0.027492225f, -0.38164973f, -0.040038824f, -0.26947904f, -0.09740937f, + 0.26697665f, -0.43565083f, 0.1359719f, 0.12271714f, 0.0149876475f, -0.44011843f, 0.26128954f, -0.42487514f, + -0.24668545f, 0.06113738f, -0.29119557f, 0.194273f, -0.24981815f, 0.3489496f, -0.47321397f, -0.31794417f, + -0.23641628f, 0.44169098f, -0.006898284f, 0.43446392f, -0.39553195f, 0.057907403f, -0.19339961f, -0.08160931f, + 0.4979084f, -0.11149913f, 0.35366338f, -0.16032219f, -0.48278677f, 0.08397317f, 0.4008311f, 0.30288273f, + 0.2546957f, -0.10675722f, 0.069722414f, 0.456497f, -0.19691509f, 0.49017924f, 0.41796166f, -0.2337895f, + -0.3635872f, -0.45445484f, -0.29122698f, -0.4339773f, 0.15762383f, 0.09782606f, -0.27986187f, -0.23860168f, + 0.38454843f, -0.07870716f, 0.15390605f, -0.15793777f, 0.48130733f, 0.288768f, 0.45969498f, -0.4193731f, + -0.3218134f, -0.29914904f, -0.3426242f, 0.06931591f, -0.2633695f, -0.25429398f, 0.25366426f, -0.27700734f, + 0.49418402f, -0.21919805f, 0.041192472f, -0.19817531f, -0.49578953f, 0.48185098f, -0.41920406f, -0.08335745f, + 0.19111753f, -0.07547706f, 0.049694f, 0.13012594f, 0.2617172f, -0.22612399f, 0.32247066f, -0.33702326f, + 0.20062232f, -0.09143996f, -0.063310504f, 0.1885702f, 0.11926836f, 0.3378734f, -0.45973647f, 0.48845494f}; + const std::vector output = { + 0.026516449f, 0.04061616f, 0.04403834f, -0.13644142f, 0.038774252f, 0.024002096f, -0.061423667f, 0.034824893f, + -0.022858473f, 0.04693405f, -0.0120724365f, -0.028846134f, -0.0168579f, -0.07958221f, 0.048179876f, 0.053492386f, + -0.026292695f, -0.009724421f, -0.026503641f, 0.031220898f, 0.04189077f, 0.11775493f, -0.037770163f, -0.0790936f}; + + RunMoETest(input, + router_probs, + fc1_experts_weights, + fc2_experts_weights, + fc3_experts_weights, + {}, + {}, + output, + num_rows, + num_experts, + hidden_size, + inter_size, + "silu", + 1, /*normalize_routing_weights*/ + 2 
/*top_k*/); +} + } // namespace test } // namespace onnxruntime diff --git a/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py b/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py index fd1d58cd2a3b8..ec64f2359f4be 100644 --- a/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py +++ b/onnxruntime/test/python/transformers/sharded_moe/test_sharded_moe.py @@ -24,25 +24,17 @@ def get_size(): return comm.Get_size() -def barrier(): - comm.Barrier() - - def print_out(*args): if get_rank() == 0: print(*args) -def broadcast(data): - comm = MPI.COMM_WORLD - comm.broadcast(data, root=0) - - local_rank = get_rank() ORT_DTYPE = TensorProto.FLOAT16 NP_TYPE = np.float16 if ORT_DTYPE == TensorProto.FLOAT16 else np.float32 -THRESHOLD = 1e-3 +THRESHOLD_TP = 3e-2 +THRESHOLD_EP = 1e-6 def create_moe_onnx_graph( @@ -52,12 +44,17 @@ def create_moe_onnx_graph( hidden_size, inter_size, fc1_experts_weights, - fc2_experts_weights, fc1_experts_bias, + fc2_experts_weights, fc2_experts_bias, - local_experts_start_index=-1, + fc3_experts_weights, + local_experts_start_index=0, + topk=2, + normalize_routing_weights=1, + activation_type="gelu", + tensor_shards=1, ): - use_sharded_moe = local_experts_start_index >= 0 + use_sharded_moe = num_experts > local_num_experts or tensor_shards > 1 nodes = [ ( helper.make_node( @@ -66,14 +63,16 @@ def create_moe_onnx_graph( "input", "router_probs", "fc1_experts_weights", - "fc2_experts_weights", "fc1_experts_bias", + "fc2_experts_weights", "fc2_experts_bias", + "fc3_experts_weights", ], ["output"], "MoE_0", - k=1, - activation_type="gelu", + k=topk, + normalize_routing_weights=normalize_routing_weights, + activation_type=activation_type, domain="com.microsoft", ) if not use_sharded_moe @@ -83,15 +82,18 @@ def create_moe_onnx_graph( "input", "router_probs", "fc1_experts_weights", - "fc2_experts_weights", "fc1_experts_bias", + "fc2_experts_weights", "fc2_experts_bias", + "fc3_experts_weights", ], 
["output"], "MoE_0", - k=1, - activation_type="gelu", + k=topk, + normalize_routing_weights=normalize_routing_weights, + activation_type=activation_type, local_experts_start_index=local_experts_start_index, + tensor_shards=tensor_shards, domain="com.microsoft", ) ), @@ -99,6 +101,7 @@ def create_moe_onnx_graph( fc1_shape = [local_num_experts, hidden_size, inter_size] fc2_shape = [local_num_experts, inter_size, hidden_size] + fc3_shape = fc1_shape initializers = [ helper.make_tensor( @@ -115,6 +118,13 @@ def create_moe_onnx_graph( fc2_experts_weights.flatten(), raw=False, ), + helper.make_tensor( + "fc3_experts_weights", + ORT_DTYPE, + fc3_shape, + fc3_experts_weights.flatten(), + raw=False, + ), ] fc1_bias_shape = [local_num_experts, inter_size] @@ -166,18 +176,18 @@ def create_moe_onnx_graph( return model.SerializeToString() -def test_moe_with_expert_slicing( +def generate_weights_and_initial_model( + num_rows, + num_experts, hidden_size, inter_size, - num_experts, - num_rows, ): - local_experts_start_index = local_rank * num_experts // get_size() - - fc1_experts_weights_all = np.random.rand(num_experts, hidden_size, inter_size).astype(NP_TYPE) - fc2_experts_weights_all = np.random.rand(num_experts, inter_size, hidden_size).astype(NP_TYPE) - fc1_experts_bias_all = np.random.rand(num_experts, inter_size).astype(NP_TYPE) - fc2_experts_bias_all = np.random.rand(num_experts, hidden_size).astype(NP_TYPE) + s = 0.1 + fc1_experts_weights_all = np.random.normal(scale=s, size=(num_experts, hidden_size, inter_size)).astype(NP_TYPE) + fc2_experts_weights_all = np.random.normal(scale=s, size=(num_experts, inter_size, hidden_size)).astype(NP_TYPE) + fc3_experts_weights_all = np.random.normal(scale=s, size=(num_experts, hidden_size, inter_size)).astype(NP_TYPE) + fc1_experts_bias_all = np.random.normal(scale=s, size=(num_experts, inter_size)).astype(NP_TYPE) + fc2_experts_bias_all = np.random.normal(scale=s, size=(num_experts, hidden_size)).astype(NP_TYPE) onnx_model_full = 
create_moe_onnx_graph( num_rows, @@ -186,34 +196,31 @@ def test_moe_with_expert_slicing( hidden_size, inter_size, fc1_experts_weights_all, - fc2_experts_weights_all, fc1_experts_bias_all, + fc2_experts_weights_all, fc2_experts_bias_all, + fc3_experts_weights_all, ) - fc1_experts_weights = fc1_experts_weights_all[ - local_experts_start_index : local_experts_start_index + num_experts // get_size(), :, : - ] - fc2_experts_weights = fc2_experts_weights_all[ - local_experts_start_index : local_experts_start_index + num_experts // get_size(), :, : - ] - fc1_experts_bias = fc1_experts_bias_all[ - local_experts_start_index : local_experts_start_index + num_experts // get_size(), : - ] - - onnx_model_local = create_moe_onnx_graph( - num_rows, - num_experts, - num_experts // get_size(), - hidden_size, - inter_size, - fc1_experts_weights, - fc2_experts_weights, - fc1_experts_bias, + return ( + onnx_model_full, + fc1_experts_weights_all, + fc1_experts_bias_all, + fc2_experts_weights_all, fc2_experts_bias_all, - local_experts_start_index, + fc3_experts_weights_all, ) + +def run_ort_with_parity_check( + onnx_model_full, + onnx_model_local, + num_rows, + hidden_size, + num_experts, + inter_size, + threshold, +): sess_options = onnxruntime.SessionOptions() cuda_provider_options = {"device_id": local_rank} execution_providers = [("CUDAExecutionProvider", cuda_provider_options)] @@ -229,30 +236,161 @@ def test_moe_with_expert_slicing( output = ort_session.run(None, ort_inputs) sharded_output = ort_session_local.run(None, ort_inputs) - assert np.allclose(output[0], sharded_output[0], atol=THRESHOLD, rtol=THRESHOLD) + print_out("max diff:", np.max(np.abs(output[0] - sharded_output[0]))) + assert np.allclose(output[0], sharded_output[0], atol=threshold, rtol=threshold) print_out( - "hidden_size: ", + "hidden_size:", hidden_size, - " inter_size: ", + " inter_size:", inter_size, - " num_experts: ", + " num_experts:", num_experts, - " num_rows: ", + " num_rows:", num_rows, - " world_size: 
", + " world_size:", get_size(), " Parity: OK", ) +def test_moe_with_tensor_parallelism( + hidden_size, + inter_size, + num_experts, + num_rows, + threshold=THRESHOLD_TP, +): + assert inter_size % get_size() == 0 + + ( + onnx_model_full, + fc1_experts_weights_all, + fc1_experts_bias_all, + fc2_experts_weights_all, + fc2_experts_bias_all, + fc3_experts_weights_all, + ) = generate_weights_and_initial_model( + num_rows, + num_experts, + hidden_size, + inter_size, + ) + + fc1_experts_weights = fc1_experts_weights_all[ + :, :, local_rank * inter_size // get_size() : (local_rank + 1) * inter_size // get_size() + ] + fc2_experts_weights = fc2_experts_weights_all[ + :, local_rank * inter_size // get_size() : (local_rank + 1) * inter_size // get_size(), : + ] + fc3_experts_weights = fc3_experts_weights_all[ + :, :, local_rank * inter_size // get_size() : (local_rank + 1) * inter_size // get_size() + ] + fc1_experts_bias = fc1_experts_bias_all[ + :, local_rank * inter_size // get_size() : (local_rank + 1) * inter_size // get_size() + ] + + onnx_model_local = create_moe_onnx_graph( + num_rows, + num_experts, + num_experts, + hidden_size, + inter_size // get_size(), + fc1_experts_weights, + fc1_experts_bias, + fc2_experts_weights, + fc2_experts_bias_all, + fc3_experts_weights, + tensor_shards=get_size(), + ) + + run_ort_with_parity_check( + onnx_model_full, + onnx_model_local, + num_rows, + hidden_size, + num_experts, + inter_size, + threshold, + ) + + +def test_moe_with_expert_parallelism( + hidden_size, + inter_size, + num_experts, + num_rows, + threshold=THRESHOLD_EP, +): + local_experts_start_index = local_rank * num_experts // get_size() + + ( + onnx_model_full, + fc1_experts_weights_all, + fc1_experts_bias_all, + fc2_experts_weights_all, + fc2_experts_bias_all, + fc3_experts_weights_all, + ) = generate_weights_and_initial_model( + num_rows, + num_experts, + hidden_size, + inter_size, + ) + + fc1_experts_weights = fc1_experts_weights_all[ + local_experts_start_index : 
local_experts_start_index + num_experts // get_size(), :, : + ] + fc2_experts_weights = fc2_experts_weights_all[ + local_experts_start_index : local_experts_start_index + num_experts // get_size(), :, : + ] + fc3_experts_weights = fc3_experts_weights_all[ + local_experts_start_index : local_experts_start_index + num_experts // get_size(), :, : + ] + fc1_experts_bias = fc1_experts_bias_all[ + local_experts_start_index : local_experts_start_index + num_experts // get_size(), : + ] + + onnx_model_local = create_moe_onnx_graph( + num_rows, + num_experts, + num_experts // get_size(), + hidden_size, + inter_size, + fc1_experts_weights, + fc1_experts_bias, + fc2_experts_weights, + fc2_experts_bias_all, + fc3_experts_weights, + local_experts_start_index, + ) + + run_ort_with_parity_check( + onnx_model_full, + onnx_model_local, + num_rows, + hidden_size, + num_experts, + inter_size, + threshold, + ) + + class TestMoE(unittest.TestCase): - def test_moe_expert_slicing(self): - for hidden_size in [16, 128]: - for inter_size in [512, 1024]: - for num_experts in [8, 16, 32]: - for num_rows in [16, 128, 512]: - test_moe_with_expert_slicing( + def test_moe_parallelism(self): + for hidden_size in [128, 1024]: + for inter_size in [512, 2048]: + for num_experts in [64]: + for num_rows in [1024]: + print_out("EP") + test_moe_with_expert_parallelism( + hidden_size, + inter_size, + num_experts, + num_rows, + ) + print_out("TP") + test_moe_with_tensor_parallelism( hidden_size, inter_size, num_experts, diff --git a/onnxruntime/test/python/transformers/test_parity_mixtral_moe.py b/onnxruntime/test/python/transformers/test_parity_mixtral_moe.py new file mode 100644 index 0000000000000..90b7da255081a --- /dev/null +++ b/onnxruntime/test/python/transformers/test_parity_mixtral_moe.py @@ -0,0 +1,365 @@ +# -------------------------------------------------------------------------- +# Copyright 2020 The HuggingFace Inc. 
team +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 +# -------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- +import unittest +from collections import OrderedDict + +import numpy +import torch +import torch.nn.functional as F +from onnx import TensorProto, helper +from torch import nn + +import onnxruntime + +torch.manual_seed(42) +numpy.random.seed(42) + +ORT_DTYPE = TensorProto.FLOAT +NP_TYPE = numpy.float16 if ORT_DTYPE == TensorProto.FLOAT16 else numpy.float32 +THRESHOLD = 3e-2 + + +def value_string_of(numpy_array): + arr = numpy_array.flatten() + lines = ["f, ".join([str(v) for v in arr[i : min(i + 8, arr.size)]]) for i in range(0, arr.size, 8)] + return "{\n " + "f,\n ".join(lines) + "f}" + + +def print_tensor(name, numpy_array): + print(f"const std::vector {name} = {value_string_of(numpy_array)};") + + +def create_moe_onnx_graph( + num_rows, + num_experts, + hidden_size, + inter_size, + fc1_experts_weights, + fc2_experts_weights, + fc3_experts_weights, + topk, +): + nodes = [ + helper.make_node( + "MoE", + [ + "input", + "router_probs", + "fc1_experts_weights", + "", + "fc2_experts_weights", + "", + "fc3_experts_weights", + ], + ["output"], + "MoE_0", + k=topk, + normalize_routing_weights=1, + activation_type="silu", + domain="com.microsoft", + ), + ] + + fc1_shape = [num_experts, hidden_size, inter_size] + fc2_shape = [num_experts, inter_size, hidden_size] + fc3_shape = [num_experts, hidden_size, inter_size] + + torch_type = torch.float16 if ORT_DTYPE == TensorProto.FLOAT16 else torch.float32 + + initializers = [ + 
helper.make_tensor( + "fc1_experts_weights", + ORT_DTYPE, + fc1_shape, + fc1_experts_weights.to(torch_type).flatten().tolist(), + raw=False, + ), + helper.make_tensor( + "fc2_experts_weights", + ORT_DTYPE, + fc2_shape, + fc2_experts_weights.to(torch_type).flatten().tolist(), + raw=False, + ), + helper.make_tensor( + "fc3_experts_weights", + ORT_DTYPE, + fc3_shape, + fc3_experts_weights.to(torch_type).flatten().tolist(), + raw=False, + ), + ] + + graph_inputs = [ + helper.make_tensor_value_info("input", ORT_DTYPE, [num_rows, hidden_size]), + ] + + graph_inputs.append( + helper.make_tensor_value_info( + "router_probs", + ORT_DTYPE, + [num_rows, num_experts], + ) + ) + + graph_outputs = [ + helper.make_tensor_value_info("output", ORT_DTYPE, [num_rows, hidden_size]), + ] + + graph = helper.make_graph( + nodes, + "MoE_Graph", + graph_inputs, + graph_outputs, + initializers, + ) + + model = helper.make_model(graph) + return model.SerializeToString() + + +class ClassInstantier(OrderedDict): + def __getitem__(self, key): + content = super().__getitem__(key) + cls, kwargs = content if isinstance(content, tuple) else (content, {}) + return cls(**kwargs) + + +ACT2CLS = { + "silu": nn.SiLU, +} +ACT2FN = ClassInstantier(ACT2CLS) + + +class MixtralConfig: + def __init__( + self, + hidden_size=4096, + intermediate_size=14336, + num_hidden_layers=32, + num_attention_heads=32, + num_key_value_heads=8, + hidden_act="silu", + initializer_range=0.02, + rms_norm_eps=1e-5, + use_cache=True, + rope_theta=1e6, + attention_dropout=0.0, + num_experts_per_tok=2, + num_local_experts=8, + output_router_logits=False, + router_aux_loss_coef=0.001, + ): + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + if num_key_value_heads is None: + num_key_value_heads = num_attention_heads + self.num_key_value_heads = num_key_value_heads + self.hidden_act = hidden_act + 
self.initializer_range = initializer_range + self.rms_norm_eps = rms_norm_eps + self.use_cache = use_cache + self.rope_theta = rope_theta + self.attention_dropout = attention_dropout + self.num_experts_per_tok = num_experts_per_tok + self.num_local_experts = num_local_experts + self.output_router_logits = output_router_logits + self.router_aux_loss_coef = router_aux_loss_coef + + +class MixtralBlockSparseTop2MLP(nn.Module): + def __init__(self, config: MixtralConfig): + super().__init__() + self.ffn_dim = config.intermediate_size + self.hidden_dim = config.hidden_size + + self.w1 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) + self.w2 = nn.Linear(self.ffn_dim, self.hidden_dim, bias=False) + self.w3 = nn.Linear(self.hidden_dim, self.ffn_dim, bias=False) + + self.act_fn = ACT2FN[config.hidden_act] + + def forward(self, hidden_states): + current_hidden_states_1 = self.act_fn(self.w1(hidden_states)) + current_hidden_states_3 = self.w3(hidden_states) + current_hidden_states = current_hidden_states_1 * current_hidden_states_3 + current_hidden_states = self.w2(current_hidden_states) + return current_hidden_states + + +class MixtralSparseMoeBlock(nn.Module): + """ + This implementation is + strictly equivalent to standard MoE with full capacity (no + dropped tokens). It's faster since it formulates MoE operations + in terms of block-sparse operations to accommodate imbalanced + assignments of tokens to experts, whereas standard MoE either + (1) drop tokens at the cost of reduced performance or (2) set + capacity factor to number of experts and thus waste computation + and memory on padding. 
+ """ + + def __init__(self, config, batch_size, sequence_length): + super().__init__() + self.hidden_dim = config.hidden_size + self.ffn_dim = config.intermediate_size + self.num_experts = config.num_local_experts + self.top_k = config.num_experts_per_tok + + # gating + self.gate = nn.Linear(self.hidden_dim, self.num_experts, bias=False) + + self.experts = nn.ModuleList([MixtralBlockSparseTop2MLP(config) for _ in range(self.num_experts)]) + + w1_list = [] + w2_list = [] + w3_list = [] + for i in range(self.num_experts): + w1_list.append(self.experts[i].w1.weight.transpose(0, 1)) + w2_list.append(self.experts[i].w2.weight.transpose(0, 1)) + w3_list.append(self.experts[i].w3.weight.transpose(0, 1)) + + self.moe_experts_weight1 = torch.stack(w1_list, dim=0) + self.moe_experts_weight2 = torch.stack(w2_list, dim=0) + self.moe_experts_weight3 = torch.stack(w3_list, dim=0) + + self.batch_size = batch_size + self.sequence_length = sequence_length + self.moe_onnx_graph = create_moe_onnx_graph( + self.batch_size * self.sequence_length, + self.num_experts, + self.hidden_dim, + self.ffn_dim, + self.moe_experts_weight1, + self.moe_experts_weight2, + self.moe_experts_weight3, + self.top_k, + ) + + self.ort_sess = self.create_ort_session() + + def create_ort_session(self): + from onnxruntime import InferenceSession, SessionOptions + + sess_options = SessionOptions() + + cuda_providers = ["CUDAExecutionProvider"] + if cuda_providers[0] not in onnxruntime.get_available_providers(): + return None + + sess_options.log_severity_level = 2 + ort_session = InferenceSession(self.moe_onnx_graph, sess_options, providers=["CUDAExecutionProvider"]) + + return ort_session + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + """ """ + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits = self.gate(hidden_states) + + routing_weights = 
F.softmax(router_logits, dim=1, dtype=torch.float) + routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1) + + routing_weights /= routing_weights.sum(dim=-1, keepdim=True) + # we cast back to the input dtype + routing_weights = routing_weights.to(hidden_states.dtype) + + final_hidden_states = torch.zeros( + (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device + ) + + # One hot encode the selected experts to create an expert mask + # this will be used to easily index which expert is going to be solicited + expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0) + + # Loop over all available experts in the model and perform the computation on each expert + for expert_idx in range(self.num_experts): + expert_layer = self.experts[expert_idx] + idx, top_x = torch.where(expert_mask[expert_idx]) + + if top_x.shape[0] == 0: + continue + + # in torch it is faster to index using lists than torch tensors + top_x_list = top_x.tolist() + idx_list = idx.tolist() + + # Index the correct hidden states and compute the expert hidden state for + # the current expert. We need to make sure to multiply the output hidden + # states by `routing_weights` on the corresponding tokens (top-1 and top-2) + current_state = hidden_states[None, top_x_list].reshape(-1, hidden_dim) + current_hidden_states = expert_layer(current_state) * routing_weights[top_x_list, idx_list, None] + + # However `index_add_` only supports torch tensors for indexing so we'll use + # the `top_x` tensor here. 
+ final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype)) + final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) + return final_hidden_states # , router_logits + + def ort_forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + batch_size, sequence_length, hidden_dim = hidden_states.shape + hidden_states = hidden_states.view(-1, hidden_dim) + # router_logits: (batch * sequence_length, n_experts) + router_logits = self.gate(hidden_states) + + ort_inputs = { + "input": numpy.ascontiguousarray(hidden_states.detach().numpy().astype(NP_TYPE)), + "router_probs": numpy.ascontiguousarray(router_logits.detach().numpy().astype(NP_TYPE)), + } + + ort_output = None + if self.ort_sess is not None: + ort_output = self.ort_sess.run(None, ort_inputs) + return torch.tensor(ort_output).reshape(batch_size, sequence_length, -1) # , router_logits + + # print_tensor("input", ort_inputs["input"]) + # print_tensor("router_probs", ort_inputs["router_probs"]) + # print_tensor("fc1_experts_weights", self.moe_experts_weight1.detach().numpy()) + # print_tensor("fc2_experts_weights", self.moe_experts_weight2.detach().numpy()) + # print_tensor("fc3_experts_weights", self.moe_experts_weight3.detach().numpy()) + # print_tensor("output", ort_output[0]) + + return None + + def parity_check(self): + hidden_state = torch.randn(self.batch_size, self.sequence_length, self.hidden_dim) + torch_output = self.forward(hidden_state) + ort_output = self.ort_forward(hidden_state) + if ort_output is not None: + assert torch.allclose(torch_output, ort_output, rtol=1e-04, atol=1e-04) + print( + "batch_size:", + self.batch_size, + " sequence_length:", + self.sequence_length, + " max_diff:", + (torch_output - ort_output).abs().max(), + " parity: OK", + ) + + +class TestMixtralMoE(unittest.TestCase): + def test_mixtral_moe_parity(self): + for batch_size in [1, 16]: + for sequence_length in [128, 1024]: + # use small sizes to speed up the 
test + config = MixtralConfig(hidden_size=256, intermediate_size=1024) + mixtral_moe = MixtralSparseMoeBlock(config, batch_size, sequence_length) + mixtral_moe.parity_check() + + +if __name__ == "__main__": + unittest.main() diff --git a/onnxruntime/test/python/transformers/test_parity_moe.py b/onnxruntime/test/python/transformers/test_parity_moe.py index 72ca5d9975c05..dbf6ee7dabb0e 100644 --- a/onnxruntime/test/python/transformers/test_parity_moe.py +++ b/onnxruntime/test/python/transformers/test_parity_moe.py @@ -47,8 +47,8 @@ def create_moe_onnx_graph( hidden_size, inter_size, fc1_experts_weights, - fc2_experts_weights, fc1_experts_bias, + fc2_experts_weights, fc2_experts_bias, ): nodes = [ @@ -58,8 +58,8 @@ def create_moe_onnx_graph( "input", "router_probs", "fc1_experts_weights", - "fc2_experts_weights", "fc1_experts_bias", + "fc2_experts_weights", "fc2_experts_bias", ], ["output"], @@ -250,8 +250,8 @@ def __init__( in_features, hidden_features, self.moe_experts.weight1, - self.moe_experts.weight2, self.moe_experts.bias1, + self.moe_experts.weight2, self.moe_experts.bias2, ) @@ -296,8 +296,6 @@ def ort_run_with_iobinding(self, ort_inputs, repeat=1000): ).data_ptr(), ) - iobinding.synchronize_inputs() - iobinding.bind_output( name="output", device_type="cuda", @@ -308,11 +306,12 @@ def ort_run_with_iobinding(self, ort_inputs, repeat=1000): numpy.zeros(ort_inputs["input"].shape), "cuda", device_id ).data_ptr(), ) - iobinding.synchronize_outputs() s = time.time() for _ in range(repeat): + iobinding.synchronize_inputs() self.ort_sess.run_with_iobinding(iobinding) + iobinding.synchronize_outputs() e = time.time() print(f"MoE cuda kernel time: {(e - s) / repeat * 1000} ms") @@ -356,8 +355,8 @@ def onnx_forward(self, iobinding=False): # print_tensor("input", ort_inputs["input"]) # print_tensor("router_probs", ort_inputs["router_probs"]) # print_tensor("fc1_experts_weights", self.moe_experts.weight1.detach().numpy()) - # print_tensor("fc2_experts_weights", 
self.moe_experts.weight2.detach().numpy()) # print_tensor("fc1_experts_bias", self.moe_experts.bias1.detach().numpy()) + # print_tensor("fc2_experts_weights", self.moe_experts.weight2.detach().numpy()) # print_tensor("fc2_experts_bias", self.moe_experts.bias2.detach().numpy()) # print_tensor("output", ort_output[0]) From 7e18cb4c3509fbf6f9780c5f782a129b3de15d4d Mon Sep 17 00:00:00 2001 From: zesongw Date: Wed, 20 Mar 2024 23:32:57 +0800 Subject: [PATCH 38/55] [WebNN EP] Support MatMul 1D (#19862) ### Description Support MatMul 1D inputs by combining Reshape and ReduceMean. ### Motivation and Context ONNX MatMul can support 1D inputs, which is disabled in `IsOpSupportedImpl`. --- .../webnn/builders/impl/gemm_op_builder.cc | 49 +++++++++++++++++-- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc index 455e0e5f16a42..ed320132169e9 100644 --- a/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc +++ b/onnxruntime/core/providers/webnn/builders/impl/gemm_op_builder.cc @@ -42,6 +42,26 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N if (!GetShape(*input_defs[a_idx], a_shape, logger)) { return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Can not get shape of A."); } + std::vector b_shape; + if (!GetShape(*input_defs[b_idx], b_shape, logger)) { + return ORT_MAKE_STATUS(ONNXRUNTIME, INVALID_ARGUMENT, "Can not get shape of B."); + } + // If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. + bool extended_a_shape = false; + if (a_shape.size() == 1) { + extended_a_shape = true; + a_shape.insert(a_shape.begin(), 1); + a = model_builder.GetBuilder().call("reshape", a, + emscripten::val::array(GetVecUint32FromVecInt64(a_shape))); + } + // If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. 
+ bool extended_b_shape = false; + if (b_shape.size() == 1) { + extended_b_shape = true; + b_shape.push_back(1); + b = model_builder.GetBuilder().call("reshape", b, + emscripten::val::array(GetVecUint32FromVecInt64(b_shape))); + } // The inputs of MatMul must be at least 3D for WebNN CPU backend. Use GEMM for 2D case. // TODO: Remove this workaround when it is fixed in Chromium. if (model_builder.GetWebnnDeviceType() == WebnnDeviceType::CPU && a_shape.size() == 2) { @@ -49,6 +69,27 @@ Status GemmOpBuilder::AddToModelBuilderImpl(ModelBuilder& model_builder, const N } else { output = model_builder.GetBuilder().call("matmul", a, b); } + // If the inputs are both 1D, reduce the output to a scalar. + if (extended_a_shape && extended_b_shape) { + output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array()); + } + // After matrix multiplication the prepended 1 is removed. + else if (extended_a_shape) { + std::vector new_shape; + for (size_t i = 0; i < b_shape.size() - 2; i++) { + new_shape.push_back(narrow(b_shape[i])); + } + new_shape.push_back(narrow(b_shape.back())); + output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + } + // After matrix multiplication the appended 1 is removed. 
+ else if (extended_b_shape) { + std::vector new_shape; + for (size_t i = 0; i < a_shape.size() - 1; i++) { + new_shape.push_back(narrow(a_shape[i])); + } + output = model_builder.GetBuilder().call("reshape", output, emscripten::val::array(new_shape)); + } } else if (op_type == "MatMulInteger") { emscripten::val a_zero_point = emscripten::val::null(); emscripten::val b_zero_point = emscripten::val::null(); @@ -152,10 +193,10 @@ bool GemmOpBuilder::IsOpSupportedImpl(const InitializedTensorSet& initializers, } if (op_type == "MatMul") { - if (a_shape.size() < 2 || b_shape.size() < 2) { - LOGS(logger, VERBOSE) << "Inputs of MatMul must be at least 2D"; - return false; - } + // If the first argument is 1-D, it is promoted to a matrix by prepending a 1 to its dimensions. + // If the second argument is 1-D, it is promoted to a matrix by appending a 1 to its dimensions. + if (a_shape.size() == 1) a_shape.insert(a_shape.begin(), 1); + if (b_shape.size() == 1) b_shape.push_back(1); // WebNN CPU backend has two more constraints. 
// https://source.chromium.org/chromium/chromium/src/+/main:third_party/blink/renderer/modules/ml/webnn/ml_graph_xnnpack.cc;l=1177 From 8adbc09314e80ec9528c3269afe8f30fab8c864b Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 21 Mar 2024 00:02:50 +0800 Subject: [PATCH 39/55] [Fix] Error Python Packaging Pipeline (Training CPU) (#19992) ### Description fix the error caused by https://github.com/microsoft/onnxruntime/pull/19973 --- .../azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml index 0e6e5bd53fab3..4ca122f639551 100644 --- a/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/orttraining-py-packaging-pipeline-cpu.yml @@ -63,7 +63,7 @@ stages: -e BUILD_BUILDNUMBER \ -e ORT_DISABLE_PYTHON_PACKAGE_LOCAL_VERSION \ -e DEFAULT_TRAINING_PACKAGE_DEVICE \ - onnxruntimetrainingcpubuild \ + onnxruntimetrainingcpubuild_$(PythonVersion) \ $(PythonManylinuxDir)/bin/python3 /onnxruntime_src/tools/ci_build/build.py \ --build_dir /build --cmake_generator Ninja \ --config Debug Release \ From 0af5eacc8b300b475cfca06a7cff8369c1b07157 Mon Sep 17 00:00:00 2001 From: Markus Tavenrath Date: Wed, 20 Mar 2024 17:57:29 +0100 Subject: [PATCH 40/55] Fix broken Pooling CUDA NHWC Ops and ensure NCHW / NHWC parity. (#19889) ### Description Fixed all CUDA NHWC Pooling operations which were broken and enabled the NHWC CUDA pooling tests. Disabled all pooling tests which are not supported by the CUDA EP. ### Motivation and Context Ensure parity between CUDA NHWC / NCHW and work towards 100% tests enabled for the CUDA EP / CUDA NHWC EP. 
--------- Co-authored-by: Tianlei Wu --- .../core/providers/cuda/cuda_nhwc_kernels.cc | 7 ++ .../core/providers/cuda/cudnn_common.cc | 29 +++-- .../providers/cuda/nn/max_pool_with_index.cu | 114 +++++++++++++----- .../providers/cuda/nn/max_pool_with_index.h | 2 +- onnxruntime/core/providers/cuda/nn/pool.cc | 90 ++++++++------ onnxruntime/core/providers/cuda/nn/pool.h | 6 +- onnxruntime/core/providers/rocm/nn/pool.cc | 2 +- .../test/providers/cpu/nn/pool_op_test.cc | 112 +++++++++++------ 8 files changed, 252 insertions(+), 110 deletions(-) diff --git a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc index 8fdcaacdb0f29..7afd2d430ec46 100644 --- a/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc +++ b/onnxruntime/core/providers/cuda/cuda_nhwc_kernels.cc @@ -74,6 +74,8 @@ class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kM MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, float, MaxPool); class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, MLFloat16, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, int8_t, MaxPool); +class ONNX_OPERATOR_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 12, uint8_t, MaxPool); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, float, BatchNormalization); class ONNX_OPERATOR_VERSIONED_TYPED_KERNEL_CLASS_NAME(kCudaExecutionProvider, kMSInternalNHWCDomain, 14, 14, double, @@ -165,6 +167,7 @@ Status RegisterCudaNhwcKernels(KernelRegistry& kernel_registry) { kCudaExecutionProvider, kMSInternalNHWCDomain, 10, 10, float, MaxPool)>, BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo, + BuildKernelCreateInfo, + BuildKernelCreateInfo, BuildKernelCreateInfo, BuildKernelCreateInfo 
input_dims, cudnnDataType_t dat TensorPitches pitches(input_dims); InlinedVector dims(rank); InlinedVector strides(rank); - for (int i = 0; i < rank; i++) { - dims[i] = gsl::narrow_cast(input_dims[i]); - strides[i] = gsl::narrow_cast(pitches[i]); - } - if (is_nhwc) { - std::swap(dims[1], dims[rank - 1]); - std::swap(strides[1], strides[rank - 1]); + + if (!is_nhwc) { + for (int i = 0; i < rank; i++) { + dims[i] = gsl::narrow_cast(input_dims[i]); + strides[i] = gsl::narrow_cast(pitches[i]); + } + } else { + // NHWDC <-> NCHWD + + // N + dims[0] = gsl::narrow_cast(input_dims[0]); + strides[0] = gsl::narrow_cast(pitches[0]); + + // HWD + for (int i = 1; i < rank - 1; i++) { + dims[i + 1] = gsl::narrow_cast(input_dims[i]); + strides[i + 1] = gsl::narrow_cast(pitches[i]); + } + + // C + dims[1] = gsl::narrow_cast(input_dims[rank - 1]); + strides[1] = gsl::narrow_cast(pitches[rank - 1]); } CUDNN_RETURN_IF_ERROR(cudnnSetTensorNdDescriptor(tensor_, dataType, static_cast(rank), dims.data(), strides.data())); return Status::OK(); diff --git a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu index ef1155af127d1..9311f044f4ec5 100644 --- a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu +++ b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.cu @@ -7,10 +7,11 @@ #include "core/providers/cuda/cu_inc/common.cuh" #include "core/providers/cuda/shared_inc/fast_divmod.h" +#include "core/providers/cuda/shared_inc/cuda_utils.h" namespace onnxruntime { namespace cuda { -template +template __global__ void MaxPoolWithIndexKernel( int64_t batch, int64_t channels, @@ -44,11 +45,27 @@ __global__ void MaxPoolWithIndexKernel( int id = blockIdx.x * blockDim.x + threadIdx.x; if (id >= output_size) return; + auto compute_offset = + [height, width, depth, channels](int n_index, int c_index, int h_index, int w_index, int d_index) -> int64_t { + if constexpr (Layout == LAYOUT_NCHW) { + return (((n_index * channels + 
c_index) * height + h_index) * width + w_index) * depth + d_index; + } else if constexpr (Layout == LAYOUT_NHWC) { + return (((n_index * height + h_index) * width + w_index) * depth + d_index) * channels + c_index; + } + }; + int d_index, w_index, h_index, c_index, n_index, id_tmp; - fdm_d.divmod(id, id_tmp, d_index); - fdm_w.divmod(id_tmp, id_tmp, w_index); - fdm_h.divmod(id_tmp, id_tmp, h_index); - fdm_c.divmod(id_tmp, n_index, c_index); + if constexpr (Layout == LAYOUT_NCHW) { + fdm_d.divmod(id, id_tmp, d_index); + fdm_w.divmod(id_tmp, id_tmp, w_index); + fdm_h.divmod(id_tmp, id_tmp, h_index); + fdm_c.divmod(id_tmp, n_index, c_index); + } else if constexpr (Layout == LAYOUT_NHWC) { + fdm_c.divmod(id, id_tmp, c_index); + fdm_d.divmod(id_tmp, id_tmp, d_index); + fdm_w.divmod(id_tmp, id_tmp, w_index); + fdm_h.divmod(id_tmp, n_index, h_index); + } int64_t d_start = d_index * stride_d - pad_d; int64_t w_start = w_index * stride_w - pad_w; @@ -64,29 +81,45 @@ __global__ void MaxPoolWithIndexKernel( int64_t d_index_max = -1; int64_t w_index_max = -1; int64_t h_index_max = -1; - int64_t offset = (n_index * channels + c_index) * height * width * depth; + int64_t offset = compute_offset(n_index, c_index, 0, 0, 0); const T* p_slice = p_input + offset; - T maxval = p_slice[h_start * width * depth + w_start * depth + d_start] - (T)1; + T maxval = p_slice[compute_offset(0, 0, h_start, w_start, d_start)] - (T)1; for (int64_t d = d_start; d < d_end; d += dilation_d) { for (int64_t w = w_start; w < w_end; w += dilation_w) { for (int64_t h = h_start; h < h_end; h += dilation_h) { - if (p_slice[h * width * depth + w * depth + d] > maxval) { + auto pool_offset = compute_offset(0, 0, h, w, d); + if (p_slice[pool_offset] > maxval) { h_index_max = h; w_index_max = w; d_index_max = d; - maxval = static_cast(p_slice[h * width * depth + w * depth + d]); + maxval = static_cast(p_slice[pool_offset]); } } } } - p_output[id] = p_input[offset + h_index_max * width * depth + w_index_max * 
depth + d_index_max]; + p_output[id] = p_input[offset + compute_offset(0, 0, h_index_max, w_index_max, d_index_max)]; + if (p_indices) { - p_indices[id] = storage_order == 0 ? offset + h_index_max * width * depth + w_index_max * depth + d_index_max - : offset + h_index_max + w_index_max * height + d_index_max * width * height; + if constexpr (Layout == LAYOUT_NCHW) { + p_indices[id] = storage_order == 0 ? offset + h_index_max * width * depth + w_index_max * depth + d_index_max + : offset + h_index_max + w_index_max * height + d_index_max * width * height; + } else if constexpr (Layout == LAYOUT_NHWC) { + // The tests currently have to be provided in NHWC layout so that tests do not fail. When converting between + // layouts, does it make sense to do an index conversion as well? + // Storing indices in NHWC layout isn't critical as they are supposed to be used by Unpooling operations + // which currently assume that indices reference to Tensors in NHWC layout. + int64_t id_nchw = + (((n_index * channels + c_index) * pooled_height + h_index) * pooled_width + w_index) * pooled_depth + d_index; + int64_t offset_nchw = (n_index * channels + c_index) * width * height * depth; + + p_indices[id_nchw] = (storage_order == 0) + ? offset_nchw + h_index_max * width * depth + w_index_max * depth + d_index_max + : offset_nchw + h_index_max + w_index_max * height + d_index_max * width * height; + } } } -template +template void MaxPoolWithIndex( cudaStream_t stream, const TensorShape& input_shape, @@ -99,14 +132,29 @@ void MaxPoolWithIndex( const T* p_input, T* p_output, int64_t* p_indices) { - int64_t batchs = input_shape[0]; - int64_t channels = input_shape[1]; - int64_t height = input_shape[2]; - int64_t width = kernel_shape.size() > 1 ? input_shape[3] : 1; - int64_t depth = kernel_shape.size() > 2 ? input_shape[4] : 1; - int64_t pooled_height = output_shape[2]; - int64_t pooled_width = kernel_shape.size() > 1 ? 
output_shape[3] : 1; - int64_t pooled_depth = kernel_shape.size() > 2 ? output_shape[4] : 1; + int64_t batchs, channels, height, width, depth; + int64_t pooled_height, pooled_width, pooled_depth; + if constexpr (Layout == LAYOUT_NCHW) { + batchs = input_shape[0]; + channels = input_shape[1]; + height = input_shape[2]; + width = kernel_shape.size() > 1 ? input_shape[3] : 1; + depth = kernel_shape.size() > 2 ? input_shape[4] : 1; + + pooled_height = output_shape[2]; + pooled_width = kernel_shape.size() > 1 ? output_shape[3] : 1; + pooled_depth = kernel_shape.size() > 2 ? output_shape[4] : 1; + } else if constexpr (Layout == LAYOUT_NHWC) { + batchs = input_shape[0]; + height = input_shape[1]; + width = kernel_shape.size() > 1 ? input_shape[2] : 1; + depth = kernel_shape.size() > 2 ? input_shape[3] : 1; + channels = input_shape[input_shape.NumDimensions() - 1]; + + pooled_height = output_shape[1]; + pooled_width = kernel_shape.size() > 1 ? output_shape[2] : 1; + pooled_depth = kernel_shape.size() > 2 ? output_shape[3] : 1; + } int64_t kernel_h = kernel_shape[0]; int64_t kernel_w = kernel_shape.size() > 1 ? kernel_shape[1] : 1; int64_t kernel_d = kernel_shape.size() > 2 ? 
kernel_shape[2] : 1; @@ -130,7 +178,7 @@ void MaxPoolWithIndex( fast_divmod fdm_d(static_cast(pooled_depth)); int blocksPerGrid = (int)((output_size + GridDim::maxThreadsPerBlock - 1) / GridDim::maxThreadsPerBlock); - MaxPoolWithIndexKernel<<>>( + MaxPoolWithIndexKernel<<>>( batchs, channels, height, @@ -162,8 +210,8 @@ void MaxPoolWithIndex( p_indices); } -#define INSTANTIATEMAXPOOLWITHINDEX(T) \ - template void MaxPoolWithIndex( \ +#define INSTANTIATEMAXPOOLWITHINDEX(T, Layout) \ + template void MaxPoolWithIndex( \ cudaStream_t stream, \ const TensorShape& input_shape, \ const TensorShape& output_shape, \ @@ -176,11 +224,19 @@ void MaxPoolWithIndex( T* p_output, \ int64_t* p_indices); -INSTANTIATEMAXPOOLWITHINDEX(float) -INSTANTIATEMAXPOOLWITHINDEX(double) -INSTANTIATEMAXPOOLWITHINDEX(half) -INSTANTIATEMAXPOOLWITHINDEX(int8_t) -INSTANTIATEMAXPOOLWITHINDEX(uint8_t) +INSTANTIATEMAXPOOLWITHINDEX(float, LAYOUT_NCHW) +INSTANTIATEMAXPOOLWITHINDEX(double, LAYOUT_NCHW) +INSTANTIATEMAXPOOLWITHINDEX(half, LAYOUT_NCHW) +INSTANTIATEMAXPOOLWITHINDEX(int8_t, LAYOUT_NCHW) +INSTANTIATEMAXPOOLWITHINDEX(uint8_t, LAYOUT_NCHW) + +#ifdef ENABLE_CUDA_NHWC_OPS +INSTANTIATEMAXPOOLWITHINDEX(float, LAYOUT_NHWC) +INSTANTIATEMAXPOOLWITHINDEX(double, LAYOUT_NHWC) +INSTANTIATEMAXPOOLWITHINDEX(half, LAYOUT_NHWC) +INSTANTIATEMAXPOOLWITHINDEX(int8_t, LAYOUT_NHWC) +INSTANTIATEMAXPOOLWITHINDEX(uint8_t, LAYOUT_NHWC) +#endif } // namespace cuda } // namespace onnxruntime diff --git a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h index 27f5b241cc785..98f14c3f6a626 100644 --- a/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h +++ b/onnxruntime/core/providers/cuda/nn/max_pool_with_index.h @@ -7,7 +7,7 @@ namespace onnxruntime { namespace cuda { -template +template void MaxPoolWithIndex( cudaStream_t stream, const TensorShape& input_shape, diff --git a/onnxruntime/core/providers/cuda/nn/pool.cc 
b/onnxruntime/core/providers/cuda/nn/pool.cc index 8bc96958693bc..4acdcfcf35491 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.cc +++ b/onnxruntime/core/providers/cuda/nn/pool.cc @@ -87,6 +87,8 @@ POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, float, MaxPool<8>, 11, 11, kMSInt POOLING_KERNEL_VERSIONED_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 11, 11, kMSInternalNHWCDomain, true) POOLING_KERNEL_WITH_INDICES(MaxPool, float, MaxPool<8>, 12, kMSInternalNHWCDomain, true) POOLING_KERNEL_WITH_INDICES(MaxPool, MLFloat16, MaxPool<8>, 12, kMSInternalNHWCDomain, true) +POOLING_KERNEL_WITH_INDICES(MaxPool, int8_t, MaxPool<8>, 12, kMSInternalNHWCDomain, true) +POOLING_KERNEL_WITH_INDICES(MaxPool, uint8_t, MaxPool<8>, 12, kMSInternalNHWCDomain, true) POOLING_KERNEL(GlobalMaxPool, float, MaxPool<1>, 1, kMSInternalNHWCDomain, true) POOLING_KERNEL(GlobalMaxPool, MLFloat16, MaxPool<1>, 1, kMSInternalNHWCDomain, true) @@ -145,8 +147,8 @@ class CudnnPoolingDescriptor final { cudnnPoolingDescriptor_t desc_; }; -template -Status Pool::ComputeInternal(OpKernelContext* context) const { +template +Status Pool::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -157,16 +159,21 @@ Status Pool::ComputeInternal(OpKernelContext* context) const } auto kernel_shape = pool_attrs_.kernel_shape; - auto pads = pool_attrs_.pads; auto strides = pool_attrs_.strides; + TensorShapeVector pads = pool_attrs_.pads; if (pool_attrs_.global_pooling) { - kernel_shape.assign(x_dims.begin() + 2, x_dims.end()); - pads.assign(kernel_shape.size(), 0); + if constexpr (Layout == LAYOUT_NCHW) { + kernel_shape.assign(x_dims.begin() + 2, x_dims.end()); + } else if constexpr (Layout == LAYOUT_NHWC) { + kernel_shape.assign(x_dims.begin() + 1, x_dims.end() - 1); + } + pads.assign(2 * kernel_shape.size(), 0); strides.assign(kernel_shape.size(), 1); } - auto out_channel = NHWC ? 
x_shape[3] : x_shape[1]; - auto y_dims = pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, NHWC); + auto out_channel = (Layout == LAYOUT_NHWC) ? x_shape[x_dims.size() - 1] : x_shape[1]; + + auto y_dims = pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, Layout == LAYOUT_NHWC); TensorShape y_shape(y_dims); Tensor* Y = context->Output(0, y_shape); // special case when there is a dim value of 0 in the shape. @@ -178,20 +185,22 @@ Status Pool::ComputeInternal(OpKernelContext* context) const TensorShapeVector x_dims_cudnn(x_dims.begin(), x_dims.end()); TensorShapeVector y_dims_cudnn(y_dims); if (kernel_shape.size() < 2) { - // cudnn only takes 4D or 5D input, so pad dimensions if needed - if (NHWC) { - x_dims_cudnn.insert(x_dims_cudnn.begin() + 1, 1); - y_dims_cudnn.insert(y_dims_cudnn.begin() + 1, 1); - kernel_shape.insert(kernel_shape.begin() + 1, 1); - strides.insert(strides.begin() + 1, 1); - } else { - x_dims_cudnn.push_back(1); - y_dims_cudnn.push_back(1); - kernel_shape.push_back(1); - strides.push_back(1); + // cuDNN only takes 4D or 5D input, so pad dimensions if needed + if constexpr (Layout == LAYOUT_NHWC) { + x_dims_cudnn.insert(x_dims_cudnn.end() - 1, 1); + y_dims_cudnn.insert(y_dims_cudnn.end() - 1, 1); + pads.insert(pads.begin() + pads.size() / 2, 0); + pads.insert(pads.end(), 0); + kernel_shape.insert(kernel_shape.end(), 1); + strides.insert(strides.end(), 1); + } else { // Layout == LAYOUT_NCHW + x_dims_cudnn.insert(x_dims_cudnn.end(), 1); + y_dims_cudnn.insert(y_dims_cudnn.end(), 1); + pads.insert(pads.begin() + pads.size() / 2, 0); + pads.insert(pads.end(), 0); + kernel_shape.insert(kernel_shape.end(), 1); + strides.insert(strides.end(), 1); } - pads.insert(pads.begin() + kernel_shape.size(), 0); - pads.insert(pads.end(), 0); } cudnnPoolingMode_t mode = CUDNN_POOLING_MAX; @@ -208,8 +217,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const const auto beta = Consts::Zero; CudnnTensor x_tensor; CudnnTensor y_tensor; - 
ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); - ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); + ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), Layout == LAYOUT_NHWC)); + ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), Layout == LAYOUT_NHWC)); const auto input_count = x_shape.Size(); const auto output_count = y_shape.Size(); @@ -225,8 +234,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const const auto beta = Consts::Zero; CudnnTensor x_tensor; CudnnTensor y_tensor; - ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); - ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), NHWC)); + ORT_RETURN_IF_ERROR(x_tensor.Set(x_dims_cudnn, CudnnTensor::GetDataType(), Layout == LAYOUT_NHWC)); + ORT_RETURN_IF_ERROR(y_tensor.Set(y_dims_cudnn, CudnnTensor::GetDataType(), Layout == LAYOUT_NHWC)); CUDNN_RETURN_IF_ERROR( PoolingForwardHelper(GetCudnnHandle(context), pooling_desc, &alpha, x_tensor, x_data, &beta, y_tensor, y_data)); @@ -235,8 +244,8 @@ Status Pool::ComputeInternal(OpKernelContext* context) const return Status::OK(); } -template -Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) const { +template +Status Pool, Layout>::ComputeInternal(OpKernelContext* context) const { typedef typename ToCudaType::MappedType CudaT; const Tensor* X = context->Input(0); const TensorShape& x_shape = X->Shape(); @@ -251,12 +260,16 @@ Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) cons auto strides = this->pool_attrs_.strides; if (this->pool_attrs_.global_pooling) { - kernel_shape.assign(x_dims.begin() + 2, x_dims.end()); - pads.assign(kernel_shape.size(), 0); + if constexpr (Layout == LAYOUT_NCHW) { + kernel_shape.assign(x_dims.begin() + 2, x_dims.end()); + } else if constexpr (Layout == LAYOUT_NHWC) { + kernel_shape.assign(x_dims.begin() + 1, x_dims.end() - 1); + } + 
pads.assign(2 * kernel_shape.size(), 0); // x{i}_begin + x{i}_end strides.assign(kernel_shape.size(), 1); } - auto out_channel = NHWC ? x_shape[3] : x_shape[1]; - auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, NHWC); + auto out_channel = Layout == LAYOUT_NHWC ? x_shape[x_shape.NumDimensions() - 1] : x_shape[1]; + auto y_dims = this->pool_attrs_.SetOutputSize(x_shape, out_channel, &pads, Layout == LAYOUT_NHWC); Tensor* Y = context->Output(0, TensorShape(y_dims)); // special case when there is a dim value of 0 in the shape. @@ -265,13 +278,22 @@ Status Pool, NHWC>::ComputeInternal(OpKernelContext* context) cons auto x_data = reinterpret_cast(X->Data()); auto y_data = reinterpret_cast(Y->MutableData()); - Tensor* I = context->Output(1, TensorShape(y_dims)); + // I is in NCHW format and the contained indices use NCHW math to compute the index + auto i_dims = y_dims; + if constexpr (Layout == LAYOUT_NHWC) { + // y_dims in NHWDC format, i_dims has to be in NCHWD format. + i_dims.insert(i_dims.begin() + 1, i_dims.back()); // N*C*HWDC + i_dims.pop_back(); // NCHW + } + + Tensor* I = context->Output(1, TensorShape(i_dims)); if (nullptr != I || !this->pool_attrs_.default_dilations) { auto i_data = nullptr == I ? 
nullptr : I->MutableData(); - MaxPoolWithIndex(this->Stream(context), x_shape, TensorShape(y_dims), kernel_shape, strides, pads, - this->pool_attrs_.dilations, this->pool_attrs_.storage_order, x_data, y_data, i_data); + MaxPoolWithIndex(this->Stream(context), x_shape, TensorShape(y_dims), kernel_shape, + strides, pads, this->pool_attrs_.dilations, + this->pool_attrs_.storage_order, x_data, y_data, i_data); } else { - ORT_RETURN_IF_ERROR((Pool, NHWC>::ComputeInternal(context))); + ORT_RETURN_IF_ERROR((Pool, Layout == LAYOUT_NHWC>::ComputeInternal(context))); } return Status::OK(); } diff --git a/onnxruntime/core/providers/cuda/nn/pool.h b/onnxruntime/core/providers/cuda/nn/pool.h index 8b5152a1565a9..97f7c8b8762d5 100644 --- a/onnxruntime/core/providers/cuda/nn/pool.h +++ b/onnxruntime/core/providers/cuda/nn/pool.h @@ -19,10 +19,10 @@ class Pool : public CudaKernel, public PoolBase { Status ComputeInternal(OpKernelContext* context) const override; }; -template -class Pool, NHWC> final : public Pool, NHWC> { +template +class Pool, Layout> final : public Pool, Layout> { public: - explicit Pool(const OpKernelInfo& info) : Pool, NHWC>(info) {} + explicit Pool(const OpKernelInfo& info) : Pool, Layout>(info) {} Status ComputeInternal(OpKernelContext* context) const override; }; diff --git a/onnxruntime/core/providers/rocm/nn/pool.cc b/onnxruntime/core/providers/rocm/nn/pool.cc index 045c8b55c0b0d..3a82ab598004b 100644 --- a/onnxruntime/core/providers/rocm/nn/pool.cc +++ b/onnxruntime/core/providers/rocm/nn/pool.cc @@ -257,7 +257,7 @@ Status Pool>::ComputeInternal(OpKernelContext* context) const { Tensor* I = context->Output(1, TensorShape(y_dims)); if (nullptr != I || !this->pool_attrs_.default_dilations) { auto i_data = nullptr == I ? 
nullptr : I->MutableData(); - MaxPoolWithIndex( + MaxPoolWithIndex( this->Stream(context), x_shape, TensorShape(y_dims), diff --git a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc index f98b18ddb17eb..c8cf183291518 100644 --- a/onnxruntime/test/providers/cpu/nn/pool_op_test.cc +++ b/onnxruntime/test/providers/cpu/nn/pool_op_test.cc @@ -58,7 +58,7 @@ TEST(PoolTest, MaxPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); // TensorRT: result differs - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } // Only CUDA kernel has float 16 support @@ -117,7 +117,7 @@ TEST(PoolTest, MaxPool_F16) { test.AddInput("X", x_dims, f_X); test.AddOutput("Y", expected_dims, f_Y); // TensorRT: Assertion `!attrs.count("pads")' failed - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } #endif @@ -170,7 +170,7 @@ static void MaxPool_8_WithIndexTest(bool has_index, int64_t storage_order = 0) { : test.AddOutput("Indices", expected_dims, expected_indices_col); } test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kDnnlExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, + {kDnnlExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider, kArmNNExecutionProvider, kOpenVINOExecutionProvider}); } @@ -185,7 +185,7 @@ TEST(PoolTest, MaxPool_8_With_Index) { MaxPool_8_WithIndexTest(true, 1 /*storage_order*/); // col major } -TEST(PoolTest, MaxPool1D) { +TEST(PoolTest, MaxPool1D_case1) { OpTester test("MaxPool"); test.AddAttribute("auto_pad", ""); @@ -200,7 +200,45 @@ TEST(PoolTest, MaxPool1D) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, 
expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(PoolTest, MaxPool1D_case2) { + OpTester test("MaxPool"); + // no padding + test.AddAttribute("auto_pad", "VALID"); + test.AddAttribute("strides", std::vector{1}); + test.AddAttribute("pads", vector{0, 0}); + test.AddAttribute("kernel_shape", vector{2}); + + std::vector x_vals = {1, 2, 3, 4, 5}; + std::vector x_dims = {1, 1, 5}; + // The last dim is (5-2+1)/1 = 4 + std::vector expected_dims = {1, 1, 4}; + std::vector expected_vals = {2, 3, 4, 5}; + + test.AddInput("X", x_dims, x_vals); + test.AddOutput("Y", expected_dims, expected_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); +} + +TEST(PoolTest, MaxPool1D_case3) { + OpTester test("MaxPool"); + test.AddAttribute("auto_pad", ""); + test.AddAttribute("strides", std::vector{1}); + // Pad one element + test.AddAttribute("pads", vector{0, 1}); + test.AddAttribute("kernel_shape", vector{2}); + + std::vector x_vals = {1, 2, 3, 4, 5}; + std::vector x_dims = {1, 1, 5}; + // Since we padded it, the last dim is larger compared to the case above + std::vector expected_dims = {1, 1, 5}; + std::vector expected_vals = {2, 3, 4, 5, 5}; + + test.AddInput("X", x_dims, x_vals); + test.AddOutput("Y", expected_dims, expected_vals); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } static void MaxPool1D_8_WithIndexTest(int64_t storage_order) { @@ -222,7 +260,7 @@ static void MaxPool1D_8_WithIndexTest(int64_t storage_order) { test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } 
TEST(PoolTest, MaxPool1D_8_With_Index) { @@ -249,7 +287,7 @@ static void MaxPool1D_12_WithIndexTest_int8(int64_t storage_order) { test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } static void MaxPool1D_12_WithIndexTest_uint8(int64_t storage_order) { @@ -271,7 +309,7 @@ static void MaxPool1D_12_WithIndexTest_uint8(int64_t storage_order) { test.AddOutput("Y", expected_dims, expected_vals); test.AddOutput("Indices", expected_dims, expected_indices); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool1D_12_With_Index_8bits) { @@ -309,9 +347,9 @@ TEST(PoolTest, MaxPool2D_uint8) { test.AddOutput("Output", output_shape, output); #if defined(OPENVINO_CONFIG_GPU_FP32) || defined(OPENVINO_CONFIG_GPU_FP16) - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kOpenVINOExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kOpenVINOExecutionProvider}); #else - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); #endif } @@ -337,7 +375,7 @@ TEST(PoolTest, MaxPool_10_Dilation_1d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_DefaultDilations) { @@ -357,7 +395,7 @@ TEST(PoolTest, MaxPool_DefaultDilations) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", 
expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_DefaultDilations_int8) { @@ -377,7 +415,7 @@ TEST(PoolTest, MaxPool_DefaultDilations_int8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_DefaultDilations_uint8) { @@ -397,7 +435,7 @@ TEST(PoolTest, MaxPool_DefaultDilations_uint8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_1d) { @@ -451,7 +489,7 @@ TEST(PoolTest, MaxPool_10_Dilation_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_10_Dilation_2d_int8) { @@ -479,7 +517,7 @@ TEST(PoolTest, MaxPool_10_Dilation_2d_int8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_2d) { @@ -536,7 +574,7 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil0_2d) { test.AddInput("X", x_dims, x_vals); 
test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool_12_Dilation_Ceil0_2d_int8) { @@ -565,7 +603,7 @@ TEST(PoolTest, MaxPool_12_Dilation_Ceil0_2d_int8) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) { @@ -595,7 +633,7 @@ TEST(PoolTest, MaxPool_10_Dilation_Ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, MaxPool_10_DilationPadding_3d) { @@ -707,7 +745,7 @@ TEST(PoolTest, GlobalMaxPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); } TEST(PoolTest, GlobalMaxPool3D) { @@ -783,7 +821,7 @@ TEST(PoolTest, GlobalMaxPool3D) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, AveragePool) { @@ -864,7 +902,7 @@ TEST(PoolTest, AveragePool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", 
{kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, AveragePool_IncludePadPixel) { @@ -889,7 +927,7 @@ TEST(PoolTest, AveragePool_IncludePadPixel) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.SetOutputTolerance(0.0001f); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } // test 'strides' attribute not specified @@ -908,7 +946,7 @@ TEST(PoolTest, AveragePool_DefaultStrides) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); } TEST(PoolTest, AveragePool_10_ceil1_2d) { @@ -932,7 +970,7 @@ TEST(PoolTest, AveragePool_10_ceil1_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider}); + {kTensorrtExecutionProvider, kAclExecutionProvider}); } TEST(PoolTest, AveragePool_19_dilation_2d) { @@ -956,7 +994,9 @@ TEST(PoolTest, AveragePool_19_dilation_2d) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, + kTensorrtExecutionProvider, kAclExecutionProvider, kOpenVINOExecutionProvider}); } TEST(PoolTest, GlobalAveragePool) { @@ -1032,7 +1072,7 @@ TEST(PoolTest, 
GlobalAveragePool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); } TEST(PoolTest, GlobalAveragePool_Large_128) { @@ -1045,7 +1085,7 @@ TEST(PoolTest, GlobalAveragePool_Large_128) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals, /*sort_output=*/false, /*rel_error=*/1e-3f, /*abs_error=*/1e-2f); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); } TEST(PoolTest, GlobalAveragePool_Large_256) { @@ -1058,7 +1098,7 @@ TEST(PoolTest, GlobalAveragePool_Large_256) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals, /*sort_output=*/false, /*rel_error=*/1e-3f, /*abs_error=*/1e-2f); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}); } TEST(PoolTest, LpPool) { @@ -1365,7 +1405,7 @@ TEST(PoolTest, LpPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider}); } // test data generated with lp_pool_test_generator.py @@ -1397,7 +1437,8 @@ TEST(PoolTest, LpPool1d) { // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060 // TensorRT does not support 1d pooling - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); y_count++; } } @@ 
-1429,7 +1470,7 @@ TEST(PoolTest, LpPool2d) { test.AddAttribute("kernel_shape", kernel_sizes[kernel_size_count]); test.AddOutput("Y", y_sizes[y_count], ys[y_count]); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider}); y_count++; } } @@ -1447,7 +1488,8 @@ TEST(PoolTest, LpPoolCeilMode) { // https://docs.nvidia.com/deeplearning/tensorrt/api/c_api/classnvinfer1_1_1_i_network_definition.html#a94f434942252e6d98ac17705c06ce060 // TensorRT does not support 1d pooling - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", + {kCudaExecutionProvider, kCudaNHWCExecutionProvider, kTensorrtExecutionProvider}); } TEST(PoolTest, GlobalLpPool) { @@ -1702,7 +1744,7 @@ TEST(PoolTest, GlobalLpPool) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); - test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaNHWCExecutionProvider}); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kCudaExecutionProvider, kCudaNHWCExecutionProvider}); } TEST(PoolTest, MaxPoolDimWithZeroForN) { @@ -1720,7 +1762,7 @@ TEST(PoolTest, MaxPoolDimWithZeroForN) { test.AddInput("X", x_dims, x_vals); test.AddOutput("Y", expected_dims, expected_vals); test.Run(OpTester::ExpectResult::kExpectSuccess, "", - {kCudaNHWCExecutionProvider, kTensorrtExecutionProvider, kQnnExecutionProvider}); + {kTensorrtExecutionProvider, kQnnExecutionProvider}); } } // namespace test From 19ff4a6d6c420d4d43c92d57ff45313f00b49336 Mon Sep 17 00:00:00 2001 From: Adam Pocock Date: Wed, 20 Mar 2024 13:52:00 -0400 Subject: [PATCH 41/55] String Tensor SplitToSequence fix (#19942) --- .../core/providers/cpu/sequence/sequence_ops.cc | 2 +- .../providers/cpu/sequence/sequence_ops_test.cc | 13 +++++++++++++ 2 files changed, 14 insertions(+), 1 
deletion(-) diff --git a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc index 8064bc0a58cb1..2913f4ac32b6e 100644 --- a/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc +++ b/onnxruntime/core/providers/cpu/sequence/sequence_ops.cc @@ -453,7 +453,7 @@ Status SplitToSequence::ComputeImpl(OpKernelContext& context, const Tensor& inpu int num_remaining_splits = 0; InlinedVector split_sizes; const bool is_string_type = input.IsDataTypeString(); - const size_t element_size = (is_string_type) ? 0U : input.DataType()->Size(); + const size_t element_size = input.DataType()->Size(); // figure out split_scalar or split_sizes if (p_split_input) { diff --git a/onnxruntime/test/providers/cpu/sequence/sequence_ops_test.cc b/onnxruntime/test/providers/cpu/sequence/sequence_ops_test.cc index 60e75811e4333..c2d64b8e5ee4a 100644 --- a/onnxruntime/test/providers/cpu/sequence/sequence_ops_test.cc +++ b/onnxruntime/test/providers/cpu/sequence/sequence_ops_test.cc @@ -442,6 +442,19 @@ TEST(SequenceOpsTest, SplitToSequence_PositiveAxisScalarSplit) { test.Run(); } +TEST(SequenceOpsTest, SplitToSequence_StringSplit) { + OpTester test("SplitToSequence", 11); + test.AddInput("input", {3}, std::vector({"Test string", "Another string", "A third and much longer string"})); + int64_t axis = 0; + test.AddAttribute("axis", axis); + SeqTensors output; + output.AddTensor({1}, {"Test string"}); + output.AddTensor({1}, {"Another string"}); + output.AddTensor({1}, {"A third and much longer string"}); + test.AddSeqOutput("S2", output); + test.Run(); +} + TEST(SequenceOpsTest, SplitToSequence_DefaultAxis0UnevenSplitFloat) { OpTester test("SplitToSequence", 11); test.AddInput("input", {5, 2}, GetConsecutiveVector(1.f, 10)); From 6b305f95e0dbbb5e629d45d27a2beb53b6223c00 Mon Sep 17 00:00:00 2001 From: Rachel Guo <35738743+YUNQIUGUO@users.noreply.github.com> Date: Wed, 20 Mar 2024 10:55:19 -0700 Subject: [PATCH 42/55] Support xcframework 
for mac catalyst builds. (#19534) ### Description ### Motivation and Context MAUI on macOS uses mac-catalyst which requires a different native binary. --------- Co-authored-by: rachguo Co-authored-by: Scott McKay --- cmake/adjust_global_compile_flags.cmake | 9 +++ ...maccatalyst_prepare_objects_for_prelink.py | 72 +++++++++++++++++++ cmake/onnxruntime.cmake | 36 ++++++++-- cmake/onnxruntime_mlas.cmake | 6 ++ .../project.pbxproj | 24 ++++--- .../ios_package_test.entitlements | 10 +++ tools/ci_build/build.py | 42 +++++++++-- .../github/apple/build_apple_framework.py | 4 +- ...t_full_apple_framework_build_settings.json | 1 + ...ult_full_ios_framework_build_settings.json | 14 +++- ...training_ios_framework_build_settings.json | 1 + .../github/apple/framework_info.json.template | 2 +- .../github/apple/test_apple_packages.py | 25 +++++++ .../azure-pipelines/templates/c-api-cpu.yml | 3 +- 14 files changed, 225 insertions(+), 24 deletions(-) create mode 100644 cmake/maccatalyst_prepare_objects_for_prelink.py create mode 100644 onnxruntime/test/platform/apple/apple_package_test/ios_package_test/ios_package_test.entitlements diff --git a/cmake/adjust_global_compile_flags.cmake b/cmake/adjust_global_compile_flags.cmake index d3f9256105127..9a3bc3302cc2b 100644 --- a/cmake/adjust_global_compile_flags.cmake +++ b/cmake/adjust_global_compile_flags.cmake @@ -8,6 +8,15 @@ if (CMAKE_SYSTEM_NAME STREQUAL "Android") string(APPEND CMAKE_ASM_FLAGS_RELEASE " -O3") endif() +# Suggested by https://gitlab.kitware.com/cmake/cmake/-/issues/20132 +# MacCatalyst is not well supported in CMake +# The error that can emerge without this flag can look like: +# "clang : error : overriding '-mmacosx-version-min=11.0' option with '-target x86_64-apple-ios14.0-macabi' [-Werror,-Woverriding-t-option]" +if (PLATFORM_NAME STREQUAL "macabi") + add_compile_options(-Wno-overriding-t-option) + add_link_options(-Wno-overriding-t-option) +endif() + # Enable space optimization for gcc/clang # Cannot use 
"-ffunction-sections -fdata-sections" if we enable bitcode (iOS) if (NOT MSVC AND NOT onnxruntime_ENABLE_BITCODE) diff --git a/cmake/maccatalyst_prepare_objects_for_prelink.py b/cmake/maccatalyst_prepare_objects_for_prelink.py new file mode 100644 index 0000000000000..34664b4e05237 --- /dev/null +++ b/cmake/maccatalyst_prepare_objects_for_prelink.py @@ -0,0 +1,72 @@ +#!/usr/bin/env python3 +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. + +import os +import shutil +import sys + + +# Note: This script is mainly used for sanity checking/validating the files in the .a library equal to the .o files +# in the source dir to handle the case of source files having duplicate names under different subdirectories for +# each onnxruntime library. (Only applicable when doing a Mac Catalyst build.) +def main(): + source_dir = sys.argv[1] + dest_dir = sys.argv[2] + files_from_static_lib = sys.argv[3] + files_from_source_dir = [] + for subdir, _, files in os.walk(source_dir): + for file_name in files: + if file_name.endswith(".o"): + files_from_source_dir.append(file_name.strip()) + dest_name_without_extension, _ = os.path.splitext(file_name) + counter = 0 + + dest_file = f"{dest_name_without_extension}.o" + while os.path.exists(os.path.join(dest_dir, dest_file)): + print("Duplicate file name from source: " + os.path.join(source_dir, subdir, file_name)) + counter += 1 + dest_file = f"{dest_name_without_extension}_{counter}.o" + print("Renamed file name in destination: " + os.path.join(dest_dir, dest_file)) + + destination_path = os.path.join(dest_dir, dest_file) + source_file = os.path.join(source_dir, subdir, file_name) + shutil.copy(source_file, destination_path) + + # Sanity check to ensure the number of .o object from the original cmake source directory matches with the number + # of .o files extracted from each .a onnxruntime library + file_lists_from_static_lib = [] + with open(files_from_static_lib) as file: + filenames = 
file.readlines() + for filename in filenames: + file_lists_from_static_lib.append(filename.strip()) + + sorted_list1 = sorted(file_lists_from_static_lib) + sorted_list2 = sorted(files_from_source_dir) + + if len(sorted_list1) != len(sorted_list2): + print( + "Caught a mismatch in the number of .o object files from the original cmake source directory: ", + len(sorted_list1), + "the number of .o files extracted from the static onnxruntime lib: ", + len(sorted_list2), + "for: ", + os.path.basename(source_dir), + ) + + if sorted_list1 == sorted_list2: + print( + "Sanity check passed: object files from original source directory matches with files extracted " + "from static library for: ", + os.path.basename(source_dir), + ) + else: + print( + "Error: Mismatch between object files from original source directory " + "and the .o files extracted from static library for: ", + os.path.basename(source_dir), + ) + + +if __name__ == "__main__": + main() diff --git a/cmake/onnxruntime.cmake b/cmake/onnxruntime.cmake index 2ead13e554197..e15c8a046dc20 100644 --- a/cmake/onnxruntime.cmake +++ b/cmake/onnxruntime.cmake @@ -281,7 +281,13 @@ endif() # Assemble the Apple static framework (iOS and macOS) if(onnxruntime_BUILD_APPLE_FRAMEWORK) - set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT}) + # when building for mac catalyst, the CMAKE_OSX_SYSROOT is set to MacOSX as well, to avoid duplication, + # we specify as `-macabi` in the name of the output static apple framework directory. + if (PLATFORM_NAME STREQUAL "macabi") + set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-macabi) + else() + set(STATIC_FRAMEWORK_OUTPUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/${CMAKE_BUILD_TYPE}-${CMAKE_OSX_SYSROOT}) + endif() # Setup the various directories required. Remove any existing ones so we start with a clean directory. 
set(STATIC_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/static_libraries) @@ -299,18 +305,34 @@ if(onnxruntime_BUILD_APPLE_FRAMEWORK) # to enforce symbol visibility. doing it this way limits the symbols included from the .a files to symbols used # by the ORT .o files. - # If it's an onnxruntime library, extract .o files to a separate directory for each library to avoid any clashes - # with filenames (e.g. utils.o) + # If it's an onnxruntime library, extract .o files from the original cmake build path to a separate directory for + # each library to avoid any clashes with filenames (e.g. utils.o) foreach(_LIB ${onnxruntime_INTERNAL_LIBRARIES} ) GET_TARGET_PROPERTY(_LIB_TYPE ${_LIB} TYPE) if(_LIB_TYPE STREQUAL "STATIC_LIBRARY") set(CUR_STATIC_LIB_OBJ_DIR ${STATIC_LIB_TEMP_DIR}/$) add_custom_command(TARGET onnxruntime POST_BUILD COMMAND ${CMAKE_COMMAND} -E make_directory ${CUR_STATIC_LIB_OBJ_DIR}) - - add_custom_command(TARGET onnxruntime POST_BUILD - COMMAND ar ARGS -x $ - WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR}) + if (PLATFORM_NAME STREQUAL "macabi") + # There exists several duplicate names for source files under different subdirectories within + # each onnxruntime library. (e.g. onnxruntime/contrib_ops/cpu/element_wise_ops.o + # vs. onnxruntime/providers/core/cpu/math/element_wise_ops.o) + # In that case, using 'ar ARGS -x' to extract the .o files from .a lib would possibly cause duplicate naming files being overwritten + # and lead to missing undefined symbol error in the generated binary. + # So we use the below python script as a sanity check to do a recursive find of all .o files in ${CUR_TARGET_CMAKE_SOURCE_LIB_DIR} + # and verifies that matches the content of the .a, and then copy from the source dir. + # TODO: The copying action here isn't really necessary. For future fix, consider using the script extracts from the ar with the rename to potentially + # make both maccatalyst and other builds do the same thing. 
+ set(CUR_TARGET_CMAKE_SOURCE_LIB_DIR ${CMAKE_CURRENT_BINARY_DIR}/CMakeFiles/${_LIB}.dir) + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ar -t $ | grep "\.o$" > ${_LIB}.object_file_list.txt + COMMAND ${CMAKE_COMMAND} -E env python3 ${CMAKE_CURRENT_SOURCE_DIR}/maccatalyst_prepare_objects_for_prelink.py ${CUR_TARGET_CMAKE_SOURCE_LIB_DIR} ${CUR_STATIC_LIB_OBJ_DIR} ${CUR_STATIC_LIB_OBJ_DIR}/${_LIB}.object_file_list.txt + WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR}) + else() + add_custom_command(TARGET onnxruntime POST_BUILD + COMMAND ar ARGS -x $ + WORKING_DIRECTORY ${CUR_STATIC_LIB_OBJ_DIR}) + endif() endif() endforeach() diff --git a/cmake/onnxruntime_mlas.cmake b/cmake/onnxruntime_mlas.cmake index 17de2aa4aaea6..6b7d4402be8eb 100644 --- a/cmake/onnxruntime_mlas.cmake +++ b/cmake/onnxruntime_mlas.cmake @@ -631,6 +631,12 @@ if (WIN32) endif() endif() +if (PLATFORM_NAME STREQUAL "macabi") + # Needed for maccatalyst C compilation + # i.e. the flags below add "--target=x86_64-apple-ios14.0-macabi -ffunction-sections -fdata-sections" + target_compile_options(onnxruntime_mlas PRIVATE ${CMAKE_C_FLAGS}) +endif() + if (NOT onnxruntime_BUILD_SHARED_LIB) install(TARGETS onnxruntime_mlas ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} diff --git a/onnxruntime/test/platform/apple/apple_package_test/apple_package_test.xcodeproj/project.pbxproj b/onnxruntime/test/platform/apple/apple_package_test/apple_package_test.xcodeproj/project.pbxproj index f0582d41734bd..eb7345be3770b 100644 --- a/onnxruntime/test/platform/apple/apple_package_test/apple_package_test.xcodeproj/project.pbxproj +++ b/onnxruntime/test/platform/apple/apple_package_test/apple_package_test.xcodeproj/project.pbxproj @@ -49,6 +49,7 @@ 229E595826586B4A006E41AE /* sigmoid.ort */ = {isa = PBXFileReference; lastKnownFileType = file; path = sigmoid.ort; sourceTree = ""; }; 22C1D8DE271A79AF002CEE67 /* ios_package_testUITests.xctest */ = {isa = PBXFileReference; explicitFileType = wrapper.cfbundle; includeInIndex 
= 0; path = ios_package_testUITests.xctest; sourceTree = BUILT_PRODUCTS_DIR; }; 22C1D8E9271A79FD002CEE67 /* ios_package_uitest_cpp_api.mm */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.cpp.objcpp; path = ios_package_uitest_cpp_api.mm; sourceTree = ""; }; + 513C65792B85789400E4EDFD /* ios_package_test.entitlements */ = {isa = PBXFileReference; lastKnownFileType = text.plist.entitlements; path = ios_package_test.entitlements; sourceTree = ""; }; 51C316B92B0881450033C70B /* macos_package_test.app */ = {isa = PBXFileReference; explicitFileType = wrapper.application; includeInIndex = 0; path = macos_package_test.app; sourceTree = BUILT_PRODUCTS_DIR; }; 51C316BB2B0881450033C70B /* AppDelegate.h */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.h; path = AppDelegate.h; sourceTree = ""; }; 51C316BC2B0881450033C70B /* AppDelegate.m */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.c.objc; path = AppDelegate.m; sourceTree = ""; }; @@ -117,6 +118,7 @@ 229E591E265869BF006E41AE /* ios_package_test */ = { isa = PBXGroup; children = ( + 513C65792B85789400E4EDFD /* ios_package_test.entitlements */, 229E591F265869BF006E41AE /* AppDelegate.h */, 229E5920265869BF006E41AE /* AppDelegate.m */, 229E5928265869BF006E41AE /* Main.storyboard */, @@ -521,8 +523,11 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; + CODE_SIGNING_REQUIRED = NO; + CODE_SIGNING_STYLE = Automatic; + CODE_SIGN_ENTITLEMENTS = ios_package_test/ios_package_test.entitlements; INFOPLIST_FILE = ios_package_test/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", @@ -530,9 +535,9 @@ PRODUCT_BUNDLE_IDENTIFIER = "ai.onnxruntime.tests.ios-package-test"; PRODUCT_NAME = "$(TARGET_NAME)"; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; - SUPPORTS_MACCATALYST = NO; + SUPPORTS_MACCATALYST = 
YES; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Debug; }; @@ -541,8 +546,11 @@ buildSettings = { ASSETCATALOG_COMPILER_APPICON_NAME = AppIcon; ASSETCATALOG_COMPILER_GLOBAL_ACCENT_COLOR_NAME = AccentColor; - CODE_SIGN_STYLE = Automatic; + CODE_SIGNING_REQUIRED = NO; + CODE_SIGNING_STYLE = Automatic; + CODE_SIGN_ENTITLEMENTS = ios_package_test/ios_package_test.entitlements; INFOPLIST_FILE = ios_package_test/Info.plist; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", @@ -550,9 +558,9 @@ PRODUCT_BUNDLE_IDENTIFIER = "ai.onnxruntime.tests.ios-package-test"; PRODUCT_NAME = "$(TARGET_NAME)"; SUPPORTED_PLATFORMS = "iphoneos iphonesimulator"; - SUPPORTS_MACCATALYST = NO; + SUPPORTS_MACCATALYST = YES; SUPPORTS_MAC_DESIGNED_FOR_IPHONE_IPAD = NO; - TARGETED_DEVICE_FAMILY = 1; + TARGETED_DEVICE_FAMILY = "1,2"; }; name = Release; }; @@ -563,7 +571,7 @@ CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; GENERATE_INFOPLIST_FILE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 13.0; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", @@ -585,7 +593,7 @@ CODE_SIGN_STYLE = Automatic; CURRENT_PROJECT_VERSION = 1; GENERATE_INFOPLIST_FILE = YES; - IPHONEOS_DEPLOYMENT_TARGET = 13.0; + IPHONEOS_DEPLOYMENT_TARGET = 14.0; LD_RUNPATH_SEARCH_PATHS = ( "$(inherited)", "@executable_path/Frameworks", diff --git a/onnxruntime/test/platform/apple/apple_package_test/ios_package_test/ios_package_test.entitlements b/onnxruntime/test/platform/apple/apple_package_test/ios_package_test/ios_package_test.entitlements new file mode 100644 index 0000000000000..ee95ab7e582d4 --- /dev/null +++ b/onnxruntime/test/platform/apple/apple_package_test/ios_package_test/ios_package_test.entitlements @@ -0,0 +1,10 @@ + + + + + com.apple.security.app-sandbox + + com.apple.security.network.client + + + diff --git 
a/tools/ci_build/build.py b/tools/ci_build/build.py index 067f151844b1b..fd9f106f7ad9b 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -400,6 +400,12 @@ def convert_arg_line_to_args(self, arg_line): parser.add_argument("--ios", action="store_true", help="build for ios") + parser.add_argument( + "--macos", + choices=["MacOSX", "Catalyst"], + help="Specify the target platform for macOS build. Only specify this argument when --build_apple_framework is present.", + ) + parser.add_argument( "--apple_sysroot", default="", help="Specify the location name of the macOS platform SDK to be used" ) @@ -419,7 +425,7 @@ def convert_arg_line_to_args(self, arg_line): action="store_const", const="Xcode", dest="cmake_generator", - help="Use Xcode as cmake generator, this is only supported on MacOS. Equivalent to '--cmake_generator Xcode'.", + help="Use Xcode as cmake generator, this is only supported on MacOS. (non Catalyst build). Equivalent to '--cmake_generator Xcode'.", ) parser.add_argument( "--osx_arch", @@ -1323,8 +1329,12 @@ def generate_build_tree( if args.use_snpe: cmake_args += ["-Donnxruntime_USE_SNPE=ON"] - if args.build_apple_framework or args.ios: - if not args.cmake_generator == "Xcode": + if args.macos or args.ios: + # Note: Xcode CMake generator doesn't have a good support for Mac Catalyst yet. + if args.macos == "Catalyst" and args.cmake_generator == "Xcode": + raise BuildError("Xcode CMake generator ('--cmake_generator Xcode') doesn't support Mac Catalyst build.") + + if (args.ios or args.macos == "MacOSX") and not args.cmake_generator == "Xcode": raise BuildError( "iOS/MacOS framework build requires use of the Xcode CMake generator ('--cmake_generator Xcode')." 
) @@ -1342,12 +1352,15 @@ def generate_build_tree( "iOS/MacOS framework build on MacOS canceled due to missing arguments: " + ", ".join(val for val, cond in zip(arg_names, needed_args) if not cond) ) + # note: this value is mainly used in framework_info.json file to specify the build osx type + platform_name = "macabi" if args.macos == "Catalyst" else args.apple_sysroot cmake_args += [ "-Donnxruntime_BUILD_SHARED_LIB=ON", "-DCMAKE_OSX_SYSROOT=" + args.apple_sysroot, "-DCMAKE_OSX_DEPLOYMENT_TARGET=" + args.apple_deploy_target, # we do not need protoc binary for ios cross build "-Dprotobuf_BUILD_PROTOC_BINARIES=OFF", + "-DPLATFORM_NAME=" + platform_name, ] if args.ios: cmake_args += [ @@ -1355,6 +1368,21 @@ def generate_build_tree( "-DCMAKE_TOOLCHAIN_FILE=" + (args.ios_toolchain_file if args.ios_toolchain_file else "../cmake/onnxruntime_ios.toolchain.cmake"), ] + # for catalyst build, we need to manually specify cflags for target e.g. x86_64-apple-ios14.0-macabi, etc. + # https://forums.developer.apple.com/forums/thread/122571 + if args.macos == "Catalyst": + macabi_target = f"{args.osx_arch}-apple-ios{args.apple_deploy_target}-macabi" + cmake_args += [ + "-DCMAKE_CXX_COMPILER_TARGET=" + macabi_target, + "-DCMAKE_C_COMPILER_TARGET=" + macabi_target, + "-DCMAKE_CC_COMPILER_TARGET=" + macabi_target, + f"-DCMAKE_CXX_FLAGS=--target={macabi_target}", + f"-DCMAKE_CXX_FLAGS_RELEASE=-O3 -DNDEBUG --target={macabi_target}", + f"-DCMAKE_C_FLAGS=--target={macabi_target}", + f"-DCMAKE_C_FLAGS_RELEASE=-O3 -DNDEBUG --target={macabi_target}", + f"-DCMAKE_CC_FLAGS=--target={macabi_target}", + f"-DCMAKE_CC_FLAGS_RELEASE=-O3 -DNDEBUG --target={macabi_target}", + ] if args.build_wasm: emsdk_dir = os.path.join(cmake_dir, "external", "emsdk") @@ -2740,7 +2768,13 @@ def main(): cmake_extra_args += ["-G", args.cmake_generator] if is_macOS(): - if not args.ios and not args.android and args.osx_arch == "arm64" and platform.machine() == "x86_64": + if ( + not args.ios + and args.macos != 
"Catalyst" + and not args.android + and args.osx_arch == "arm64" + and platform.machine() == "x86_64" + ): if args.test: log.warning("Cannot test ARM64 build on X86_64. Will skip test running after build.") args.test = False diff --git a/tools/ci_build/github/apple/build_apple_framework.py b/tools/ci_build/github/apple/build_apple_framework.py index 7b8a87632f5c7..e17bcd65d8814 100644 --- a/tools/ci_build/github/apple/build_apple_framework.py +++ b/tools/ci_build/github/apple/build_apple_framework.py @@ -50,9 +50,11 @@ def _build_for_apple_sysroot( # Build binary for each arch, one by one for current_arch in archs: build_dir_current_arch = os.path.join(intermediates_dir, sysroot + "_" + current_arch) + # Use MacOS SDK for Catalyst builds + apple_sysroot = "macosx" if sysroot == "macabi" else sysroot build_command = [ *base_build_command, - "--apple_sysroot=" + sysroot, + "--apple_sysroot=" + apple_sysroot, "--osx_arch=" + current_arch, "--build_dir=" + build_dir_current_arch, ] diff --git a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json index 86b4efdc63750..04a73ae450e5f 100644 --- a/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_apple_framework_build_settings.json @@ -23,6 +23,7 @@ "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" ], "macosx": [ + "--macos=MacOSX", "--apple_deploy_target=11.0" ], "iphoneos": [ diff --git a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json index 445bfca9889ff..4bc978956d7fc 100644 --- a/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_full_ios_framework_build_settings.json @@ -6,25 +6,35 @@ "iphonesimulator": [ "arm64", "x86_64" + ], + "macabi": [ + "arm64", + "x86_64" ] }, 
"build_params": { "base": [ "--parallel", - "--use_xcode", "--build_apple_framework", "--use_coreml", - "--use_xnnpack", "--skip_tests", "--cmake_extra_defines=onnxruntime_BUILD_UNIT_TESTS=OFF" ], "iphoneos": [ "--ios", + "--use_xcode", + "--use_xnnpack", "--apple_deploy_target=12.0" ], "iphonesimulator": [ "--ios", + "--use_xcode", + "--use_xnnpack", "--apple_deploy_target=12.0" + ], + "macabi":[ + "--macos=Catalyst", + "--apple_deploy_target=14.0" ] } } diff --git a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json index f88934cd44a66..2066af7843e0a 100644 --- a/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json +++ b/tools/ci_build/github/apple/default_training_ios_framework_build_settings.json @@ -32,6 +32,7 @@ "--apple_deploy_target=12.0" ], "macosx": [ + "--macos=MacOSX", "--apple_deploy_target=11.0" ] } diff --git a/tools/ci_build/github/apple/framework_info.json.template b/tools/ci_build/github/apple/framework_info.json.template index b4c4fb8d16ebf..1f7eeb5948799 100644 --- a/tools/ci_build/github/apple/framework_info.json.template +++ b/tools/ci_build/github/apple/framework_info.json.template @@ -1,5 +1,5 @@ { - "@CMAKE_OSX_SYSROOT@": { + "@PLATFORM_NAME@": { "APPLE_DEPLOYMENT_TARGET": "@CMAKE_OSX_DEPLOYMENT_TARGET@", "WEAK_FRAMEWORK": "@APPLE_WEAK_FRAMEWORK@" } diff --git a/tools/ci_build/github/apple/test_apple_packages.py b/tools/ci_build/github/apple/test_apple_packages.py index 3c0df994ffd3d..3987a37fcc76c 100644 --- a/tools/ci_build/github/apple/test_apple_packages.py +++ b/tools/ci_build/github/apple/test_apple_packages.py @@ -176,6 +176,25 @@ def _test_apple_packages(args): break + if args.mac_catalyst_enabled: + subprocess.run( + [ + "xcrun", + "xcodebuild", + "test", + "-workspace", + "./apple_package_test.xcworkspace", + "-scheme", + "ios_package_test", + "-destination", + "platform=macOS,variant=Mac 
Catalyst", + "CODE_SIGNING_ALLOWED=NO", + ], + shell=False, + check=True, + cwd=target_proj_path, + ) + if PackageVariant[args.variant] != PackageVariant.Mobile and not args.skip_macos_test: subprocess.run( [ @@ -244,6 +263,12 @@ def parse_args(): help="Skip macos platform tests. Specify this argument when build targets only contain ios archs. ", ) + parser.add_argument( + "--mac_catalyst_enabled", + action="store_true", + help="Run tests for mac catalyst variants. Specify this argument when build targets contains catalyst archs. ", + ) + return parser.parse_args() diff --git a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml index 1ba0b02560aca..0bb9fad6716b7 100644 --- a/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/c-api-cpu.yml @@ -138,7 +138,8 @@ stages: --framework_info_file "$(Build.BinariesDirectory)/ios_framework/xcframework_info.json" \ --c_framework_dir "$(Build.BinariesDirectory)/ios_framework/framework_out" \ --variant Full \ - --skip_macos_test + --skip_macos_test \ + --mac_catalyst_enabled displayName: "Test Apple framework" - task: PublishBuildArtifacts@1 From 15219e2e71b82f906f92317510396b6ccc858c49 Mon Sep 17 00:00:00 2001 From: Yufeng Li Date: Wed, 20 Mar 2024 12:49:58 -0700 Subject: [PATCH 43/55] turn on neural_speed by default (#19627) ### Description the crash caused by the neural_speed turns out to be a very rare corner case. Turn it on by default.
### Motivation and Context --- cgmanifests/generated/cgmanifest.json | 2 +- cmake/CMakeLists.txt | 4 +-- cmake/deps.txt | 2 +- cmake/external/neural_speed.cmake | 1 + ...7527d5286ddd3a995c228dedf8d76a7a86bc.patch | 30 +++++++++++++++++++ .../cpu/quantization/neural_speed_wrapper.h | 1 + .../templates/download-deps.yml | 4 +-- 7 files changed, 38 insertions(+), 6 deletions(-) create mode 100644 cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index dc7e9c3fddb2f..3e13a567b1eaa 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -206,7 +206,7 @@ "component": { "type": "git", "git": { - "commitHash": "c11386eb632eec7c1c2aa323142f73519f946e2a", + "commitHash": "150e7527d5286ddd3a995c228dedf8d76a7a86bc", "repositoryUrl": "https://github.com/intel/neural-speed.git" }, "comments": "neural_speed" diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 655ca1c42ef93..49b6f06c76a64 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -88,7 +88,7 @@ option(onnxruntime_USE_QNN "Build with QNN support" OFF) option(onnxruntime_USE_SNPE "Build with SNPE support" OFF) option(onnxruntime_USE_RKNPU "Build with RKNPU support" OFF) option(onnxruntime_USE_DNNL "Build with DNNL support" OFF) -option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" OFF) +option(onnxruntime_USE_NEURAL_SPEED "Build with Neural Speed support" ON) option(onnxruntime_USE_JSEP "Build with JavaScript implemented kernels support" OFF) option(onnxruntime_BUILD_UNIT_TESTS "Build ONNXRuntime unit tests" ON) option(onnxruntime_BUILD_CSHARP "Build C# library" OFF) @@ -1206,7 +1206,7 @@ if (onnxruntime_USE_DNNL) add_compile_definitions(DNNL_OPENMP) endif() -if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD) +if (onnxruntime_USE_NEURAL_SPEED AND NOT onnxruntime_MINIMAL_BUILD AND NOT onnxruntime_USE_TVM) 
include(neural_speed) if (USE_NEURAL_SPEED) list(APPEND onnxruntime_EXTERNAL_LIBRARIES neural_speed::bestla) diff --git a/cmake/deps.txt b/cmake/deps.txt index 4111689c5def9..22ad9338ea59a 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -35,7 +35,7 @@ microsoft_gsl;https://github.com/microsoft/GSL/archive/refs/tags/v4.0.0.zip;cf36 microsoft_wil;https://github.com/microsoft/wil/archive/refs/tags/v1.0.230629.1.zip;e4a542a323c070376f7c2d1973d0f7ddbc1d2fa5 mimalloc;https://github.com/microsoft/mimalloc/archive/refs/tags/v2.1.1.zip;d5ee7d34223d0567892db5179849939c8769dc41 mp11;https://github.com/boostorg/mp11/archive/refs/tags/boost-1.82.0.zip;9bc9e01dffb64d9e0773b2e44d2f22c51aace063 -neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/bestlav0.1.1.zip;65b0f7a0d04f72f0d5a8d48af70f0366f2ab3939 +neural_speed;https://github.com/intel/neural-speed/archive/refs/tags/v0.3.zip;5ec64e3071edc7347ebd8a81679cf06e2bb9b851 onnx;https://github.com/onnx/onnx/archive/refs/tags/v1.15.0.zip;54c3f960a0541c5d8d3e60c2933e11f5d3688a11 #use the commit of Final DDS removal. DDS output is now supported by ORT TRT. 
onnx_tensorrt;https://github.com/onnx/onnx-tensorrt/archive/bacfaaa951653cd4e72efe727a543567cb38f7de.zip;26434329612e804164ab7baa6ae629ada56c1b26 diff --git a/cmake/external/neural_speed.cmake b/cmake/external/neural_speed.cmake index ed711351403a7..3fe9c660f89d6 100644 --- a/cmake/external/neural_speed.cmake +++ b/cmake/external/neural_speed.cmake @@ -9,6 +9,7 @@ if(USE_NEURAL_SPEED) neural_speed URL ${DEP_URL_neural_speed} URL_HASH SHA1=${DEP_SHA1_neural_speed} + PATCH_COMMAND ${Patch_EXECUTABLE} -p1 < ${PROJECT_SOURCE_DIR}/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch ) set(BTLA_USE_OPENMP OFF) onnxruntime_fetchcontent_makeavailable(neural_speed) diff --git a/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch b/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch new file mode 100644 index 0000000000000..e503a512a74ff --- /dev/null +++ b/cmake/patches/neural_speed/150e7527d5286ddd3a995c228dedf8d76a7a86bc.patch @@ -0,0 +1,30 @@ +diff --git a/bestla/bestla/bestla_prologue_b.h b/bestla/bestla/bestla_prologue_b.h +index 99f3ccc..a11de9d 100644 +--- a/bestla/bestla/bestla_prologue_b.h ++++ b/bestla/bestla/bestla_prologue_b.h +@@ -456,9 +456,8 @@ class WeightKBlockNInteger { + auto tmpscales = tmp; + auto tmpzeropoints = reinterpret_cast(tmpscales + N * blks); + if (scales) { +- for (size_t i = 0; i < N * blks; i += 2) { ++ for (size_t i = 0; i < N * blks; i ++) { + tmpscales[i] = scales[i] / 16; +- tmpscales[i + 1] = scales[i + 1] / 16; + } + } + if (zero_points) { +diff --git a/bestla/bestla/kernel_avx512f.h b/bestla/bestla/kernel_avx512f.h +index 6783ee8..59822e5 100644 +--- a/bestla/bestla/kernel_avx512f.h ++++ b/bestla/bestla/kernel_avx512f.h +@@ -673,8 +673,8 @@ inline BTLA_CODE decompress_kblock_s3_s8fp(utils::bit2x4* bit2ptr, utils::bit1x8 + zmm1 = _mm512_sllv_epi32(zmm1, zmm_shift); // int3_clip => int8 + zmm2 = _mm512_sllv_epi32(zmm2, zmm_shift); // int3_clip => int8 + +- 
_mm512_storeu_epi8((__m512i*)dst, zmm1); +- _mm512_storeu_epi8((__m512i*)(dst + 64), zmm2); ++ _mm512_storeu_si512((__m512i*)dst, zmm1); ++ _mm512_storeu_si512((__m512i*)(dst + 64), zmm2); + }; + + assert(head_ignore_num % 8 == 0); diff --git a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h index d3902f9bd68c7..e7df50408ef09 100644 --- a/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h +++ b/onnxruntime/contrib_ops/cpu/quantization/neural_speed_wrapper.h @@ -27,6 +27,7 @@ #pragma warning(disable : 4244) #pragma warning(disable : 4267) #pragma warning(disable : 4702) +#pragma warning(disable : 4127) #endif #include "bestla/bestla_prologue_a.h" diff --git a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml index c60b3e467d4f1..4fd33b4f0bc09 100644 --- a/tools/ci_build/github/azure-pipelines/templates/download-deps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/download-deps.yml @@ -11,7 +11,7 @@ steps: packageType: upack feed: '/7424c8e4-5c62-490e-95c4-79446f31017c' definition: '517c4f6f-5437-4392-a70d-4f15ec5be2f0' - version: 1.0.143 + version: 1.0.145 downloadPath: $(Build.BinariesDirectory)/deps # The private ADO project @@ -22,7 +22,7 @@ steps: packageType: upack feed: '/4c7631f5-24c0-4307-8822-1aa8f180c325' definition: 'fd9dd5ad-b73e-4678-890e-edcf680dbc1a' - version: 1.0.143 + version: 1.0.145 downloadPath: $(Build.BinariesDirectory)/deps # You can add more ADO accounts at here. From 0335ea9f1e1f105bb5aec0f02b22da4a61afd8fb Mon Sep 17 00:00:00 2001 From: Justin Chu Date: Wed, 20 Mar 2024 17:53:48 -0700 Subject: [PATCH 44/55] Use Java 11 to build project in the codeql pipeline (#19999) Codeql uses Java 8 by default, which is too old for the project. 
Related: https://learn.microsoft.com/en-us/java/openjdk/reasons-to-move-to-java-11 https://github.com/actions/setup-java --- .github/workflows/codeql.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml index 4a5b87b3e69ed..e4d1b91bab736 100644 --- a/.github/workflows/codeql.yml +++ b/.github/workflows/codeql.yml @@ -47,6 +47,14 @@ jobs: # Details on CodeQL's query packs refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs queries: security-extended,security-and-quality + # Setup Java to use a version that is not too old for the project + - if: ${{ matrix.language == 'java' }} + name: Setup Java 11 + uses: actions/setup-java@v4 + with: + java-version: '11' + distribution: 'microsoft' + # Autobuild attempts to build any compiled languages (C/C++, C#, or Java). # If this step fails, then you should remove it and run the build manually (see below) - if: ${{ matrix.language != 'cpp' }} From 175f149b30bae8e02d146f1a34d22e1415b6f154 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Thu, 21 Mar 2024 10:01:03 +0800 Subject: [PATCH 45/55] Remove downloading deps in CUDA package test stage (#19993) ### Description ### Motivation and Context downloading deps is not needed in test stage remove it to reduce random downloading errors --- .../templates/py-packaging-training-cuda-stage-steps.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml index 91d7b9f219f76..024b9b45591ba 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-packaging-training-cuda-stage-steps.yml @@ -172,6 +172,7 @@ stages: 
parameters: Dockerfile: tools/ci_build/github/linux/docker/${{ parameters.docker_file }} Context: tools/ci_build/github/linux/docker + UpdateDepsTxt: false DockerBuildArgs: >- --build-arg TORCH_VERSION=${{ parameters.torch_version }} --build-arg OPSET_VERSION=${{ parameters.opset_version }} From 0b958bb421267a60e42213322dddad62986a93a1 Mon Sep 17 00:00:00 2001 From: Prathik Rao Date: Wed, 20 Mar 2024 21:00:25 -0700 Subject: [PATCH 46/55] add random seed to layernorm tests (#19998) Adds random seed to layernorm tests to prevent random failure. ### Motivation and Context Fixes https://github.com/microsoft/onnxruntime/issues/19983 --- .../orttraining/test/training_ops/cuda/layer_norm_test.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc b/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc index e86aa871b6c5f..13ad2f6150acf 100644 --- a/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc +++ b/orttraining/orttraining/test/training_ops/cuda/layer_norm_test.cc @@ -49,7 +49,7 @@ static void TestLayerNormGrad( test.AddAttribute("axis", axis); - RandomValueGenerator random{}; + RandomValueGenerator random{optional{2345}}; const auto Y_grad_data = random.Uniform(n_x_m_dims, k_random_data_min, k_random_data_max); const auto X_data = random.Uniform(n_x_m_dims, k_random_data_min, k_random_data_max); const auto scale_data = random.Uniform(m_dims, k_random_data_min, k_random_data_max); @@ -152,7 +152,7 @@ static void TestInvertibleLayerNormGrad( test.AddAttribute("axis", axis); - RandomValueGenerator random{}; + RandomValueGenerator random{optional{2345}}; const auto Y_grad_data = random.Uniform(n_x_m_dims, k_random_data_min, k_random_data_max); const auto X_data = random.Uniform(n_x_m_dims, k_random_data_min, k_random_data_max); const auto scale_data = random.Uniform(m_dims, k_random_data_min, k_random_data_max); From 06fe4f31131a6873a295ba47ed60f4cb16584296 Mon 
Sep 17 00:00:00 2001 From: Tianlei Wu Date: Wed, 20 Mar 2024 23:40:27 -0700 Subject: [PATCH 47/55] Increase MNIST test tolerance (#20000) ### Description Found multiple occurrence of failures: https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=1321061&view=logs&j=6df8fe70-7b8f-505a-8ef0-8bf93da2bac7&t=56a04c0b-9e7f-5c69-cb7b-c2a7b1a7392a&l=17537 https://dev.azure.com/onnxruntime/onnxruntime/_build/results?buildId=1329701&view=logs&j=6df8fe70-7b8f-505a-8ef0-8bf93da2bac7&t=4f6ef737-111d-50d1-a46b-5f86d9a970bc&s=3618b4c0-1011-591a-85b8-671e72e2cff1 1: [ RUN ] ModelTests/ModelTest.Run/ cuda__models_zoo_opset7_MNIST_model 1: D:\a\_work\1\s\onnxruntime\test\providers\cpu\model_tests.cc(358): error: Expected equality of these values: 1: COMPARE_RESULT::SUCCESS 1: Which is: 4-byte object <00-00 00-00> 1: ret.first 1: Which is: 4-byte object <01-00 00-00> 1: expected -2.33638 (c0158735), got -2.30239 (c0135a47), diff: 0.0339923, tol=0.0243638 idx=9 --- onnxruntime/test/providers/cpu/model_tests.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/test/providers/cpu/model_tests.cc b/onnxruntime/test/providers/cpu/model_tests.cc index 00d96a0664fa0..aa752ed7308c6 100644 --- a/onnxruntime/test/providers/cpu/model_tests.cc +++ b/onnxruntime/test/providers/cpu/model_tests.cc @@ -92,6 +92,7 @@ TEST_P(ModelTest, Run) { // when cuda or openvino is enabled, set it to a larger value for resolving random MNIST test failure if (model_path.find(ORT_TSTR("_MNIST")) > 0) { if (provider_name == "cuda" || provider_name == "openvino") { + per_sample_tolerance = 2.5e-2; relative_per_sample_tolerance = 1e-2; } } From 30a0d809255994af5685f2903c022446dceaaa10 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 22 Mar 2024 06:53:59 +0800 Subject: [PATCH 48/55] Fix exception in Publish unit test results step (#20007) ### Description Test results files are all in RelWithDebInfo\RelWithDebInfo directory. 
It's not necessary to stat the directory of _deps ### Motivation and Context Recently this exception has occurred many times in the zip-nuget pipeline. `##[error]Error: Failed find: EPERM: operation not permitted, stat 'D:\a\_work\1\b\RelWithDebInfo\_deps\flatbuffers-src\java\src\test\java\DictionaryLookup'` https://dev.azure.com/aiinfra/Lotus/_build/results?buildId=426981&view=logs&j=75fc0348-fe99-522b-3acb-90fd80ac5271&t=5d4ebcc1-bcde-574d-6f4e-8abd0f04ae4b --- .../github/azure-pipelines/nuget/templates/dml-vs-2022.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml index d6bb415a68ee6..3a3375a313ca5 100644 --- a/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml +++ b/tools/ci_build/github/azure-pipelines/nuget/templates/dml-vs-2022.yml @@ -188,7 +188,7 @@ stages: displayName: 'Publish unit test results' inputs: testResultsFiles: '**\*.results.xml' - searchFolder: '$(Build.BinariesDirectory)' + searchFolder: '$(Build.BinariesDirectory)\$(BuildConfig)\$(BuildConfig)' testRunTitle: 'Unit Test Run' condition: succeededOrFailed() From 983fd8393aa30b0c788275ea6e513a614274e1ba Mon Sep 17 00:00:00 2001 From: TP Boudreau Date: Thu, 21 Mar 2024 16:08:18 -0700 Subject: [PATCH 49/55] Recognize NaN operands in Min and Max ops (#19984) ### Description Update the Min and Max CUDA math operations on float/double types to propagate NaNs: if either operand is NaN, the result should be NaN. TODO: float16/bfloat16 need similar change. ### Motivation Currently, results differ between the CPU and CUDA implementations of the floating point Min and Max operators: the CPU operators correctly return NaN results if either operand is NaN. This PR updates the CUDA implementations to conform with this correct behavior. See the issue and comments raised [here](https://github.com/onnx/onnx/issues/6003).
### Context Same behavior in numpy, torch and Java: ``` >>> numpy.min([numpy.NAN, 1]) nan >>> numpy.max([numpy.NAN, 1]) nan >>> torch.min(torch.tensor([1, float('nan')])) tensor(nan) >>> torch.max(torch.tensor([1, float('nan')])) tensor(nan) ``` The C language [fmin](https://en.cppreference.com/w/c/numeric/math/fmin) and [fmax](https://en.cppreference.com/w/c/numeric/math/fmax) functions have different behavior: ``` fmax(NaN,1) = 1 fmin(NaN,1) = 1 ``` https://grouper.ieee.org/groups/msc/ANSI_IEEE-Std-754-2019/background/minNum_maxNum_Removal_Demotion_v3.pdf ![image](https://github.com/microsoft/onnxruntime/assets/30328909/62446cf1-f252-4ddc-8118-5ce605252331) https://www.open-std.org/jtc1/sc22/wg14/www/docs/n2273.pdf --- .../core/providers/cuda/cu_inc/common.cuh | 22 ++++ .../cpu/math/element_wise_ops_test.cc | 114 ++++++++++++++++++ 2 files changed, 136 insertions(+) diff --git a/onnxruntime/core/providers/cuda/cu_inc/common.cuh b/onnxruntime/core/providers/cuda/cu_inc/common.cuh index 1cd3532846114..052dd05574ab1 100644 --- a/onnxruntime/core/providers/cuda/cu_inc/common.cuh +++ b/onnxruntime/core/providers/cuda/cu_inc/common.cuh @@ -5,7 +5,9 @@ #include #include #include +#include #include +#include #include #include #include "core/providers/cuda/cuda_common.h" @@ -345,9 +347,29 @@ __device__ __inline__ half _Pow(half a, half b) { return half(powf((float)a, (fl template __device__ __inline__ T _Min(T a, T b) { return a < b ? a : b; } +template <> +__device__ __inline__ float _Min(float a, float b) { + return (isnan(a) || isnan(b)) ? std::numeric_limits::quiet_NaN() : ( a < b ? a : b ); +} + +template <> +__device__ __inline__ double _Min(double a, double b) { + return (isnan(a) || isnan(b)) ? std::numeric_limits::quiet_NaN() : ( a < b ? a : b ); +} + template __device__ __inline__ T _Max(T a, T b) { return a > b ? a : b; } +template <> +__device__ __inline__ float _Max(float a, float b) { + return (isnan(a) || isnan(b)) ? std::numeric_limits::quiet_NaN() : ( a > b ?
a : b ); +} + +template <> +__device__ __inline__ double _Max(double a, double b) { + return (isnan(a) || isnan(b)) ? std::numeric_limits::quiet_NaN() : ( a > b ? a : b ); +} + template __device__ __inline__ T _Abs(T a) { return a > (T)0 ? a : -a; } diff --git a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc index c73dfcbce1b53..c02486a2ec26f 100644 --- a/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc +++ b/onnxruntime/test/providers/cpu/math/element_wise_ops_test.cc @@ -9,6 +9,7 @@ #include "test/common/trt_op_test_utils.h" #include "core/util/math.h" #include +#include #include namespace onnxruntime { @@ -1508,6 +1509,34 @@ TEST(MathOpTest, Min_12_Float_2_Input) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // TensorRT: Input batch size is inconsistent } +TEST(MathOpTest, Min_12_Float_Nan) { + OpTester test("Min", 12); + test.AddInput("data_2", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5f, 0.0f, -2.0f, + 0.5f, 0.0f, 2.0f}); + test.AddInput("data_1", {3, 1}, + {0.0f, -1.0f, 1.0f}); + test.AddOutput("min", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -1.0f, -1.0f, -2.0f, + 0.5f, 0.0f, 1.0f}); + if (nullptr != DefaultCpuExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Min_12_Double) { OpTester test("Min", 12); 
test.AddInput("data_0", {1, 3}, @@ -1525,6 +1554,34 @@ TEST(MathOpTest, Min_12_Double) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } +TEST(MathOpTest, Min_12_Double_Nan) { + OpTester test("Min", 12); + test.AddInput("data_2", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5, 0.0, -2.0, + 0.5, 0.0, 2.0}); + test.AddInput("data_1", {3, 1}, + {0.0, -1.0, 1.0}); + test.AddOutput("min", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -1.0, -1.0, -2.0, + 0.5, 0.0, 1.0}); + if (nullptr != DefaultCpuExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Min_12_Int32) { OpTester test("Min", 12); test.AddInput("data_0", {1, 3}, @@ -1631,6 +1688,7 @@ TEST(MathOpTest, Min_12_MLFLoat16_Scalar1) { MakeMLFloat16({-10.f, -10.f, -10.f})); test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } + TEST(MathOpTest, Max_6) { OpTester test("Max", 6); std::vector dims{3, 3}; @@ -1719,6 +1777,34 @@ TEST(MathOpTest, Max_12_Float) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider, kOpenVINOExecutionProvider}); // TensorRT: Input batch size is inconsistent } +TEST(MathOpTest, Max_12_Float_Nan) { + OpTester test("Max", 12); + test.AddInput("data_2", {3, 3}, + {std::numeric_limits::quiet_NaN(), + 
std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5f, 0.0f, -2.0f, + 0.5f, 0.0f, 2.0f}); + test.AddInput("data_1", {3, 1}, + {0.0f, -1.0f, 1.0f}); + test.AddOutput("max", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5f, 0.0f, -1.0f, + 1.0f, 1.0f, 2.0f}); + if (nullptr != DefaultCpuExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Max_12_Double) { OpTester test("Max", 12); test.AddInput("data_0", {1, 3}, @@ -1736,6 +1822,34 @@ TEST(MathOpTest, Max_12_Double) { test.Run(OpTester::ExpectResult::kExpectSuccess, "", {kTensorrtExecutionProvider}); // TensorRT: Input batch size is inconsistent } +TEST(MathOpTest, Max_12_Double_Nan) { + OpTester test("Max", 12); + test.AddInput("data_2", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5, 0.0, -2.0, + 0.5, 0.0, 2.0}); + test.AddInput("data_1", {3, 1}, + {0.0, -1.0, 1.0}); + test.AddOutput("max", {3, 3}, + {std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + std::numeric_limits::quiet_NaN(), + -0.5, 0.0, -1.0, + 1.0, 1.0, 2.0}); + if (nullptr != DefaultCpuExecutionProvider().get()) { + std::vector> execution_providers; + execution_providers.push_back(DefaultCpuExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } + if (nullptr != DefaultCudaExecutionProvider().get()) { + std::vector> execution_providers; + 
execution_providers.push_back(DefaultCudaExecutionProvider()); + test.Run(OpTester::ExpectResult::kExpectSuccess, "", {}, nullptr, &execution_providers); + } +} + TEST(MathOpTest, Max_12_Int32) { OpTester test("Max", 12); test.AddInput("data_0", {1, 3}, From dafbef3a21c63a01dbb3ef7af1edef245244ec11 Mon Sep 17 00:00:00 2001 From: Changming Sun Date: Thu, 21 Mar 2024 17:58:59 -0700 Subject: [PATCH 50/55] CMake: support reading dependency zip files from a local mirror (#20005) ### Description To test this feature, run ```bat python cmake\deps_update_and_upload.py --root-path mirror ``` Then run build.py as usual. The zip files will be cached locally, so they are not downloaded again and again. --- cmake/external/onnxruntime_external_deps.cmake | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/cmake/external/onnxruntime_external_deps.cmake b/cmake/external/onnxruntime_external_deps.cmake index ac1e187f357aa..8839dbc8fda4f 100644 --- a/cmake/external/onnxruntime_external_deps.cmake +++ b/cmake/external/onnxruntime_external_deps.cmake @@ -14,6 +14,16 @@ foreach(ONNXRUNTIME_DEP IN LISTS ONNXRUNTIME_DEPS_LIST) set(DEP_URL_${ONNXRUNTIME_DEP_NAME} ${ONNXRUNTIME_DEP_URL}) # The third column is SHA1 hash value set(DEP_SHA1_${ONNXRUNTIME_DEP_NAME} ${ONNXRUNTIME_DEP}) + + if(ONNXRUNTIME_DEP_URL MATCHES "^https://") + # Search a local mirror folder + string(REGEX REPLACE "^https://" "${REPO_ROOT}/mirror/" LOCAL_URL "${ONNXRUNTIME_DEP_URL}") + + if(EXISTS "${LOCAL_URL}") + cmake_path(ABSOLUTE_PATH LOCAL_URL) + set(DEP_URL_${ONNXRUNTIME_DEP_NAME} "${LOCAL_URL}") + endif() + endif() endif() endforeach() From cd6d3aea458d5ae0c3f6ab661b8067d60f390583 Mon Sep 17 00:00:00 2001 From: Yi Zhang Date: Fri, 22 Mar 2024 09:16:00 +0800 Subject: [PATCH 51/55] Refactor Python CUDA packaging pipeline to fix random hangs in building (#19989) ### Description 1. Move the build to a CPU machine. 2. Optimize the pipeline. 3.
Since there isn't an official ONNX package for Python 3.12, the Python 3.12 test stage uses the packages built from ONNX source in the build stage. ### Motivation and Context 1. Resolve the random hang in compilation. 2. Save a lot of GPU resources. --------- --- .../py-cuda-packaging-pipeline.yml | 13 +- .../stages/py-cuda-packaging-stage.yml | 94 ++-- .../templates/py-linux-gpu.yml | 106 ++--- .../azure-pipelines/templates/py-win-gpu.yml | 418 +++++++++++------- 4 files changed, 351 insertions(+), 280 deletions(-) diff --git a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml index aee42d3675087..20646d3ba4a26 100644 --- a/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/py-cuda-packaging-pipeline.yml @@ -21,6 +21,15 @@ parameters: values: - 11.8 - 12.2 + - name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false + + - name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' resources: repositories: @@ -36,4 +45,6 @@ stages: enable_linux_gpu: ${{ parameters.enable_linux_gpu }} enable_windows_gpu: ${{ parameters.enable_windows_gpu }} cmake_build_type: ${{ parameters.cmake_build_type }} - cuda_version: ${{ parameters.cuda_version }} \ No newline at end of file + cuda_version: ${{ parameters.cuda_version }} + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} diff --git a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml index f82c80d4d7e93..a2c1eeef632c1 100644 --- a/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml +++ b/tools/ci_build/github/azure-pipelines/stages/py-cuda-packaging-stage.yml @@ -34,72 +34,40 @@ parameters: - 11.8 - 12.2 -stages: -- stage: Python_Packaging - dependsOn: [] - variables: -
- name: docker_base_image - ${{ if eq(parameters.cuda_version, '11.8') }}: - value: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 - ${{ if eq(parameters.cuda_version, '12.2') }}: - value: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 - - name: linux_trt_version - ${{ if eq(parameters.cuda_version, '11.8') }}: - value: 8.6.1.6-1.cuda11.8 - ${{ if eq(parameters.cuda_version, '12.2') }}: - value: 8.6.1.6-1.cuda12.0 - - name: win_trt_home - ${{ if eq(parameters.cuda_version, '11.8') }}: - value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8 - ${{ if eq(parameters.cuda_version, '12.2') }}: - value: $(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0 - - name: win_cuda_home - ${{ if eq(parameters.cuda_version, '11.8') }}: - value: $(Agent.TempDirectory)\v11.8 - ${{ if eq(parameters.cuda_version, '12.2') }}: - value: $(Agent.TempDirectory)\v12.2 - jobs: - - ${{ if eq(parameters.enable_windows_gpu, true) }}: - - template: ../templates/py-win-gpu.yml - parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.8' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - EP_NAME: gpu - CudaVersion: ${{ parameters.cuda_version }} - - - template: ../templates/py-win-gpu.yml - parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.9' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - EP_NAME: gpu - CudaVersion: ${{ parameters.cuda_version }} +- name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false - - template: ../templates/py-win-gpu.yml - parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.10' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} 
--cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - EP_NAME: gpu - CudaVersion: ${{ parameters.cuda_version }} +- name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' - - template: ../templates/py-win-gpu.yml - parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.11' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - EP_NAME: gpu - CudaVersion: ${{ parameters.cuda_version }} +- name: PythonVersions + type: object + displayName: 'Python versions to build' + default: + - '3.8' + - '3.9' + - '3.10' + - '3.11' + - '3.12' +stages: + - ${{ if eq(parameters.enable_windows_gpu, true) }}: + - ${{ each python_version in parameters.PythonVersions }}: - template: ../templates/py-win-gpu.yml parameters: - MACHINE_POOL: 'onnxruntime-Win2022-GPU-T4' - PYTHON_VERSION: '3.12' - EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=${{ variables.win_trt_home }} --cuda_home=${{ variables.win_cuda_home }} --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + PYTHON_VERSION: ${{ python_version }} EP_NAME: gpu CudaVersion: ${{ parameters.cuda_version }} - + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + ${{ if eq(parameters.cuda_version, '11.8') }}: + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-11.8 --cuda_home=$(Agent.TempDirectory)\v11.8 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" + ${{ if eq(parameters.cuda_version, '12.2') }}: + EP_BUILD_FLAGS: --use_tensorrt --tensorrt_home=$(Agent.TempDirectory)\TensorRT-8.6.1.6.Windows10.x86_64.cuda-12.0 --cuda_home=$(Agent.TempDirectory)\v12.2 --cmake_extra_defines "CMAKE_CUDA_ARCHITECTURES=52;60;61;70;75;80" - ${{ if eq(parameters.enable_linux_gpu, 
true) }}: - template: ../templates/py-linux-gpu.yml @@ -108,6 +76,10 @@ stages: machine_pool: 'onnxruntime-Ubuntu2204-AMD-CPU' extra_build_arg: ${{ parameters.build_py_parameters }} cmake_build_type: ${{ parameters.cmake_build_type }} - docker_base_image: ${{ variables.docker_base_image }} - trt_version: ${{ variables.linux_trt_version }} cuda_version: ${{ parameters.cuda_version }} + ${{ if eq(parameters.cuda_version, '11.8') }}: + docker_base_image: nvidia/cuda:11.8.0-cudnn8-devel-ubi8 + trt_version: 8.6.1.6-1.cuda11.8 + ${{ if eq(parameters.cuda_version, '12.2') }}: + docker_base_image: nvidia/cuda:12.2.2-cudnn8-devel-ubi8 + trt_version: 8.6.1.6-1.cuda12.0 diff --git a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml index 8cc48aac7a3b9..318ffd21febf5 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-linux-gpu.yml @@ -35,62 +35,66 @@ parameters: values: - 11.8 - 12.2 -jobs: -- job: Linux_py_GPU_Wheels_${{ parameters.arch }} - timeoutInMinutes: 240 - workspace: - clean: all - pool: ${{ parameters.machine_pool }} - variables: - # The build machine pool doesn't have dotnet, so it can't run CG. - - name: skipComponentGovernanceDetection - value: true - - name: extra_build_args - ${{ if ne(parameters.extra_build_arg, '') }}: - value: -x ${{ parameters.extra_build_arg }} - ${{ if eq(parameters.extra_build_arg, '') }}: - value: '' - steps: - - checkout: self - clean: true - submodules: recursive - - template: set-nightly-build-option-variable-step.yml +stages: +- stage: Linux_py_GPU_Wheels_${{ parameters.arch }} + dependsOn: [] + jobs: + - job: Linux_py_GPU_Wheels_${{ parameters.arch }} + timeoutInMinutes: 240 + workspace: + clean: all + pool: ${{ parameters.machine_pool }} + variables: + # The build machine pool doesn't have dotnet, so it can't run CG. 
+ - name: skipComponentGovernanceDetection + value: true + - name: extra_build_args + ${{ if ne(parameters.extra_build_arg, '') }}: + value: -x ${{ parameters.extra_build_arg }} + ${{ if eq(parameters.extra_build_arg, '') }}: + value: '' + steps: + - checkout: self + clean: true + submodules: recursive - - template: get-docker-image-steps.yml - parameters: - Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda - Context: tools/ci_build/github/linux/docker - DockerBuildArgs: " - --network=host - --build-arg BASEIMAGE=${{ parameters.docker_base_image }} - --build-arg TRT_VERSION=${{ parameters.trt_version }} - --build-arg BUILD_UID=$( id -u ) - --build-arg PLATFORM=${{ parameters.arch }} - " - Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} + - template: set-nightly-build-option-variable-step.yml + - template: get-docker-image-steps.yml + parameters: + Dockerfile: tools/ci_build/github/linux/docker/Dockerfile.manylinux2_28_cuda + Context: tools/ci_build/github/linux/docker + DockerBuildArgs: " + --network=host + --build-arg BASEIMAGE=${{ parameters.docker_base_image }} + --build-arg TRT_VERSION=${{ parameters.trt_version }} + --build-arg BUILD_UID=$( id -u ) + --build-arg PLATFORM=${{ parameters.arch }} + " + Repository: onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} - - task: Bash@3 - displayName: 'Build Python Wheel' - inputs: - targetType: filePath - filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh - arguments: -i onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} -d "GPU" -c ${{ parameters.cmake_build_type }} $(extra_build_args) - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel' - inputs: - PathtoPublish: '$(Build.BinariesDirectory)/dist' - ArtifactName: onnxruntime_gpu + - task: Bash@3 + displayName: 'Build Python Wheel' + 
inputs: + targetType: filePath + filePath: tools/ci_build/github/linux/run_python_dockerbuild.sh + arguments: -i onnxruntimecuda${{ replace(parameters.cuda_version, '.', '') }}xtrt86build${{ parameters.arch }} -d "GPU" -c ${{ parameters.cmake_build_type }} $(extra_build_args) - - task: PublishPipelineArtifact@0 - displayName: 'Publish Test Binaries' - inputs: - artifactName: 'drop-linux-gpu-${{ parameters.arch }}' - targetPath: '$(Build.BinariesDirectory)/Release' + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + PathtoPublish: '$(Build.BinariesDirectory)/dist' + ArtifactName: onnxruntime_gpu + - task: PublishPipelineArtifact@0 + displayName: 'Publish Test Binaries' + inputs: + artifactName: 'drop-linux-gpu-${{ parameters.arch }}' + targetPath: '$(Build.BinariesDirectory)/Release' - - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 - displayName: 'Clean Agent Directories' - condition: always() + + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() diff --git a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml index 4315eae503ebd..17915d107dbe6 100644 --- a/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml +++ b/tools/ci_build/github/azure-pipelines/templates/py-win-gpu.yml @@ -1,8 +1,4 @@ parameters: - -- name: MACHINE_POOL - type: string - - name: EP_NAME type: string @@ -27,169 +23,257 @@ parameters: values: - 11.8 - 12.2 -jobs: -- job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }} - timeoutInMinutes: 240 - workspace: - clean: all - pool: - name: ${{ parameters.MACHINE_POOL }} -# demands: -# - ImageVersionOverride -equals 1.0.367516 - variables: - GRADLE_OPTS: '-Dorg.gradle.daemon=false' - VSGenerator: 'Visual Studio 17 2022' - CUDA_MODULE_LOADING: 'LAZY' - steps: 
- - checkout: self - clean: true - submodules: recursive - - - template: telemetry-steps.yml - - - task: UsePythonVersion@0 - inputs: - versionSpec: ${{ parameters.PYTHON_VERSION }} - addToPath: true - architecture: 'x64' - - - task: onebranch.pipeline.tsaoptions@1 - displayName: 'OneBranch TSAOptions' - inputs: - tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json' - appendSourceBranchName: false - - - task: PythonScript@0 - inputs: - scriptSource: inline - script: | - import sys - np_version = 'numpy==1.21.6' if sys.version_info < (3, 11) else 'numpy==1.24.2' - import subprocess - subprocess.call(['pip', 'install', '-q', 'setuptools', 'wheel', np_version]) - workingDirectory: '$(Build.BinariesDirectory)' - displayName: 'Install python modules' - - - template: download-deps.yml - - - ${{ if ne(parameters.ENV_SETUP_SCRIPT, '') }}: - - template: jobs/set-winenv.yml + +- name: SpecificArtifact + displayName: Use Specific Artifact + type: boolean + default: false + +- name: BuildId + displayName: Specific Artifact's BuildId + type: string + default: '0' + +stages: + - stage: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build + dependsOn: [] + jobs: + - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build + timeoutInMinutes: 120 + workspace: + clean: all + pool: + name: onnxruntime-Win-CPU-2022 + # demands: + # - ImageVersionOverride -equals 1.0.367516 + variables: + GRADLE_OPTS: '-Dorg.gradle.daemon=false' + VSGenerator: 'Visual Studio 17 2022' + CUDA_MODULE_LOADING: 'LAZY' + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: recursive + + - template: telemetry-steps.yml + + - task: UsePythonVersion@0 + inputs: + versionSpec: ${{ parameters.PYTHON_VERSION }} + addToPath: true + architecture: 'x64' + + - task: 
onebranch.pipeline.tsaoptions@1 + displayName: 'OneBranch TSAOptions' + inputs: + tsaConfigFilePath: '$(Build.SourcesDirectory)\.config\tsaoptions.json' + appendSourceBranchName: false + + - task: PythonScript@0 + inputs: + scriptSource: inline + script: | + import sys + np_version = 'numpy==1.21.6' if sys.version_info < (3, 11) else 'numpy==1.26' + import subprocess + try: + subprocess.check_call(['pip', 'install', '-q', 'setuptools', 'wheel', np_version]) + except subprocess.CalledProcessError: + sys.exit(1) + workingDirectory: '$(Build.BinariesDirectory)' + displayName: 'Install python modules' + + - template: download-deps.yml + + - ${{ if ne(parameters.ENV_SETUP_SCRIPT, '') }}: + - template: jobs/set-winenv.yml + parameters: + EnvSetupScript: ${{ parameters.ENV_SETUP_SCRIPT }} + ${{ if or(contains(parameters.EP_BUILD_FLAGS, 'use_cuda'), contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt')) }}: + DownloadCUDA: true + + - ${{ if eq(parameters.ENV_SETUP_SCRIPT, '') }}: + - template: jobs/download_win_gpu_library.yml + parameters: + CudaVersion: ${{ parameters.CudaVersion }} + ${{ if or(contains(parameters.EP_BUILD_FLAGS, 'use_cuda'), contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt')) }}: + DownloadCUDA: true + ${{ if contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt') }}: + DownloadTRT: true + + - task: PythonScript@0 + displayName: 'Update deps.txt' + inputs: + scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py + arguments: --new_dir $(Build.BinariesDirectory)/deps + workingDirectory: $(Build.BinariesDirectory) + + - task: PowerShell@2 + displayName: 'Install ONNX' + inputs: + filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/install_third_party_deps.ps1' + workingDirectory: '$(Build.BinariesDirectory)' + arguments: -cpu_arch x64 -install_prefix $(Build.BinariesDirectory)\RelWithDebInfo\installed -build_config RelWithDebInfo + + # it could be removed once there's onnx wheel for python 3.12 + - ${{ if 
eq(parameters.PYTHON_VERSION, '3.12') }}: + - task: PublishPipelineArtifact@1 + displayName: 'Publish Artifact: ONNX python 12 wheel' + inputs: + targetPath: '$(Agent.TempDirectory)\onnx\onnx-1.15.0\dist\' + publishLocation: 'pipeline' + artifactName: onnx_py12_wheel + + - template: set-nightly-build-option-variable-step.yml + + - task: PythonScript@0 + displayName: 'Generate cmake config' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + --config RelWithDebInfo + --build_dir $(Build.BinariesDirectory) + --skip_submodule_sync + --cmake_generator "$(VSGenerator)" + --enable_pybind + --enable_onnx_tests + --parallel --use_binskim_compliant_compile_flags --update + $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} + workingDirectory: '$(Build.BinariesDirectory)' + + # building with build.py so the parallelization parameters are added to the msbuild command + - task: PythonScript@0 + displayName: 'Build' + inputs: + scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' + arguments: > + --config RelWithDebInfo + --build_dir $(Build.BinariesDirectory) + --parallel --build + $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} + workingDirectory: '$(Build.BinariesDirectory)' + + # Esrp signing + - template: win-esrp-dll.yml + parameters: + FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime\capi' + DisplayName: 'ESRP - Sign Native dlls' + DoEsrp: true + Pattern: '*.pyd,*.dll' + + - task: PythonScript@0 + displayName: 'Build wheel' + inputs: + scriptPath: '$(Build.SourcesDirectory)\setup.py' + arguments: 'bdist_wheel ${{ parameters.BUILD_PY_PARAMETERS }} $(NightlyBuildOption) --wheel_name_suffix=${{ parameters.EP_NAME }}' + workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' + + - task: CopyFiles@2 + displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' + inputs: + 
SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist' + Contents: '*.whl' + TargetFolder: '$(Build.ArtifactStagingDirectory)' + + - task: PublishBuildArtifacts@1 + displayName: 'Publish Artifact: ONNXRuntime python wheel' + inputs: + ArtifactName: onnxruntime_${{ parameters.EP_NAME }} + + - script: | + 7z x *.whl + workingDirectory: '$(Build.ArtifactStagingDirectory)' + displayName: 'unzip the package' + + - task: CredScan@3 + displayName: 'Run CredScan' + inputs: + debugMode: false + continueOnError: true + + - task: BinSkim@4 + displayName: 'Run BinSkim' + inputs: + AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll;-:file|$(Build.ArtifactStagingDirectory)\**\DirectML.dll' + + - stage: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests + dependsOn: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Build + jobs: + - job: Win_py_${{ parameters.EP_NAME }}_Wheels_${{ replace(parameters.PYTHON_VERSION,'.','_') }}_Tests + workspace: + clean: all + pool: + name: onnxruntime-Win2022-GPU-T4 + steps: + - task: mspremier.PostBuildCleanup.PostBuildCleanup-task.PostBuildCleanup@3 + displayName: 'Clean Agent Directories' + condition: always() + + - checkout: self + clean: true + submodules: none + + - task: UsePythonVersion@0 + inputs: + versionSpec: ${{ parameters.PYTHON_VERSION }} + addToPath: true + architecture: 'x64' + + - template: flex-downloadPipelineArtifact.yml parameters: - EnvSetupScript: ${{ parameters.ENV_SETUP_SCRIPT }} - ${{ if or(contains(parameters.EP_BUILD_FLAGS, 'use_cuda'), contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt')) }}: - DownloadCUDA: true + ArtifactName: "onnxruntime_${{ parameters.EP_NAME }}" + StepName: 'Download Pipeline Artifact - Windows GPU Build' + TargetPath: '$(Build.ArtifactStagingDirectory)' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + + # It could be remove 
once there's onnx wheel for python 3.12 + - ${{ if eq(parameters.PYTHON_VERSION, '3.12') }}: + - template: flex-downloadPipelineArtifact.yml + parameters: + ArtifactName: "onnx_py12_wheel" + StepName: 'Download Pipeline Artifact - Onnx Python12 wheel' + TargetPath: '$(Agent.TempDirectory)\onnx\' + SpecificArtifact: ${{ parameters.SpecificArtifact }} + BuildId: ${{ parameters.BuildId }} + + - powershell: | + python -m pip install --upgrade pip + Get-ChildItem -Path $(Agent.TempDirectory)\onnx\*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + python -m pip install pytest + workingDirectory: '$(Build.SourcesDirectory)' + displayName: 'Install ONNX and pytest' + - ${{ else }}: + - powershell: | + pushd onnxruntime/test/python + python -m pip install --upgrade pip + python -m pip install -r requirements.txt + popd + workingDirectory: '$(Build.SourcesDirectory)' + displayName: 'Install ONNX' + + - powershell: | + python -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq + Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*cp${{ replace(parameters.PYTHON_VERSION,'.','') }}*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} + mkdir -p $(Agent.TempDirectory)\ort_test_data + Copy-Item -Path $(Build.sourcesDirectory)/onnxruntime/test/python/onnx_backend_test_series.py -Destination $(Agent.TempDirectory)\ort_test_data + Copy-Item -Recurse -Path $(Build.sourcesDirectory)/onnxruntime/test/testdata -Destination $(Agent.TempDirectory)\ort_test_data + cd $(Agent.TempDirectory)\ort_test_data + python onnx_backend_test_series.py + workingDirectory: '$(Build.sourcesDirectory)' + displayName: 'Run Python Tests' + + - task: TSAUpload@2 + displayName: 'TSA upload' + condition: and (succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) + inputs: + GdnPublishTsaOnboard: false + GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - - ${{ if
eq(parameters.ENV_SETUP_SCRIPT, '') }}: - - template: jobs/download_win_gpu_library.yml + - template: component-governance-component-detection-steps.yml parameters: - CudaVersion: ${{ parameters.CudaVersion }} - ${{ if or(contains(parameters.EP_BUILD_FLAGS, 'use_cuda'), contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt')) }}: - DownloadCUDA: true - ${{ if contains(parameters.EP_BUILD_FLAGS, 'use_tensorrt') }}: - DownloadTRT: true - - - task: PythonScript@0 - displayName: 'Update deps.txt' - inputs: - scriptPath: $(Build.SourcesDirectory)/tools/ci_build/replace_urls_in_deps.py - arguments: --new_dir $(Build.BinariesDirectory)/deps - workingDirectory: $(Build.BinariesDirectory) - - - task: PowerShell@2 - displayName: 'Install ONNX' - inputs: - filePath: '$(Build.SourcesDirectory)/tools/ci_build/github/windows/install_third_party_deps.ps1' - workingDirectory: '$(Build.BinariesDirectory)' - arguments: -cpu_arch x64 -install_prefix $(Build.BinariesDirectory)\RelWithDebInfo\installed -build_config RelWithDebInfo - - - template: set-nightly-build-option-variable-step.yml - - - - task: PythonScript@0 - displayName: 'Generate cmake config' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: > - --config RelWithDebInfo - --build_dir $(Build.BinariesDirectory) - --skip_submodule_sync - --cmake_generator "$(VSGenerator)" - --enable_pybind - --enable_onnx_tests - --parallel --use_binskim_compliant_compile_flags --update - $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ parameters.EP_BUILD_FLAGS }} - workingDirectory: '$(Build.BinariesDirectory)' - - # building with build.py so the parallelization parameters are added to the msbuild command - - task: PythonScript@0 - displayName: 'Build' - inputs: - scriptPath: '$(Build.SourcesDirectory)\tools\ci_build\build.py' - arguments: > - --config RelWithDebInfo - --build_dir $(Build.BinariesDirectory) - --parallel --build - $(TelemetryOption) ${{ parameters.BUILD_PY_PARAMETERS }} ${{ 
parameters.EP_BUILD_FLAGS }} - workingDirectory: '$(Build.BinariesDirectory)' - - # Esrp signing - - template: win-esrp-dll.yml - parameters: - FolderPath: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\onnxruntime\capi' - DisplayName: 'ESRP - Sign Native dlls' - DoEsrp: true - Pattern: '*.pyd,*.dll' - - - task: PythonScript@0 - displayName: 'Build wheel' - inputs: - scriptPath: '$(Build.SourcesDirectory)\setup.py' - arguments: 'bdist_wheel ${{ parameters.BUILD_PY_PARAMETERS }} $(NightlyBuildOption) --wheel_name_suffix=${{ parameters.EP_NAME }}' - workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - - - task: CopyFiles@2 - displayName: 'Copy Python Wheel to: $(Build.ArtifactStagingDirectory)' - inputs: - SourceFolder: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo\dist' - Contents: '*.whl' - TargetFolder: '$(Build.ArtifactStagingDirectory)' - - - task: PublishBuildArtifacts@1 - displayName: 'Publish Artifact: ONNXRuntime python wheel' - inputs: - ArtifactName: onnxruntime_${{ parameters.EP_NAME }} - - - script: | - 7z x *.whl - workingDirectory: '$(Build.ArtifactStagingDirectory)' - displayName: 'unzip the package' - - - task: CredScan@3 - displayName: 'Run CredScan' - inputs: - debugMode: false - continueOnError: true - - - task: BinSkim@4 - displayName: 'Run BinSkim' - inputs: - AnalyzeTargetGlob: '+:file|$(Build.ArtifactStagingDirectory)\**\*.dll;-:file|$(Build.ArtifactStagingDirectory)\**\DirectML.dll' - - - powershell: | - python -m pip uninstall -y ort-nightly-gpu ort-nightly onnxruntime onnxruntime-gpu -qq - Get-ChildItem -Path $(Build.ArtifactStagingDirectory)/*.whl | foreach {pip --disable-pip-version-check install --upgrade $_.fullname tabulate} - Remove-Item -Recurse -Force onnxruntime - python onnx_backend_test_series.py - workingDirectory: '$(Build.BinariesDirectory)\RelWithDebInfo\RelWithDebInfo' - displayName: 'Run Python Tests' - - - task: TSAUpload@2 - displayName: 'TSA upload' - condition: and 
(succeeded(), eq(variables['Build.SourceBranch'], 'refs/heads/main')) - inputs: - GdnPublishTsaOnboard: false - GdnPublishTsaConfigFile: '$(Build.sourcesDirectory)\.gdn\.gdntsa' - - - template: component-governance-component-detection-steps.yml - parameters: - condition: 'succeeded' + condition: 'succeeded' From eab35c20fc1afa301ac7c616d45a7f1f0f3d15b7 Mon Sep 17 00:00:00 2001 From: sfatimar Date: Fri, 22 Mar 2024 07:14:00 +0530 Subject: [PATCH 52/55] Ort openvino npu 1.17 master (#19966) ### Description Add NPU to the list of supported devices. Added changes to support OV 2024.0. NuGet packaging no longer includes the OpenVINO DLL. Bug fixes for the Python API. Reverted Dockerfiles that are not being maintained. ### Motivation and Context The NPU device has been introduced by Intel in its latest client systems. The OpenVINO 2024.0 release is out. --------- Co-authored-by: Suryaprakash Shanmugam Co-authored-by: Preetha Veeramalai Co-authored-by: Ubuntu Co-authored-by: hmamidix Co-authored-by: vthaniel Co-authored-by: saurabhkale17 --- cmake/CMakeLists.txt | 37 +--- cmake/onnxruntime_providers_openvino.cmake | 27 +-- dockerfiles/Dockerfile.openvino | 6 +- dockerfiles/Dockerfile.openvino-centos7 | 105 ---------- dockerfiles/Dockerfile.openvino-csharp | 90 --------- dockerfiles/Dockerfile.openvino-rhel8 | 87 -------- .../providers/openvino/backend_manager.cc | 66 ++++-- .../core/providers/openvino/backend_manager.h | 2 +- .../core/providers/openvino/backend_utils.cc | 40 +--- .../core/providers/openvino/backend_utils.h | 3 +- .../openvino/backends/backend_factory.cc | 6 +- .../openvino/backends/basic_backend.cc | 83 ++++---- .../openvino/backends/basic_backend.h | 2 +- .../core/providers/openvino/contexts.h | 4 +- .../core/providers/openvino/ibackend.h | 2 +- .../openvino/openvino_execution_provider.cc | 26 +-- .../openvino/openvino_execution_provider.h | 20 +- .../openvino/openvino_provider_factory.cc | 23 ++- .../core/providers/openvino/ov_interface.cc | 43 ++--
.../core/providers/openvino/ov_interface.h | 5 +- .../openvino/ov_versions/capability.cc | 29 +-- .../openvino/ov_versions/capability.h | 5 +- .../openvino/ov_versions/data_ops.cc | 191 +++--------------- .../providers/openvino/ov_versions/data_ops.h | 9 +- .../providers/openvino/ov_versions/utils.cc | 24 +-- .../providers/openvino/ov_versions/utils.h | 2 +- .../python/onnxruntime_pybind_state.cc | 16 +- .../python/onnxruntime_pybind_state_common.h | 7 +- .../test/contrib_ops/activation_op_test.cc | 4 + onnxruntime/test/perftest/ort_test_session.cc | 4 +- .../cpu/activation/activation_op_test.cc | 4 +- .../test/python/onnx_backend_test_series.py | 4 +- .../onnx_backend_test_series_filters.jsonc | 7 +- tools/ci_build/build.py | 9 +- .../linux-openvino-ci-pipeline.yml | 2 +- .../linux/docker/Dockerfile.ubuntu_openvino | 13 +- ...kerfile_manylinux2014_openvino_multipython | 83 -------- .../nuget/generate_nuspec_for_native_nuget.py | 28 +-- 38 files changed, 275 insertions(+), 843 deletions(-) delete mode 100755 dockerfiles/Dockerfile.openvino-centos7 delete mode 100644 dockerfiles/Dockerfile.openvino-csharp delete mode 100644 dockerfiles/Dockerfile.openvino-rhel8 delete mode 100644 tools/ci_build/github/linux/docker/Dockerfile_manylinux2014_openvino_multipython diff --git a/cmake/CMakeLists.txt b/cmake/CMakeLists.txt index 49b6f06c76a64..ee1959bb357fe 100644 --- a/cmake/CMakeLists.txt +++ b/cmake/CMakeLists.txt @@ -1290,34 +1290,6 @@ if (onnxruntime_USE_OPENVINO) add_definitions(-DUSE_OPENVINO=1) - if (EXISTS "$ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/version.txt") - file(READ $ENV{INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/version.txt VER) - endif() - - if (NOT DEFINED ENV{INTEL_OPENVINO_DIR}) - message(FATAL_ERROR "[Couldn't locate OpenVINO] OpenVINO may not have been initialized") - endif() - - # Check OpenVINO version for support - if ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.0") - set(OPENVINO_VERSION "2023.0") - 
add_definitions(-DOPENVINO_2023_0=1) - elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.1") - set(OPENVINO_VERSION "2023.1") - add_definitions(-DOPENVINO_2023_1=1) - elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.2") - set(OPENVINO_VERSION "2023.2") - add_definitions(-DOPENVINO_2023_2=1) - elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "2023.3") - set(OPENVINO_VERSION "2023.3") - add_definitions(-DOPENVINO_2023_3=1) - elseif ($ENV{INTEL_OPENVINO_DIR} MATCHES "openvino") - set(OPENVINO_VERSION "2023.3") - add_definitions(-DOPENVINO_2023_3=1) - else() - message(FATAL_ERROR "Unsupported OpenVINO version: ${INTEL_OPENVINO_DIR}") - endif() - if (onnxruntime_USE_OPENVINO_GPU_FP32) add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1) endif() @@ -1334,6 +1306,10 @@ if (onnxruntime_USE_OPENVINO) add_definitions(-DOPENVINO_CONFIG_CPU_FP16=1) endif() + if (onnxruntime_USE_OPENVINO_NPU) + add_definitions(-DOPENVINO_CONFIG_NPU=1) + endif() + if (onnxruntime_USE_OPENVINO_GPU_FP32_NP) add_definitions(-DOPENVINO_CONFIG_GPU_FP32=1) add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1) @@ -1354,6 +1330,11 @@ if (onnxruntime_USE_OPENVINO) add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1) endif() + if (onnxruntime_USE_OPENVINO_NPU_NP) + add_definitions(-DOPENVINO_CONFIG_NPU=1) + add_definitions(-DOPENVINO_DISABLE_GRAPH_PARTITION=1) + endif() + if (onnxruntime_USE_OPENVINO_HETERO) add_definitions(-DOPENVINO_CONFIG_HETERO=1) add_definitions(-DDEVICE_NAME="${onnxruntime_USE_OPENVINO_DEVICE}") diff --git a/cmake/onnxruntime_providers_openvino.cmake b/cmake/onnxruntime_providers_openvino.cmake index e26f0bfc0b751..5876b2b5c448b 100644 --- a/cmake/onnxruntime_providers_openvino.cmake +++ b/cmake/onnxruntime_providers_openvino.cmake @@ -16,23 +16,19 @@ endif() # Header paths - find_package(InferenceEngine REQUIRED) - find_package(ngraph REQUIRED) - - if (OPENVINO_2022_1 OR OPENVINO_2022_2) find_package(OpenVINO REQUIRED COMPONENTS Runtime ONNX) - list (OV_20_LIBS openvino::frontend::onnx 
openvino::runtime) + if(OpenVINO_VERSION VERSION_LESS 2023.0) + message(FATAL_ERROR "OpenVINO 2023.0 and newer are supported. Please, latest OpenVINO release") endif() if (WIN32) unset(CMAKE_MAP_IMPORTED_CONFIG_RELWITHDEBINFO) endif() + list(APPEND OPENVINO_LIB_LIST openvino::frontend::onnx openvino::runtime ${PYTHON_LIBRARIES}) if ((DEFINED ENV{OPENCL_LIBS}) AND (DEFINED ENV{OPENCL_INCS})) add_definitions(-DIO_BUFFER_ENABLED=1) - list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS} ${OV_20_LIBS} ${InferenceEngine_LIBRARIES} ${NGRAPH_LIBRARIES} ngraph::onnx_importer ${PYTHON_LIBRARIES}) - else() - list(APPEND OPENVINO_LIB_LIST ${OV_20_LIBS} ${InferenceEngine_LIBRARIES} ${NGRAPH_LIBRARIES} ngraph::onnx_importer ${PYTHON_LIBRARIES}) + list(APPEND OPENVINO_LIB_LIST $ENV{OPENCL_LIBS}) endif() source_group(TREE ${ONNXRUNTIME_ROOT}/core FILES ${onnxruntime_providers_openvino_cc_srcs}) @@ -75,7 +71,14 @@ message(FATAL_ERROR "onnxruntime_providers_openvino unknown platform, need to specify shared library exports for it") endif() - install(TARGETS onnxruntime_providers_openvino - ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} - LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} - RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) \ No newline at end of file + if (CMAKE_OPENVINO_LIBRARY_INSTALL_DIR) + install(TARGETS onnxruntime_providers_openvino + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_OPENVINO_LIBRARY_INSTALL_DIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + else() + install(TARGETS onnxruntime_providers_openvino + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR}) + endif() diff --git a/dockerfiles/Dockerfile.openvino b/dockerfiles/Dockerfile.openvino index 78d04a51ba162..049916fac92f1 100644 --- a/dockerfiles/Dockerfile.openvino +++ b/dockerfiles/Dockerfile.openvino @@ -1,9 +1,9 @@ #------------------------------------------------------------------------- -# 
Copyright(C) 2021-2023 Intel Corporation. +# Copyright(C) 2021-2024 Intel Corporation. # SPDX-License-Identifier: MIT #-------------------------------------------------------------------------- -ARG OPENVINO_VERSION=2023.0.0 +ARG OPENVINO_VERSION=2024.0.0 # Build stage @@ -17,7 +17,7 @@ ARG DEVICE=CPU_FP32 ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git ARG ONNXRUNTIME_BRANCH=main -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake +ENV OpenVINO_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake USER root RUN apt update; apt install -y git protobuf-compiler libprotobuf-dev diff --git a/dockerfiles/Dockerfile.openvino-centos7 b/dockerfiles/Dockerfile.openvino-centos7 deleted file mode 100755 index 697db44801e3b..0000000000000 --- a/dockerfiles/Dockerfile.openvino-centos7 +++ /dev/null @@ -1,105 +0,0 @@ -#------------------------------------------------------------------------- -# Copyright(C) 2021 Intel Corporation. -# SPDX-License-Identifier: MIT -#-------------------------------------------------------------------------- - -FROM centos:7.8.2003 - -WORKDIR /code - -ARG MY_ROOT=/code -ARG YUM_OV_PACKAGE=intel-openvino-runtime-centos7-2021.4.752.x86_64 -ARG DEVICE=CPU_FP32 -ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main - -ENV INTEL_OPENVINO_DIR=/opt/intel/openvino_2021.4.752 -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/share -ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/lib/intel64 -ENV ngraph_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/ngraph/cmake -ENV 
LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/inference_engine/external/gna/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/mkltiny_lnx/lib:$INTEL_OPENVINO_DIR/deployment_tools/ngraph/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/omp/lib:${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} -ENV OpenCV_DIR=${INTEL_OPENVINO_DIR}/opencv/share/OpenCV -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/opencv/lib:${INTEL_OPENVINO_DIR}/opencv/share/OpenCV/3rdparty/lib:${LD_LIBRARY_PATH} -ENV HDDL_INSTALL_DIR=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/deployment_tools/inference_engine/external/hddl/lib:$LD_LIBRARY_PATH -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/local/lib64:/usr/lib64:/lib64:$LD_LIBRARY_PATH - -# Install packages -RUN yum update -y && \ - yum groupinstall "Development Tools" -y && \ - yum install -y yum-utils autoconf automake libtool unzip udev wget zlib-devel libffi-devel openssl-devel boost-devel-1.53.0 && \ - yum clean packages && yum clean all && rm -rf /var/cache/yum && \ -# Install cmake - cd $MY_ROOT && \ - wget https://github.com/Kitware/CMake/releases/download/v3.27.3/cmake-3.27.3.tar.gz && \ - tar -zxvf cmake-3.27.3.tar.gz && rm -rf cmake-3.27.3.tar.gz && \ - cd cmake-3.27.3 && \ - ./bootstrap && \ - make && \ - make install && \ - cd $MY_ROOT && \ -# libusb1.0.22 - cd /opt/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \ - unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /opt/libusb-1.0.22 && \ -# bootstrap steps - ./bootstrap.sh && \ - ./configure --disable-udev --enable-shared && \ - make -j4 && \ -# configure libusb1.0.22 - cd /opt/libusb-1.0.22/libusb && \ - /bin/mkdir -p '/usr/local/lib' && \ - /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \ - /bin/mkdir -p '/usr/local/include/libusb-1.0' && \ - 
/usr/bin/install -c -m 644 libusb.h '/usr/local/include/libusb-1.0' && \ - /bin/mkdir -p '/usr/local/lib/pkgconfig' && \ -# Install openvino - yum-config-manager --add-repo https://yum.repos.intel.com/openvino/2021/setup/intel-openvino-2021.repo && \ - rpm --import https://yum.repos.intel.com/openvino/2021/setup/RPM-GPG-KEY-INTEL-OPENVINO-2021 && \ - yum update -y && yum list intel-openvino* && \ - yum install -y $YUM_OV_PACKAGE && \ - cd ${INTEL_OPENVINO_DIR}/install_dependencies/ && ./install_openvino_dependencies.sh -y && \ - printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2021.4.752/bin/setupvars.sh && \ - cd /opt/libusb-1.0.22 && \ - /usr/bin/install -c -m 644 libusb-1.0.pc '/usr/local/lib/pkgconfig' && \ - cp /opt/intel/openvino_2021/deployment_tools/inference_engine/external/97-myriad-usbboot.rules /etc/udev/rules.d/ && \ - ldconfig && \ -# Install GPU runtime and drivers - cd ${MY_ROOT} && \ - mkdir /tmp/opencl && \ - cd /tmp/opencl && \ - yum install -y epel-release && \ - yum install -y ocl-icd ocl-icd-devel && \ - wget -O intel-igc-core-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-core-1.0.2597-1.el7.x86_64.rpm/download && \ - wget -O intel-opencl-19.41.14441-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-opencl-19.41.14441-1.el7.x86_64.rpm/download && \ - wget -O intel-igc-opencl-devel-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-opencl-devel-1.0.2597-1.el7.x86_64.rpm/download && \ - wget -O intel-igc-opencl-1.0.2597-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-igc-opencl-1.0.2597-1.el7.x86_64.rpm/download && \ - wget -O intel-gmmlib-19.3.2-1.el7.x86_64.rpm 
https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-gmmlib-19.3.2-1.el7.x86_64.rpm/download && \ - wget -O intel-gmmlib-devel-19.3.2-1.el7.x86_64.rpm https://sourceforge.net/projects/intel-compute-runtime/files/19.41.14441/centos-7/intel-gmmlib-devel-19.3.2-1.el7.x86_64.rpm/download && \ - rpm -i /tmp/opencl/*.rpm && \ - ldconfig && \ - rm -rf /tmp/opencl && \ -# Installing gcc-10 - yum install -y centos-release-scl && \ - yum install -y devtoolset-10-gcc* && \ - echo 'source scl_source enable devtoolset-10' >> ~/.bashrc && \ -# python installation - source scl_source enable devtoolset-10 && \ - cd /code/ && \ - wget https://www.python.org/ftp/python/3.8.3/Python-3.8.3.tgz && tar xvf Python-3.8.3.tgz && \ - cd Python-3.8*/ && ./configure && make && make install && \ - cd ../ && mkdir -p /usr/bin/Python38 && ln -s Python-3.8.3/ /usr/bin/Python38 && \ -# installing dependancies - yum install -y python3-lxml python3-six libusb.x86_64 && \ - yum clean packages && yum clean all && rm -rf /var/cache/yum && \ -# Build onnxruntime - cd $MY_ROOT && \ - pip3 install numpy wheel setuptools cython && \ - git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ - pip3 install onnx && \ - cd /code/onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \ - pip3 install /code/onnxruntime/build/Linux/Release/dist/*-linux_x86_64.whl && \ -# Clean up - cd $MY_ROOT && rm -rf onnxruntime Python-3* && \ - cd ${MY_ROOT}/ && rm -rf cmake* && \ - cd /usr/share/ && rm -rf gcc* && cd /usr/lib/ && rm -rf gcc cd && rm -rf .cache && \ - cd ${INTEL_OPENVINO_DIR}/ && rm -rf documentation data_processing && cd deployment_tools/ && rm -rf tools diff --git a/dockerfiles/Dockerfile.openvino-csharp b/dockerfiles/Dockerfile.openvino-csharp deleted file mode 100644 index 2529ef4b73209..0000000000000 --- a/dockerfiles/Dockerfile.openvino-csharp +++ 
/dev/null @@ -1,90 +0,0 @@ -#------------------------------------------------------------------------- -# Copyright(C) 2021-2023 Intel Corporation. -# SPDX-License-Identifier: MIT -#-------------------------------------------------------------------------- - -ARG OPENVINO_VERSION=2023.0.0 - -# Build stage -FROM openvino/ubuntu20_runtime:${OPENVINO_VERSION} AS base - -ENV WORKDIR_PATH=/home/openvino -WORKDIR $WORKDIR_PATH -ENV DEBIAN_FRONTEND noninteractive - -USER root -RUN apt update; apt install -y --no-install-recommends wget gnupg && \ - rm -rf /var/lib/apt/lists/* - -# Install Mono -RUN wget http://download.mono-project.com/repo/xamarin.gpg && apt-key add xamarin.gpg && rm xamarin.gpg && \ - echo "deb https://download.mono-project.com/repo/ubuntu stable-bionic main" | tee /etc/apt/sources.list.d/mono-official-stable.list && \ - apt update -y && \ - apt install -y mono-devel - -# Install nuget.exe -RUN wget https://dist.nuget.org/win-x86-commandline/latest/nuget.exe && \ - mv nuget.exe /usr/local/bin/nuget.exe && \ - echo 'mono /usr/local/bin/nuget.exe $@' > /usr/local/bin/nuget && \ - chmod a+x /usr/local/bin/nuget - -# Install .NET core -RUN wget https://packages.microsoft.com/config/ubuntu/20.04/packages-microsoft-prod.deb -O packages-microsoft-prod.deb && \ - dpkg -i packages-microsoft-prod.deb && \ - apt-get update -y &&\ - apt-get install -y apt-transport-https && \ - apt-get update -y && \ - apt-get install -y dotnet-sdk-5.0 - -# Build stage -FROM base AS builder - -ENV WORKDIR_PATH=/home/openvino -WORKDIR $WORKDIR_PATH -ENV DEBIAN_FRONTEND noninteractive - -ARG DEVICE=CPU_FP32 -ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime.git -ARG ONNXRUNTIME_BRANCH=main - -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake -ENV LANG en_US.UTF-8 - -USER root -RUN apt update; apt install -y --no-install-recommends git protobuf-compiler libprotobuf-dev ca-certificates unattended-upgrades && \ - unattended-upgrade && \ - rm -rf 
/var/lib/apt/lists/* - -RUN git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} -RUN /bin/sh onnxruntime/dockerfiles/scripts/install_common_deps.sh -RUN ln -s cmake-* cmake-dir -RUN python3 -m pip install wheel -ENV PATH=${WORKDIR_PATH}/cmake-dir/bin:$PATH -RUN pip3 install onnx -RUN ln -s /usr/bin/python3 /usr/bin/python -RUN apt install locales && \ - locale-gen en_US en_US.UTF-8 && \ - dpkg-reconfigure locales -RUN cd onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_nuget --build_shared_lib -RUN cp /home/openvino/onnxruntime/build/Linux/Release/Microsoft.ML.OnnxRuntime.Managed* /home/openvino/onnxruntime/build/Linux/Release/nuget-artifacts - -# Deploy stage -FROM base - -ENV DEBIAN_FRONTEND noninteractive -USER root - -RUN apt update; apt install -y unattended-upgrades fonts-freefont-ttf && \ - unattended-upgrade -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -RUN usermod -a -G video,users ${BUILD_USER} -ENV WORKDIR_PATH /home/${BUILD_USER} -WORKDIR ${WORKDIR_PATH} -COPY --from=builder /home/openvino/onnxruntime/build/Linux/Release/nuget-artifacts ${WORKDIR_PATH}/nuget-artifacts - -USER ${BUILD_USER} -ENV PATH=${WORKDIR_PATH}/miniconda/bin:${WORKDIR_PATH}/cmake-dir/bin:$PATH -ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/runtime/lib/intel64 -ENV LD_LIBRARY_PATH=/opt/intel/opencl:${INTEL_OPENVINO_DIR}/runtime/3rdparty/tbb/lib:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} diff --git a/dockerfiles/Dockerfile.openvino-rhel8 b/dockerfiles/Dockerfile.openvino-rhel8 deleted file mode 100644 index 5c504cfa553a1..0000000000000 --- a/dockerfiles/Dockerfile.openvino-rhel8 +++ /dev/null @@ -1,87 +0,0 @@ -# Build stage -FROM registry.access.redhat.com/ubi8/ubi:8.4 - -WORKDIR /code - -ARG MY_ROOT=/code -ARG DEVICE=CPU_FP32 -ARG ONNXRUNTIME_REPO=https://github.com/microsoft/onnxruntime -ARG ONNXRUNTIME_BRANCH=main - -ENV 
INTEL_OPENVINO_DIR=/opt/intel/openvino_2022.3.0 - -ENV InferenceEngine_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake -ENV IE_PLUGINS_PATH=${INTEL_OPENVINO_DIR}/runtime/lib/intel64/ -ENV ngraph_DIR=${INTEL_OPENVINO_DIR}/runtime/cmake -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/runtime/3rdparty/tbb/lib/:${IE_PLUGINS_PATH}:${LD_LIBRARY_PATH} -ENV OpenCV_DIR=${INTEL_OPENVINO_DIR}/extras/opencv/cmake -ENV LD_LIBRARY_PATH=${INTEL_OPENVINO_DIR}/extras/opencv/lib:${LD_LIBRARY_PATH} -ENV LD_LIBRARY_PATH=/usr/local/lib:/usr/lib:/usr/local/lib64:/usr/lib64:/lib64:${LD_LIBRARY_PATH} -ENV PATH=${MY_ROOT}/cmake-dir/bin:$PATH - -# Install packages -RUN yum install -y yum-utils autoconf automake libtool unzip udev wget zlib-devel libffi-devel openssl-devel git make gcc && \ - yum clean packages && yum clean all && rm -rf /var/cache/yum && \ -# Install python 3.8 - cd $MY_ROOT && \ - wget https://www.python.org/ftp/python/3.8.9/Python-3.8.9.tgz && tar xvf Python-3.8.9.tgz && rm -rf Python-3.8.9.tgz && \ - cd Python-3.8*/ && ./configure && make && make install && \ - cd ../ && mkdir -p /usr/bin/Python38 && ln -s Python-3.8.9/ /usr/bin/Python38 && ln -s /usr/bin/pip3 /usr/bin/pip && \ -# libusb1.0.22 - cd /opt/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \ - unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /opt/libusb-1.0.22 && \ -# bootstrap steps - ./bootstrap.sh && \ - ./configure --disable-udev --enable-shared && \ - make -j4 && \ -# configure libusb1.0.22 - cd /opt/libusb-1.0.22/libusb && \ - /bin/mkdir -p '/usr/local/lib' && \ - /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \ - /bin/mkdir -p '/usr/local/include/libusb-1.0' && \ - /usr/bin/install -c -m 644 libusb.h '/usr/local/include/libusb-1.0' && \ - /bin/mkdir -p '/usr/local/lib/pkgconfig' && \ -# Install openvino - cd /opt/ && mkdir intel/ && cd intel && \ - wget 
https://storage.openvinotoolkit.org/repositories/openvino/packages/2022.3/linux/l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \ - tar xvf l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \ - rm -rf l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64.tgz && \ - mv l_openvino_toolkit_rhel8_2022.3.0.9052.9752fafe8eb_x86_64 openvino_2022.3.0 && \ - cd ${INTEL_OPENVINO_DIR}/install_dependencies/ && ./install_openvino_dependencies.sh -y && ./install_NEO_OCL_driver.sh -y && \ - printf "\nexport LD_LIBRARY_PATH=\${LD_LIBRARY_PATH}:/usr/local/lib\n" >> /opt/intel/openvino_2022.3.0/setupvars.sh && \ - cd /opt/libusb-1.0.22 && \ - /usr/bin/install -c -m 644 libusb-1.0.pc '/usr/local/lib/pkgconfig' && \ - # MYRIAD plugins are not available for openvino 2022.3.0 release - #cp /opt/intel/openvino_2022.3.0/install_dependencies/97-myriad-usbboot.rules /etc/udev/rules.d/ && \ - ldconfig && \ -#Install protobuf - cd $MY_ROOT && \ - git clone https://github.com/protocolbuffers/protobuf.git && \ - cd protobuf && \ - git checkout v3.16.0 && \ - git submodule update --init --recursive && \ - mkdir build_source && cd build_source && \ - cmake ../cmake -DCMAKE_INSTALL_LIBDIR=lib64 -Dprotobuf_BUILD_SHARED_LIBS=OFF -DCMAKE_INSTALL_PREFIX=/usr -DCMAKE_INSTALL_SYSCONFDIR=/etc -DCMAKE_POSITION_INDEPENDENT_CODE=ON -Dprotobuf_BUILD_TESTS=OFF -DCMAKE_BUILD_TYPE=Release && \ - make -j$(nproc) && \ - make install && \ -# Build onnxruntime - cd $MY_ROOT && \ - pip3 install numpy wheel setuptools cython onnx && \ - git clone --recursive -b ${ONNXRUNTIME_BRANCH} ${ONNXRUNTIME_REPO} && \ - bash onnxruntime/dockerfiles/scripts/install_common_deps.sh && \ - ln -s cmake-* cmake-dir && \ - source /opt/intel/openvino_2022.3.0/setupvars.sh && \ - cd /code/onnxruntime && ./build.sh --allow_running_as_root --config Release --update --build --parallel --use_openvino ${DEVICE} --build_shared_lib --build_wheel && \ - pip3 install 
/code/onnxruntime/build/Linux/Release/dist/*-linux_x86_64.whl && \ -# Clean up - cd ${MY_ROOT} && rm -rf onnxruntime && rm -rf Python-3.8.9 && rm -rf protobuf - -# Deploy stage -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -RUN adduser --uid $BUILD_UID $BUILD_USER -RUN usermod -a -G video,users,render ${BUILD_USER} -ENV WORKDIR_PATH /home/${BUILD_USER} - -WORKDIR ${WORKDIR_PATH} -USER ${BUILD_USER} diff --git a/onnxruntime/core/providers/openvino/backend_manager.cc b/onnxruntime/core/providers/openvino/backend_manager.cc index 330b464ffd1bb..3252603e33389 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.cc +++ b/onnxruntime/core/providers/openvino/backend_manager.cc @@ -1,8 +1,9 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include #include +#include #include "core/providers/shared_library/provider_api.h" #include "contexts.h" @@ -24,15 +25,6 @@ BackendManager::BackendManager(const GlobalContext& global_context, global_context_ = global_context; auto prec_str = GetGlobalContext().precision_str; - if (prec_str == "FP32") { - subgraph_context_.precision = "FP32"; - } else if (prec_str == "FP16") { - subgraph_context_.precision = "FP16"; - } else if (prec_str == "U8") { - subgraph_context_.precision = "U8"; - } else { - throw std::string("Invalid OpenVINO Precision type: " + prec_str); - } // Save the indexes of graph inputs among fused_node's inputDefs // (which also contains initializers). 
@@ -47,7 +39,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, for (auto input : graph_inputs) { auto it = subgraph_context_.input_names.find(input->Name()); if (it == subgraph_context_.input_names.end()) { - throw std::string("Input not found in the input defs list"); + ORT_THROW("Input not found in the input defs list"); } int index = it->second; subgraph_context_.input_indexes.push_back(index); @@ -61,6 +53,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, } subgraph_context_.subgraph_name = fused_node.Name(); model_proto_ = GetModelProtoFromFusedNode(fused_node, subgraph, logger); + std::string device_type = openvino_ep::BackendManager::GetGlobalContext().device_type; if (ModelHasSymbolicInputDims(subgraph)) { subgraph_context_.has_dynamic_input_shape = true; @@ -75,7 +68,7 @@ BackendManager::BackendManager(const GlobalContext& global_context, GetGlobalContext(), subgraph_context_); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " << "Backend created for graph " << subgraph_context_.subgraph_name; @@ -87,12 +80,29 @@ BackendManager::BackendManager(const GlobalContext& global_context, << subgraph_context_.subgraph_name; subgraph_context_.has_dynamic_input_shape = false; + + // OV NPU plugin is supported with fallback to OV CPU upon compilation failures. try { concrete_backend_ = BackendFactory::MakeBackend(*model_proto_, GetGlobalContext(), subgraph_context_); - } catch (std::string const& msg) { - throw msg; + } catch (const OnnxRuntimeException& ex) { + if (device_type.find("NPU") != std::string::npos) { + LOGS_DEFAULT(WARNING) << ex.what(); + LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." 
+ << "Falling back to OV CPU for execution"; + GetGlobalContext().device_type = "CPU"; + GetGlobalContext().precision_str = "FP32"; + try { + concrete_backend_ = BackendFactory::MakeBackend(*model_proto_, + GetGlobalContext(), + subgraph_context_); + } catch (std::string const& msg) { + ORT_THROW(msg); + } + } else { + ORT_THROW(ex.what()); + } } } } @@ -254,8 +264,13 @@ void BackendManager::Compute(OrtKernelContext* context) { LOGS_DEFAULT(INFO) << "Start Compute"; } #endif + // OV NPU doesn't support dynamic shaped model inference. + // if disable_dynamic_shapes is set to true then execution of dynamic model is done + // by rewriting the model to static shaped model at runtime based on input shape. + // disable_dynamic_shapes is always set to true for OV NPU plugin. bool use_dynamic_backend = true; - if (!GetGlobalContext().disable_dynamic_shapes && subgraph_context_.has_dynamic_input_shape && + if (subgraph_context_.has_dynamic_input_shape && + !GetGlobalContext().disable_dynamic_shapes && (GetGlobalContext().device_type.find("CPU") != std::string::npos || GetGlobalContext().device_type.find("GPU") != std::string::npos)) { concrete_backend_->Infer(context); @@ -263,12 +278,11 @@ void BackendManager::Compute(OrtKernelContext* context) { } else if (use_dynamic_backend && subgraph_context_.has_dynamic_input_shape) { std::vector> tensor_shapes = GetInputTensorShapes(ctx); auto key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); - std::shared_ptr dynamic_backend; auto search = backend_map_.find(key); if (search == backend_map_.end()) { LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " - << "Creating concrete backend for key: " << key; + << "Creating dynamic backend for key: " << key; LOGS_DEFAULT(INFO) << "[OpenVINO-EP] " << "Backend created for graph " << subgraph_context_.subgraph_name; auto modelproto_with_concrete_shapes = ReWriteInputShapeInfo(*model_proto_, tensor_shapes); @@ -276,8 +290,22 @@ void BackendManager::Compute(OrtKernelContext* context) { 
dynamic_backend = BackendFactory::MakeBackend(*modelproto_with_concrete_shapes, GetGlobalContext(), subgraph_context_); - } catch (std::string const& msg) { - throw msg; + } catch (const OnnxRuntimeException& ex) { + if (GetGlobalContext().device_type.find("NPU") != std::string::npos) { + LOGS_DEFAULT(WARNING) << ex.what(); + LOGS_DEFAULT(WARNING) << "Model compilation failed at OV NPU." + << "Falling back to OV CPU for execution"; + GetGlobalContext().device_type = "CPU"; + GetGlobalContext().precision_str = "FP32"; + key = MakeMapKeyString(tensor_shapes, GetGlobalContext().device_type); + try { + dynamic_backend = BackendFactory::MakeBackend(*modelproto_with_concrete_shapes, + GetGlobalContext(), + subgraph_context_); + } catch (std::string const& msg) { + ORT_THROW(msg); + } + } } backend_map_.insert({key, dynamic_backend}); } else { diff --git a/onnxruntime/core/providers/openvino/backend_manager.h b/onnxruntime/core/providers/openvino/backend_manager.h index 59bda7ca640ee..376ebea225a2b 100644 --- a/onnxruntime/core/providers/openvino/backend_manager.h +++ b/onnxruntime/core/providers/openvino/backend_manager.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once diff --git a/onnxruntime/core/providers/openvino/backend_utils.cc b/onnxruntime/core/providers/openvino/backend_utils.cc index 50c839017df2a..32b5ad7d5b66d 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.cc +++ b/onnxruntime/core/providers/openvino/backend_utils.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include @@ -11,12 +11,7 @@ #include "core/providers/shared_library/provider_api.h" #include "backend_utils.h" -#if defined(OV_API_20) using Exception = ov::Exception; -#else -using Exception = InferenceEngine::details::InferenceEngineException; -using WaitMode = InferenceEngine::IInferRequest::WaitMode; 
-#endif namespace onnxruntime { namespace openvino_ep { @@ -47,7 +42,6 @@ struct static_cast_int64 { std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, - const SubGraphContext& subgraph_context, std::map>& const_outputs_map) { if (IsCILogEnabled()) { std::cout << "CreateNgraphFunc" << std::endl; @@ -55,28 +49,6 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext const std::string model = model_proto.SerializeAsString(); try { auto cnn_network = global_context.ie_core.ReadModel(model, global_context.onnx_model_path_name); - if ((subgraph_context.precision == "FP16") && - (global_context.device_type.find("NPU") == std::string::npos)) { - // FP16 transformations - ov::pass::ConvertFP32ToFP16 pass_obj; - pass_obj.run_on_model(cnn_network); - cnn_network->validate_nodes_and_infer_types(); - - auto proc = ov::preprocess::PrePostProcessor(cnn_network); - for (size_t i = 0; i < cnn_network->inputs().size(); i++) { - if (cnn_network->inputs()[i].get_element_type() == ov::element::f16) { - proc.input(i).tensor().set_element_type(ov::element::f32); - proc.input(i).preprocess().convert_element_type(ov::element::f16); - } - } - - for (size_t i = 0; i < cnn_network->outputs().size(); i++) { - if (cnn_network->outputs()[i].get_element_type() == ov::element::f16) { - proc.output(i).postprocess().convert_element_type(ov::element::f32); - } - } - cnn_network = proc.build(); - } // Check for Constant Folding if (!global_context.is_wholly_supported_graph) { @@ -103,7 +75,7 @@ CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext #endif return cnn_network; } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } } @@ -127,7 +99,7 @@ GetOutputTensor(Ort::KernelContext& context, size_t batch_size, } auto it = output_names.find(output_name); if (it == output_names.end()) { - throw std::string(log_tag + "Output names mismatch between OpenVINO and ONNX"); + 
ORT_THROW(log_tag + "Output names mismatch between OpenVINO and ONNX"); } int index = it->second; return context.GetOutput(index, output_shape.get(), num_dims); @@ -145,7 +117,7 @@ GetOutputTensor(Ort::KernelContext& context, auto it = output_names.find(output_name); if (it == output_names.end()) { - throw std::string(log_tag + "Output names mismatch between OpenVINO and ONNX"); + ORT_THROW(log_tag + "Output names mismatch between OpenVINO and ONNX"); } int index = it->second; auto shape = node->get_shape(); @@ -204,7 +176,7 @@ void FillOutputsWithConstantData(std::shared_ptr node, Ort::UnownedVal break; } default: - throw std::string(log_tag + "Unsupported output data type"); + ORT_THROW(log_tag + "Unsupported output data type"); } } @@ -232,7 +204,7 @@ void FillInputBlob(OVTensorPtr inputBlob, size_t batch_slice_idx, auto tensor = context.GetInput(subgraph_context.input_names.at(input_name)); auto mem_info = tensor.GetTensorMemoryInfo(); if (mem_info.GetAllocatorName() == OpenVINO_GPU) { - throw std::string(log_tag + "IO Buffering is not enabled, Please enable Input on CPU"); + ORT_THROW(log_tag + "IO Buffering is not enabled, Please enable Input on CPU"); } // Copy input data into OpenVINO's input buffer const char* tensor_data = tensor.GetTensorData(); diff --git a/onnxruntime/core/providers/openvino/backend_utils.h b/onnxruntime/core/providers/openvino/backend_utils.h index 82b0351e87da5..93fa874774469 100644 --- a/onnxruntime/core/providers/openvino/backend_utils.h +++ b/onnxruntime/core/providers/openvino/backend_utils.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -65,7 +65,6 @@ void FillOutputBlob(OVTensorPtr outputBlob, Ort::UnownedValue& output_tensor, std::shared_ptr CreateOVModel(const ONNX_NAMESPACE::ModelProto& model_proto, const GlobalContext& global_context, - const SubGraphContext& subgraph_context, std::map>& const_outputs_map); void 
printPerformanceCounts(const std::vector& performanceMap, diff --git a/onnxruntime/core/providers/openvino/backends/backend_factory.cc b/onnxruntime/core/providers/openvino/backends/backend_factory.cc index c586dd8b38af9..a0f4ce8f843b0 100644 --- a/onnxruntime/core/providers/openvino/backends/backend_factory.cc +++ b/onnxruntime/core/providers/openvino/backends/backend_factory.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include @@ -24,11 +24,11 @@ BackendFactory::MakeBackend(const ONNX_NAMESPACE::ModelProto& model_proto, try { concrete_backend_ = std::make_shared(model_proto, global_context, subgraph_context); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } return concrete_backend_; } else { - throw std::string("[OpenVINO-EP] Backend factory error: Unknown backend type: " + type); + ORT_THROW("[OpenVINO-EP] Backend factory error: Unknown backend type: " + type); } } } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/backends/basic_backend.cc b/onnxruntime/core/providers/openvino/backends/basic_backend.cc index 0779940983aea..69d234a7c55ef 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.cc +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include @@ -79,20 +79,20 @@ BasicBackend::BasicBackend(const ONNX_NAMESPACE::ModelProto& model_proto, subgraph_context_.subgraph_name); LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } else { - ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); + ie_cnn_network_ = CreateOVModel(model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.LoadNetwork( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); 
LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } #endif } else { - ie_cnn_network_ = CreateOVModel(model_proto, global_context_, subgraph_context_, const_outputs_map_); + ie_cnn_network_ = CreateOVModel(model_proto, global_context_, const_outputs_map_); exe_network_ = global_context_.ie_core.LoadNetwork( ie_cnn_network_, hw_target, device_config, subgraph_context_.subgraph_name); LOGS_DEFAULT(INFO) << log_tag << "Loaded model to the plugin"; } } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } inferRequestsQueue_ = std::unique_ptr(new InferRequestsQueue(exe_network_, 1)); @@ -125,21 +125,17 @@ void BasicBackend::PopulateConfigValue(ov::AnyMap& device_config) { if (global_context_.device_type.find("NPU") != std::string::npos) { std::pair device_property; device_property = std::make_pair("NPU_COMPILER_TYPE", "DRIVER"); + + const std::string env_npu_compiler_type = onnxruntime::GetEnvironmentVar("ORT_OPENVINO_NPU_COMPILER_TYPE"); + if (!env_npu_compiler_type.empty()) { + device_property = std::make_pair("NPU_COMPILER_TYPE", env_npu_compiler_type); + } device_config.emplace(ov::device::properties("NPU", device_property)); } } void BasicBackend::EnableCaching() { if (!global_context_.cache_dir.empty()) { - if (global_context_.is_wholly_supported_graph) { -#if defined(OPENVINO_2022_3) -#if defined(_WIN32) || defined(WIN32) || defined(__CYGWIN__) || defined(__MINGW32__) || defined(__BORLANDC__) - _putenv_s("OV_GPU_CACHE_MODEL", "1"); -#else - setenv("OV_GPU_CACHE_MODEL", "1", 1); -#endif -#endif - } LOGS_DEFAULT(INFO) << log_tag << "Enables Caching"; global_context_.ie_core.SetCache(global_context_.cache_dir); } @@ -162,7 +158,7 @@ void BasicBackend::EnableStreams() { (global_context_.device_type.find("HETERO") != std::string::npos) || (global_context_.device_type.find("AUTO") != std::string::npos)) { if (global_context_.num_streams != 1) { - throw(log_tag + "Cannot set NUM_STREAMS to " + std::to_string(global_context_.num_streams) + " for 
device " + global_context_.device_type); + ORT_THROW(log_tag + "Cannot set NUM_STREAMS to " + std::to_string(global_context_.num_streams) + " for device " + global_context_.device_type); } // Do nothing } else { @@ -198,9 +194,9 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque if (input_names.find(onnx_input_name) != input_names.end()) { input_name = onnx_input_name; } else { - throw(log_tag + - "Input names mismatch between OpenVINO and ONNX. " + onnx_input_name + - " doesn't exist in the list of OpenVINO input tensor names"); + ORT_THROW(log_tag + + "Input names mismatch between OpenVINO and ONNX. " + onnx_input_name + + " doesn't exist in the list of OpenVINO input tensor names"); } size_t batch_slice_idx = 0; if (subgraph_context_.has_dynamic_input_shape && @@ -232,14 +228,14 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque try { infer_request->SetTensor(input_name, tensor_ptr); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } else { OVTensorPtr graph_input_blob; try { graph_input_blob = infer_request->GetTensor(input_name); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } FillInputBlob(graph_input_blob, batch_slice_idx, input_name, context, subgraph_context_); } @@ -248,7 +244,7 @@ void BasicBackend::StartAsyncInference(Ort::KernelContext& context, OVInferReque // Start Async inference infer_request->StartAsync(); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } @@ -274,10 +270,10 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe if (input_names.find(onnx_input_name) != input_names.end()) { input_name = onnx_input_name; } else { - throw(log_tag + - "Input names mismatch between OpenVINO and ONNX. " + - onnx_input_name + - " doesn't exist in the list of OpenVINO input tensor names"); + ORT_THROW(log_tag + + "Input names mismatch between OpenVINO and ONNX. 
" + + onnx_input_name + + " doesn't exist in the list of OpenVINO input tensor names"); } input_idx++; // Kernel Context Input Buffer @@ -322,7 +318,7 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe } } if (!output_name_found) { - throw std::string( + ORT_THROW( log_tag + "Output names mismatch between OpenVINO and ONNX. [ONNX Output: ] " + onnx_output_name + " doesn't exist in the list of OpenVINO output tensor names"); @@ -344,7 +340,7 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe try { infer_request->SetTensor(output_name, tensor_ptr); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } } @@ -352,7 +348,7 @@ void BasicBackend::StartRemoteAsyncInference(Ort::KernelContext& context, OVInfe // Start Async inference infer_request->StartAsync(); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } #endif @@ -382,17 +378,18 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe } } if (!output_name_found) { - throw(log_tag + - "Output names mismatch between OpenVINO and ONNX. " - "[ONNX Output: ] " + - onnx_output_name + - " doesn't exist in the " - "list of OpenVINO output tensor names"); + ORT_THROW( + log_tag + + "Output names mismatch between OpenVINO and ONNX. 
" + "[ONNX Output: ] " + + onnx_output_name + + " doesn't exist in the " + "list of OpenVINO output tensor names"); } try { graph_output_blob = infer_request->GetTensor(output_name); } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } size_t batch_size = 1; auto output_tensor = @@ -413,14 +410,14 @@ void BasicBackend::CompleteAsyncInference(Ort::KernelContext& context, OVInferRe auto output_tensor = GetOutputTensor(context, out_name, subgraph_context_.output_names, node); auto mem_info = output_tensor.GetTensorMemoryInfo(); if (mem_info.GetAllocatorName() == OpenVINO_GPU) { - throw(log_tag + "IO Buffering is not supported for constant subgraphs"); + ORT_THROW(log_tag + "IO Buffering is not supported for constant subgraphs"); } else { FillOutputsWithConstantData(node, output_tensor); } } } } catch (const char* msg) { - throw(msg); + ORT_THROW(msg); } } @@ -440,7 +437,7 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { auto output_tensor = GetOutputTensor(context, out_name, subgraph_context_.output_names, node); FillOutputsWithConstantData(node, output_tensor); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } } // Get Output tensors @@ -461,26 +458,26 @@ void BasicBackend::Infer(OrtKernelContext* ctx) { try { StartRemoteAsyncInference(context, infer_request); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } } else { try { StartAsyncInference(context, infer_request); } catch (std::string const& msg) { - throw msg; + ORT_THROW(msg); } } #else try { StartAsyncInference(context, infer_request); - } catch (std::string const& msg) { - throw msg; + } catch (const std::runtime_error& e) { + ORT_THROW(log_tag + " Exception at StartAsyncInference: " + e.what()); } #endif try { CompleteAsyncInference(context, infer_request); - } catch (std::string const& msg) { - throw msg; + } catch (const std::runtime_error& e) { + ORT_THROW(log_tag + " Exception at CompleteAsyncInference: " + e.what()); } // Get Output tensors diff --git 
a/onnxruntime/core/providers/openvino/backends/basic_backend.h b/onnxruntime/core/providers/openvino/backends/basic_backend.h index aa96dadbf0e2d..3502f660bbb20 100644 --- a/onnxruntime/core/providers/openvino/backends/basic_backend.h +++ b/onnxruntime/core/providers/openvino/backends/basic_backend.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once diff --git a/onnxruntime/core/providers/openvino/contexts.h b/onnxruntime/core/providers/openvino/contexts.h index 5f19c71683f24..8701d9f676ffd 100644 --- a/onnxruntime/core/providers/openvino/contexts.h +++ b/onnxruntime/core/providers/openvino/contexts.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -31,6 +31,7 @@ struct GlobalContext { int onnx_opset_version; void* context = 0; bool use_api_2; + std::vector OpenVINO_Version = {}; // Ov Major and OV minor version from OV headers }; // Holds context specific to subgraph. 
@@ -44,7 +45,6 @@ struct SubGraphContext { std::vector input_indexes; std::unordered_map input_names; std::unordered_map output_names; - std::string precision; }; } // namespace openvino_ep diff --git a/onnxruntime/core/providers/openvino/ibackend.h b/onnxruntime/core/providers/openvino/ibackend.h index 8aacce19c14d5..ece855c6167c6 100644 --- a/onnxruntime/core/providers/openvino/ibackend.h +++ b/onnxruntime/core/providers/openvino/ibackend.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc index e3948cc94b348..913440d2fb6ea 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.cc +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include "core/providers/shared_library/provider_api.h" @@ -6,6 +6,7 @@ #include "contexts.h" #include "backend_manager.h" #include "ov_versions/capability.h" +#include "openvino/core/version.hpp" #define MEMCPY_S(dest, src, destsz, srcsz) memcpy(dest, src, std::min(destsz, srcsz)) @@ -25,6 +26,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv global_context_->enable_opencl_throttling = info.enable_opencl_throttling_; global_context_->disable_dynamic_shapes = info.disable_dynamic_shapes_; global_context_->num_of_threads = info.num_of_threads_; + global_context_->OpenVINO_Version = {OPENVINO_VERSION_MAJOR, OPENVINO_VERSION_MINOR}; // to check if target device is available // using ie_core capability GetAvailableDevices to fetch list of devices plugged in @@ -50,8 +52,7 @@ OpenVINOExecutionProvider::OpenVINOExecutionProvider(const OpenVINOExecutionProv device_found = true; break; } - if 
((info.device_type_.find("NPU") != std::string::npos) && - (info.precision_ == "FP16" || info.precision_ == "U8")) { + if (info.device_type_.find("NPU") != std::string::npos) { device_found = true; break; } @@ -113,27 +114,10 @@ OpenVINOExecutionProvider::GetCapability(const GraphViewer& graph_viewer, global_context_->onnx_opset_version = graph_viewer.DomainToVersionMap().at(kOnnxDomain); -#if defined(OPENVINO_2023_0) openvino_ep::GetCapability obj(graph_viewer, global_context_->device_type, - global_context_->precision_str, "V_2023_0"); + global_context_->precision_str); result = obj.Execute(); -#elif defined(OPENVINO_2023_1) - openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - global_context_->precision_str, "V_2023_1"); - result = obj.Execute(); -#elif defined(OPENVINO_2023_2) - openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - global_context_->precision_str, "V_2023_2"); - result = obj.Execute(); -#elif defined(OPENVINO_2023_3) - openvino_ep::GetCapability obj(graph_viewer, - global_context_->device_type, - global_context_->precision_str, "V_2023_3"); - result = obj.Execute(); -#endif global_context_->is_wholly_supported_graph = obj.IsWhollySupportedGraph(); diff --git a/onnxruntime/core/providers/openvino/openvino_execution_provider.h b/onnxruntime/core/providers/openvino/openvino_execution_provider.h index b0c92828d8a38..b0dc881c36f33 100644 --- a/onnxruntime/core/providers/openvino/openvino_execution_provider.h +++ b/onnxruntime/core/providers/openvino/openvino_execution_provider.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -20,7 +20,7 @@ static void print_build_options() { << "you want to build" << std::endl; std::cout << "The different hardware devices that can be added with HETERO/MULTI/AUTO build " - << "are ['CPU','GPU']" + << "are ['CPU','GPU','NPU']" << std::endl; std::cout << "An example of 
how to specify the HETERO or MULTI or AUTO build type. " << "Ex: HETERO:GPU,CPU Ex: MULTI:GPU,CPU Ex: AUTO:GPU,CPU" @@ -48,7 +48,7 @@ static std::vector parseDevices(const std::string& device_string) { print_build_options(); ORT_THROW("Invalid device string: " + device_string); } - std::vector dev_options = {"CPU", "GPU"}; + std::vector dev_options = {"CPU", "GPU", "NPU"}; for (std::string dev : devices) { if (!std::count(dev_options.begin(), dev_options.end(), dev)) { print_build_options(); @@ -98,12 +98,9 @@ struct OpenVINOExecutionProviderInfo { #elif defined OPENVINO_CONFIG_GPU_FP16 device_type_ = "GPU"; precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_NPU_FP16 +#elif defined OPENVINO_CONFIG_NPU device_type_ = "NPU"; - precision_ = "FP16"; -#elif defined OPENVINO_CONFIG_NPU_U8 - device_type_ = "NPU"; - precision_ = "U8"; + precision_ = ""; #elif defined OPENVINO_CONFIG_HETERO || defined OPENVINO_CONFIG_MULTI || defined OPENVINO_CONFIG_AUTO #ifdef DEVICE_NAME #define DEVICE DEVICE_NAME @@ -142,12 +139,9 @@ struct OpenVINOExecutionProviderInfo { } else if (dev_type == "GPU.1_FP16") { device_type_ = "GPU.1"; precision_ = "FP16"; - } else if (dev_type == "NPU_FP16") { - device_type_ = "NPU"; - precision_ = "FP16"; - } else if (dev_type == "NPU_U8") { + } else if (dev_type == "NPU") { device_type_ = "NPU"; - precision_ = "U8"; + precision_ = ""; } else if (dev_type.find("HETERO") == 0 || dev_type.find("MULTI") == 0) { std::vector devices = parseDevices(dev_type); precision_ = "FP16"; diff --git a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc index 068456777bece..17511c54aab86 100644 --- a/onnxruntime/core/providers/openvino/openvino_provider_factory.cc +++ b/onnxruntime/core/providers/openvino/openvino_provider_factory.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include 
"core/providers/shared_library/provider_api.h" @@ -78,7 +78,6 @@ struct OpenVINO_Provider : Provider { // with this value at runtime. bool enable_opencl_throttling = false; // [enable_opencl_throttling]: Enables OpenCL queue throttling for GPU // device (Reduces CPU Utilization when using GPU) - bool disable_dynamic_shapes = false; // [disable_dynamic_shapes]: Execute model with default static shape for optimal performance. void* context = nullptr; if (provider_options_map.find("device_type") != provider_options_map.end()) { @@ -86,7 +85,7 @@ struct OpenVINO_Provider : Provider { std::set ov_supported_device_types = {"CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", - "GPU.0_FP16", "GPU.1_FP16"}; + "GPU.0_FP16", "GPU.1_FP16", "NPU"}; if (!((ov_supported_device_types.find(device_type) != ov_supported_device_types.end()) || (device_type.find("HETERO:") == 0) || (device_type.find("MULTI:") == 0) || @@ -94,7 +93,7 @@ struct OpenVINO_Provider : Provider { ORT_THROW( "[ERROR] [OpenVINO] You have selcted wrong configuration value for the key 'device_type'. " "Select from 'CPU_FP32', 'CPU_FP16', 'GPU_FP32', 'GPU.0_FP32', 'GPU.1_FP32', 'GPU_FP16', " - "'GPU.0_FP16', 'GPU.1_FP16' or from" + "'GPU.0_FP16', 'GPU.1_FP16', 'NPU' or from" " HETERO/MULTI/AUTO options available. \n"); } } @@ -147,12 +146,24 @@ struct OpenVINO_Provider : Provider { bool_flag = ""; } + // [disable_dynamic_shapes]: Rewrite dynamic shaped models to static shape at runtime and execute. + // Always true for NPU plugin. 
+ bool disable_dynamic_shapes = false; + if (device_type.find("NPU") != std::string::npos) { + disable_dynamic_shapes = true; + } if (provider_options_map.find("disable_dynamic_shapes") != provider_options_map.end()) { bool_flag = provider_options_map.at("disable_dynamic_shapes"); if (bool_flag == "true" || bool_flag == "True") disable_dynamic_shapes = true; - else if (bool_flag == "false" || bool_flag == "False") - disable_dynamic_shapes = false; + else if (bool_flag == "false" || bool_flag == "False") { + if (device_type.find("NPU") != std::string::npos) { + disable_dynamic_shapes = true; + LOGS_DEFAULT(INFO) << "[OpenVINO-EP] The value for the key 'disable_dynamic_shapes' will be set to TRUE for NPU backend.\n "; + } else { + disable_dynamic_shapes = false; + } + } } return std::make_shared(const_cast(device_type.c_str()), enable_npu_fast_compile, diff --git a/onnxruntime/core/providers/openvino/ov_interface.cc b/onnxruntime/core/providers/openvino/ov_interface.cc index ea481791111fc..d7c6654c90f81 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.cc +++ b/onnxruntime/core/providers/openvino/ov_interface.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include "ov_interface.h" @@ -8,12 +8,7 @@ #include "core/providers/shared_library/provider_api.h" #include "backend_utils.h" -#if defined(OV_API_20) using Exception = ov::Exception; -#else -using Exception = InferenceEngine::details::InferenceEngineException; -using WaitMode = InferenceEngine::IInferRequest::WaitMode; -#endif namespace onnxruntime { namespace openvino_ep { @@ -36,9 +31,9 @@ std::shared_ptr OVCore::ReadModel(const std::string& model, const std } return FE->convert(inputModel); } catch (const Exception& e) { - throw std::string(log_tag + "[OpenVINO-EP] Exception while Reading network: " + std::string(e.what())); + ORT_THROW(log_tag + "[OpenVINO-EP] Exception while Reading network: " + 
std::string(e.what())); } catch (...) { - throw std::string(log_tag + "[OpenVINO-EP] Unknown exception while Reading network"); + ORT_THROW(log_tag + "[OpenVINO-EP] Unknown exception while Reading network"); } } @@ -81,9 +76,9 @@ OVExeNetwork OVCore::LoadNetwork(std::shared_ptr& ie_cnn_network, OVExeNetwork exe(obj); return exe; } catch (const Exception& e) { - throw std::string(log_tag + " Exception while Loading Network for graph: " + name + e.what()); + ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); } catch (...) { - throw std::string(log_tag + " Exception while Loading Network for graph " + name); + ORT_THROW(log_tag + " Exception while Loading Network for graph " + name); } } @@ -113,9 +108,9 @@ OVExeNetwork OVCore::LoadNetwork(std::shared_ptr& model, OVRemoteCont auto obj = oe.compile_model(model, *context); return OVExeNetwork(obj); } catch (const Exception& e) { - throw std::string(log_tag + " Exception while Loading Network for graph: " + name + e.what()); + ORT_THROW(log_tag + " Exception while Loading Network for graph: " + name + e.what()); } catch (...) { - throw std::string(log_tag + " Exception while Loading Network for graph " + name); + ORT_THROW(log_tag + " Exception while Loading Network for graph " + name); } } #endif @@ -135,9 +130,9 @@ OVInferRequest OVExeNetwork::CreateInferRequest() { OVInferRequest inf_obj(infReq); return inf_obj; } catch (const Exception& e) { - throw std::string(log_tag + "Exception while creating InferRequest object: " + e.what()); + ORT_THROW(log_tag + "Exception while creating InferRequest object: " + e.what()); } catch (...) 
{ - throw std::string(log_tag + "Exception while creating InferRequest object."); + ORT_THROW(log_tag + "Exception while creating InferRequest object."); } } @@ -147,9 +142,9 @@ OVTensorPtr OVInferRequest::GetTensor(const std::string& input_name) { OVTensorPtr blob = std::make_shared(tobj); return blob; } catch (const Exception& e) { - throw std::string(log_tag + " Cannot access IE Blob for input: " + input_name + e.what()); + ORT_THROW(log_tag + " Cannot access IE Blob for input: " + input_name + e.what()); } catch (...) { - throw std::string(log_tag + " Cannot access IE Blob for input: " + input_name); + ORT_THROW(log_tag + " Cannot access IE Blob for input: " + input_name); } } @@ -157,9 +152,9 @@ void OVInferRequest::SetTensor(const std::string& name, OVTensorPtr& blob) { try { ovInfReq.set_tensor(name, *(blob.get())); } catch (const Exception& e) { - throw std::string(log_tag + " Cannot set Remote Blob for output: " + name + e.what()); + ORT_THROW(log_tag + " Cannot set Remote Blob for output: " + name + e.what()); } catch (...) { - throw std::string(log_tag + " Cannot set Remote Blob for output: " + name); + ORT_THROW(log_tag + " Cannot set Remote Blob for output: " + name); } } @@ -167,9 +162,9 @@ void OVInferRequest::StartAsync() { try { ovInfReq.start_async(); } catch (const Exception& e) { - throw std::string(log_tag + " Couldn't start Inference: " + e.what()); + ORT_THROW(log_tag + " Couldn't start Inference: " + e.what()); } catch (...) { - throw std::string(log_tag + " In Error Couldn't start Inference"); + ORT_THROW(log_tag + " In Error Couldn't start Inference"); } } @@ -177,9 +172,9 @@ void OVInferRequest::Infer() { try { ovInfReq.infer(); } catch (const Exception& e) { - throw std::string(log_tag + " Couldn't start Inference: " + e.what()); + ORT_THROW(log_tag + " Couldn't start Inference: " + e.what()); } catch (...) 
{ - throw std::string(log_tag + " In Error Couldn't start Inference"); + ORT_THROW(log_tag + " In Error Couldn't start Inference"); } } @@ -187,9 +182,9 @@ void OVInferRequest::WaitRequest() { try { ovInfReq.wait(); } catch (const Exception& e) { - throw std::string(log_tag + " Wait Model Failed: " + e.what()); + ORT_THROW(log_tag + " Wait Model Failed: " + e.what()); } catch (...) { - throw std::string(log_tag + " Wait Mode Failed"); + ORT_THROW(log_tag + " Wait Mode Failed"); } } diff --git a/onnxruntime/core/providers/openvino/ov_interface.h b/onnxruntime/core/providers/openvino/ov_interface.h index cf4d867d4df55..2a13fafb99fd3 100644 --- a/onnxruntime/core/providers/openvino/ov_interface.h +++ b/onnxruntime/core/providers/openvino/ov_interface.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -6,14 +6,11 @@ #include #include -#define OV_API_20 #include "openvino/openvino.hpp" #include "openvino/pass/convert_fp32_to_fp16.hpp" #include "openvino/frontend/manager.hpp" #ifdef IO_BUFFER_ENABLED -#include -#include #include #endif diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.cc b/onnxruntime/core/providers/openvino/ov_versions/capability.cc index 11c8a1629b073..3970bf6ff68a7 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) 2019- Intel Corporation // Licensed under the MIT License #include "core/providers/shared_library/provider_api.h" @@ -6,6 +6,7 @@ #include "../backend_manager.h" #include "capability.h" #include "utils.h" +#include "openvino/core/version.hpp" #if defined(_MSC_VER) #pragma warning(disable : 4244 4245 5208) @@ -25,20 +26,22 @@ namespace openvino_ep { // Constructor GetCapability::GetCapability(const GraphViewer& graph_viewer_param, const std::string 
device_type_param, - const std::string device_precision, - const std::string version_param) + const std::string device_precision) : graph_viewer_(graph_viewer_param), device_type_(device_type_param), device_precision_(device_precision) { - if (version_param == "V_2023_0") { - data_ops_ = new DataOps(graph_viewer_, V_2023_0, device_type_, device_precision_); - } else if (version_param == "V_2023_1") { - data_ops_ = new DataOps(graph_viewer_, V_2023_1, device_type_, device_precision_); - } else if (version_param == "V_2023_2") { - data_ops_ = new DataOps(graph_viewer_, V_2023_2, device_type_, device_precision_); - } else if (version_param == "V_2023_3") { - data_ops_ = new DataOps(graph_viewer_, V_2023_3, device_type_, device_precision_); - } else { - data_ops_ = new DataOps(graph_viewer_, V_2023_3, device_type_, device_precision_); + if (device_type_.find("NPU") != std::string::npos) { + device_type_ = "CPU_FP32"; } +#if OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 1 + data_ops_ = new DataOps(graph_viewer_, V_2023_1, device_type_, device_precision_); +#elif OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 2 + data_ops_ = new DataOps(graph_viewer_, V_2023_2, device_type_, device_precision_); +#elif OPENVINO_VERSION_MAJOR == 2023 && OPENVINO_VERSION_MINOR == 3 + data_ops_ = new DataOps(graph_viewer_, V_2023_3, device_type_, device_precision_); +#elif OPENVINO_VERSION_MAJOR == 2024 && OPENVINO_VERSION_MINOR == 0 + data_ops_ = new DataOps(graph_viewer_, V_2024_0, device_type_, device_precision_); +#else + data_ops_ = new DataOps(graph_viewer_, V_2024_0, device_type_, device_precision_); +#endif } std::vector> GetCapability::Execute() { diff --git a/onnxruntime/core/providers/openvino/ov_versions/capability.h b/onnxruntime/core/providers/openvino/ov_versions/capability.h index 2040634cc45d9..d9fe5a95ef833 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/capability.h +++ b/onnxruntime/core/providers/openvino/ov_versions/capability.h 
@@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -21,8 +21,7 @@ class GetCapability { public: GetCapability(const GraphViewer& graph_viewer_param, const std::string device_type_param, - const std::string precision, - const std::string version_param); + const std::string precision); virtual std::vector> Execute(); bool IsWhollySupportedGraph() { return is_wholly_supported_graph_; diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc index e829bf377b195..c7c3e93595719 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.cc @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include @@ -14,6 +14,7 @@ #include "data_ops.h" #include "capability.h" #include "utils.h" +#include "../ov_interface.h" #if defined(_MSC_VER) #pragma warning(disable : 4244 4245 5208) @@ -36,6 +37,7 @@ namespace openvino_ep { std::set ops_supported_only_in_model = { "Add", "Cast", + "Celu", "Concat", "ConstantOfShape", "DequantizeLinear", @@ -46,6 +48,7 @@ std::set ops_supported_only_in_model = { "EyeLike", "GatherElements", "GatherND", + "GridSample", "Identity", "LayerNormalization", "Loop", @@ -72,293 +75,171 @@ std::set ops_supported_only_in_model = { std::set ops_supported_as_function = { "LessOrEqual", "GreaterOrEqual", - "LayerNormalization"}; + "LayerNormalization", + "Celu"}; std::vector supported_op_mode = { {"Abs", V_2020_4, {"CPU", "GPU"}}, - {"Abs", V_2023_0, {"NPU"}}, {"Acos", V_2020_4, {"CPU"}}, {"Acos", V_2022_1, {"GPU"}}, - {"Acos", V_2023_1, {"NPU"}}, {"Acosh", V_2020_4, {"CPU"}}, {"Acosh", V_2022_1, {"GPU"}}, - {"Acosh", V_2023_1, {"NPU"}}, {"Add", V_2020_4, {"CPU", "GPU"}}, - {"Add", V_2023_0, {"NPU"}}, {"And", V_2020_4, {"CPU", "GPU"}}, - 
{"And", V_2023_1, {"NPU"}}, {"ArgMax", V_2020_4, {"CPU"}}, {"ArgMax", V_2021_1, {"GPU"}}, {"ArgMin", V_2020_4, {"CPU"}}, {"ArgMin", V_2022_1, {"GPU"}}, {"Asin", V_2020_4, {"CPU", "GPU"}}, - {"Asin", V_2023_1, {"NPU"}}, {"Asinh", V_2020_4, {"CPU", "GPU"}}, - {"Asinh", V_2023_1, {"NPU"}}, {"Atan", V_2020_4, {"CPU", "GPU"}}, - {"Atan", V_2023_1, {"NPU"}}, {"Atanh", V_2020_4, {"CPU"}}, {"Atanh", V_2022_1, {"GPU"}}, - {"Atanh", V_2023_1, {"NPU"}}, {"AveragePool", V_2020_4, {"CPU", "GPU"}}, - {"AveragePool", V_2023_0, {"NPU"}}, {"BatchNormalization", V_2020_4, {"CPU", "GPU"}}, - {"BatchNormalization", V_2023_0, {"NPU"}}, {"BitShift", V_2022_1, {"CPU"}}, - {"BitShift", V_2023_1, {"NPU"}}, {"Cast", V_2020_4, {"CPU", "GPU"}}, - {"Cast", V_2023_0, {"NPU"}}, - {"CastLike", V_2023_1, {"CPU", "GPU", "NPU"}}, + {"CastLike", V_2023_1, {"CPU", "GPU"}}, {"Ceil", V_2020_4, {"GPU"}}, {"Ceil", V_2021_4, {"CPU"}}, - {"Ceil", V_2023_1, {"NPU"}}, {"Celu", V_2022_1, {"CPU", "GPU"}}, {"Clip", V_2020_4, {"CPU", "GPU"}}, - {"Clip", V_2023_0, {"NPU"}}, {"Compress", V_2023_1, {"CPU", "GPU"}}, {"Concat", V_2020_4, {"CPU", "GPU"}}, - {"Concat", V_2023_0, {"NPU"}}, {"Constant", V_2020_4, {"CPU", "GPU"}}, - {"Constant", V_2023_0, {"NPU"}}, {"ConstantOfShape", V_2020_4, {"CPU", "GPU"}}, - {"ConstantOfShape", V_2023_0, {"NPU"}}, // Gets mapped to broadcast op in the plugin. 
{"Conv", V_2020_4, {"CPU", "GPU"}}, - {"Conv", V_2023_0, {"NPU"}}, {"ConvInteger", V_2022_1, {"CPU", "GPU"}}, - {"ConvInteger", V_2023_1, {"NPU"}}, {"ConvTranspose", V_2020_4, {"CPU", "GPU"}}, - {"ConvTranspose", V_2023_1, {"NPU"}}, {"Cos", V_2020_4, {"CPU"}}, {"Cos", V_2022_1, {"GPU"}}, - {"Cos", V_2023_0, {"NPU"}}, {"Cosh", V_2020_4, {"CPU"}}, {"Cosh", V_2022_1, {"GPU"}}, - {"Cosh", V_2023_1, {"NPU"}}, {"CumSum", V_2022_1, {"CPU", "GPU"}}, - {"CumSum", V_2023_0, {"NPU"}}, {"DepthToSpace", V_2020_4, {"CPU", "GPU"}}, - {"DepthToSpace", V_2023_0, {"NPU"}}, {"DequantizeLinear", V_2021_4, {"CPU", "GPU"}}, - {"DequantizeLinear", V_2023_0, {"NPU"}}, {"Div", V_2020_4, {"CPU", "GPU"}}, - {"Div", V_2023_0, {"NPU"}}, {"Dropout", V_2020_4, {"CPU", "GPU"}}, - {"Dropout", V_2023_0, {"NPU"}}, {"Elu", V_2020_4, {"CPU", "GPU"}}, - {"Elu", V_2023_0, {"NPU"}}, {"Einsum", V_2023_1, {"CPU", "GPU"}}, {"Equal", V_2020_4, {"CPU", "GPU"}}, - {"Equal", V_2023_0, {"NPU"}}, // Added for whisper decoder model. {"Erf", V_2020_4, {"CPU", "GPU"}}, - {"Erf", V_2023_0, {"NPU"}}, {"Exp", V_2020_4, {"CPU", "GPU"}}, - {"Exp", V_2023_0, {"NPU"}}, {"Expand", V_2022_1, {"CPU", "GPU"}}, - {"Expand", V_2023_0, {"NPU"}}, // Gets mapped to broadcast op and multiply op in the plugin. 
{"EyeLike", V_2022_1, {"CPU"}}, - {"EyeLike", V_2023_0, {"NPU"}}, // NoOP {"Flatten", V_2020_4, {"CPU", "GPU"}}, - {"Flatten", V_2023_0, {"NPU"}}, {"Floor", V_2020_4, {"CPU", "GPU"}}, - {"Floor", V_2023_1, {"NPU"}}, {"Gather", V_2020_4, {"CPU", "GPU"}}, - {"Gather", V_2023_0, {"NPU"}}, {"GatherElements", V_2022_2, {"CPU", "GPU"}}, - {"GatherElements", V_2023_1, {"NPU"}}, {"GatherND", V_2021_4, {"CPU", "GPU"}}, - {"GatherND", V_2023_1, {"NPU"}}, + {"Gelu", V_2023_1, {"CPU", "GPU"}}, {"Gemm", V_2020_4, {"CPU", "GPU"}}, - {"Gemm", V_2023_0, {"NPU"}}, {"GlobalAveragePool", V_2020_4, {"CPU", "GPU"}}, - {"GlobalAveragePool", V_2023_0, {"NPU"}}, {"GlobalLpPool", V_2020_4, {"CPU", "GPU"}}, - {"GlobalLpPool", V_2023_1, {"NPU"}}, {"GlobalMaxPool", V_2022_1, {"CPU", "GPU"}}, - {"GlobalMaxPool", V_2023_1, {"NPU"}}, {"Greater", V_2020_4, {"CPU", "GPU"}}, - {"Greater", V_2023_0, {"NPU"}}, {"GreaterOrEqual", V_2022_1, {"CPU", "GPU"}}, - {"GreaterOrEqual", V_2023_0, {"NPU"}}, {"GridSample", V_2022_3, {"CPU"}}, {"GridSample", V_2023_0, {"GPU"}}, - {"GridSample", V_2023_1, {"NPU"}}, - {"HardMax", V_2023_1, {"CPU", "GPU", "NPU"}}, + {"HardMax", V_2023_1, {"CPU", "GPU"}}, {"Identity", V_2020_4, {"CPU", "GPU"}}, - {"Identity", V_2023_0, {"NPU"}}, // NoOP {"If", V_2022_3, {"CPU", "GPU"}}, - {"If", V_2023_1, {"NPU"}}, {"ImageScaler", V_2022_1, {"CPU", "GPU"}}, - {"ImageScaler", V_2023_0, {"NPU"}}, {"InstanceNormalization", V_2020_4, {"CPU", "GPU"}}, - {"InstanceNormalization", V_2023_0, {"NPU"}}, {"HardSigmoid", V_2020_4, {"CPU", "GPU"}}, - {"HardSigmoid", V_2023_1, {"NPU"}}, {"HardMax", V_2022_1, {"CPU", "GPU"}}, + {"LayerNormalization", V_2023_0, {"CPU", "GPU"}}, {"LeakyRelu", V_2020_4, {"CPU", "GPU"}}, - {"LeakyRelu", V_2023_0, {"NPU"}}, {"Less", V_2020_4, {"CPU", "GPU"}}, - {"Less", V_2023_0, {"NPU"}}, // Added for whisper decoder model. 
{"LessOrEqual", V_2022_1, {"CPU", "GPU"}}, - {"LessOrEqual", V_2023_0, {"NPU"}}, {"Log", V_2020_4, {"CPU", "GPU"}}, - {"Log", V_2023_0, {"NPU"}}, {"LogSoftMax", V_2022_1, {"CPU", "GPU"}}, {"Loop", V_2021_4, {"CPU", "GPU"}}, - {"LpNormalization", V_2023_1, {"CPU", "GPU", "NPU"}}, - {"LpPool", V_2023_1, {"CPU", "GPU", "NPU"}}, + {"LpNormalization", V_2023_1, {"CPU", "GPU"}}, {"LRN", V_2020_4, {"CPU", "GPU"}}, - {"LRN", V_2023_0, {"NPU"}}, {"LSTM", V_2020_4, {"CPU", "GPU"}}, - {"LSTM", V_2023_1, {"NPU"}}, {"MatMul", V_2020_4, {"CPU", "GPU"}}, - {"MatMul", V_2023_0, {"NPU"}}, {"MatMulInteger", V_2022_1, {"CPU"}}, - {"MatMulInteger", V_2023_1, {"NPU"}}, {"Max", V_2020_4, {"CPU", "GPU"}}, - {"Max", V_2023_0, {"NPU"}}, {"MaxPool", V_2020_4, {"CPU", "GPU"}}, - {"MaxPool", V_2023_0, {"NPU"}}, {"Mean", V_2020_4, {"CPU", "GPU"}}, - {"Mean", V_2023_0, {"NPU"}}, {"MeanVarianceNormalization", V_2022_1, {"CPU", "GPU"}}, - {"MeanVarianceNormalization", V_2023_1, {"NPU"}}, {"Min", V_2020_4, {"CPU", "GPU"}}, - {"Min", V_2023_0, {"NPU"}}, {"Mod", V_2022_1, {"CPU", "GPU"}}, {"Mul", V_2020_4, {"CPU", "GPU"}}, - {"Mul", V_2023_0, {"NPU"}}, {"Neg", V_2020_4, {"CPU", "GPU"}}, - {"Neg", V_2023_0, {"NPU"}}, {"NonMaxSuppression", V_2021_1, {"CPU", "GPU"}}, - {"NonMaxSuppression", V_2023_1, {"NPU"}}, {"NonZero", V_2021_1, {"CPU"}}, {"NonZero", V_2023_0, {"GPU"}}, {"Not", V_2021_1, {"CPU", "GPU"}}, {"Not", V_2020_4, {"CPU", "GPU"}}, - {"Not", V_2023_1, {"NPU"}}, {"OneHot", V_2020_4, {"CPU", "GPU"}}, - {"OneHot", V_2023_1, {"NPU"}}, {"Or", V_2022_1, {"CPU", "GPU"}}, - {"Or", V_2023_1, {"NPU"}}, {"Pad", V_2020_4, {"CPU", "GPU"}}, - {"Pad", V_2023_0, {"NPU"}}, {"Pow", V_2020_4, {"CPU", "GPU"}}, - {"Pow", V_2023_0, {"NPU"}}, {"PRelu", V_2020_4, {"CPU", "GPU"}}, - {"PRelu", V_2023_0, {"NPU"}}, {"QLinearMatMul", V_2022_3, {"CPU"}}, - // {"QLinearMatMul", V_2023_1, {"NPU"}}, {"QuantizeLinear", V_2021_4, {"CPU", "GPU"}}, - {"QuantizeLinear", V_2023_0, {"NPU"}}, {"RNN", V_2023_1, {"CPU", "GPU"}}, 
{"RandomNormalLike", V_2023_0, {"CPU", "GPU"}}, {"RandomNormalLike", V_2023_0, {"CPU", "GPU"}}, - {"RandomNormalLike", V_2023_1, {"NPU"}}, {"RandomNormal", V_2023_0, {"CPU", "GPU"}}, - {"RandomNormal", V_2023_1, {"NPU"}}, {"Range", V_2022_1, {"CPU", "GPU"}}, - {"Range", V_2023_0, {"NPU"}}, {"Reciprocal", V_2020_4, {"CPU", "GPU"}}, - {"Reciprocal", V_2023_0, {"NPU"}}, {"ReduceL1", V_2022_1, {"CPU", "GPU"}}, - {"ReduceL1", V_2023_1, {"NPU"}}, {"ReduceL2", V_2022_1, {"CPU", "GPU"}}, - {"ReduceL2", V_2023_1, {"NPU"}}, {"ReduceLogSum", V_2020_4, {"CPU"}}, {"ReduceLogSum", V_2022_1, {"CPU", "GPU"}}, - {"ReduceLogSum", V_2023_1, {"NPU"}}, {"ReduceLogSumExp", V_2022_1, {"CPU", "GPU"}}, - {"ReduceLogSumExp", V_2023_1, {"NPU"}}, {"ReduceMax", V_2020_4, {"CPU", "GPU"}}, - {"ReduceMax", V_2023_1, {"NPU"}}, {"ReduceMean", V_2020_4, {"CPU", "GPU"}}, - {"ReduceMean", V_2023_0, {"NPU"}}, {"ReduceMin", V_2020_4, {"CPU", "GPU"}}, - {"ReduceMin", V_2023_1, {"NPU"}}, {"ReduceProd", V_2020_4, {"CPU"}}, {"ReduceProd", V_2022_1, {"GPU"}}, - {"ReduceProd", V_2023_1, {"NPU"}}, {"ReduceSum", V_2020_4, {"CPU", "GPU"}}, - // {"ReduceSum", V_2023_1, {"NPU"}}, {"ReduceSumSquare", V_2020_4, {"CPU"}}, {"ReduceSumSquare", V_2022_1, {"CPU", "GPU"}}, - {"ReduceSumSquare", V_2023_1, {"NPU"}}, {"Relu", V_2020_4, {"CPU", "GPU"}}, - {"Relu", V_2023_0, {"NPU"}}, {"Resize", V_2020_4, {"CPU"}}, {"Resize", V_2022_1, {"GPU"}}, - {"Resize", V_2023_1, {"NPU"}}, {"Reshape", V_2020_4, {"CPU", "GPU"}}, - {"Reshape", V_2023_0, {"NPU"}}, {"ReverseSequence", V_2022_1, {"CPU", "GPU"}}, {"RoiAlign", V_2021_1, {"CPU", "GPU"}}, - {"RoiAlign", V_2023_1, {"NPU"}}, {"Round", V_2021_4, {"CPU", "GPU"}}, - {"Round", V_2023_1, {"NPU"}}, {"Scatter", V_2022_1, {"CPU", "GPU"}}, - {"Scatter", V_2023_1, {"NPU"}}, {"ScatterElements", V_2022_1, {"CPU", "GPU"}}, - {"ScatterElements", V_2023_1, {"NPU"}}, {"ScatterND", V_2022_1, {"CPU", "GPU"}}, - {"ScatterND", V_2023_1, {"NPU"}}, {"Selu", V_2020_4, {"CPU", "GPU"}}, - {"Selu", V_2023_1, 
{"NPU"}}, {"Shape", V_2020_4, {"CPU", "GPU"}}, - {"Shape", V_2023_0, {"NPU"}}, {"Shrink", V_2022_1, {"CPU", "GPU"}}, - {"Shrink", V_2023_0, {"NPU"}}, {"Sigmoid", V_2020_4, {"CPU", "GPU"}}, - {"Sigmoid", V_2023_0, {"NPU"}}, {"Sign", V_2020_4, {"CPU"}}, {"Sign", V_2022_1, {"GPU"}}, - {"Sign", V_2023_0, {"NPU"}}, {"Sin", V_2022_1, {"CPU", "GPU"}}, - {"Sin", V_2023_0, {"NPU"}}, {"Sinh", V_2020_4, {"CPU"}}, - {"Sinh", V_2023_1, {"NPU"}}, {"Size", V_2022_1, {"CPU", "GPU"}}, - {"Size", V_2023_1, {"NPU"}}, {"Slice", V_2020_4, {"CPU", "GPU"}}, - {"Slice", V_2023_0, {"NPU"}}, {"Softmax", V_2020_4, {"CPU", "GPU"}}, - {"Softmax", V_2023_0, {"NPU"}}, {"Softplus", V_2022_1, {"CPU", "GPU"}}, - {"Softplus", V_2023_0, {"NPU"}}, {"Softsign", V_2022_1, {"CPU", "GPU"}}, {"SpaceToDepth", V_2020_4, {"CPU", "GPU"}}, - {"SpaceToDepth", V_2023_0, {"NPU"}}, {"Split", V_2020_4, {"CPU", "GPU"}}, - {"Split", V_2023_0, {"NPU"}}, {"Sqrt", V_2020_4, {"CPU", "GPU"}}, - {"Sqrt", V_2023_0, {"NPU"}}, {"Squeeze", V_2020_4, {"CPU", "GPU"}}, - {"Squeeze", V_2023_0, {"NPU"}}, {"Softsign", V_2020_4, {"CPU"}}, {"Sub", V_2020_4, {"CPU", "GPU"}}, - {"Sub", V_2023_0, {"NPU"}}, {"Sum", V_2020_4, {"CPU", "GPU"}}, - {"Sum", V_2023_0, {"NPU"}}, {"Tan", V_2020_4, {"CPU", "GPU"}}, - {"Tan", V_2023_1, {"NPU"}}, {"Tanh", V_2020_4, {"CPU", "GPU"}}, - {"Tanh", V_2023_0, {"NPU"}}, {"ThresholdedRelu", V_2022_1, {"CPU", "GPU"}}, - {"ThresholdedRelu", V_2023_0, {"NPU"}}, {"Tile", V_2021_3, {"CPU", "GPU"}}, - {"Tile", V_2023_0, {"NPU"}}, {"Transpose", V_2020_4, {"CPU", "GPU"}}, - {"Transpose", V_2023_0, {"NPU"}}, {"Trilu", V_2023_0, {"CPU", "GPU"}}, - {"Trilu", V_2023_1, {"NPU"}}, {"TopK", V_2020_4, {"CPU", "GPU"}}, - {"TopK", V_2023_0, {"NPU"}}, {"Upsample", V_2020_4, {"CPU", "GPU"}}, {"Unsqueeze", V_2020_4, {"CPU", "GPU"}}, - {"Unsqueeze", V_2023_0, {"NPU"}}, {"Where", V_2022_1, {"CPU", "GPU"}}, - {"Where", V_2023_0, {"NPU"}}, // Added for whisper decoder model. 
{"Xor", V_2022_1, {"CPU", "GPU"}}, - {"Xor", V_2023_1, {"NPU"}}, }; void DataOps::populate_types_supported() { @@ -370,6 +251,8 @@ void DataOps::populate_types_supported() { std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32)); supported_types_initializer_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT64)); + supported_types_initializer_.insert( + std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)); supported_types_initializer_.insert( std::make_pair(V_2021_1, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_FLOAT16)); supported_types_initializer_.insert( @@ -387,6 +270,8 @@ void DataOps::populate_types_supported() { std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8)); supported_types_npu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16)); + supported_types_npu_.insert( + std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)); supported_types_npu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32)); supported_types_npu_.insert( @@ -402,6 +287,8 @@ void DataOps::populate_types_supported() { std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT32)); supported_types_cpu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT16)); + supported_types_cpu_.insert( + std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_UINT16)); supported_types_cpu_.insert( std::make_pair(V_2020_4, ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8)); supported_types_cpu_.insert( @@ -437,13 +324,12 @@ void DataOps::populate_op_mode_supported() { no_dimension_supported_.push_back({"DequantizeLinear", V_2021_4, {"All"}}); no_dimension_supported_.push_back({"Equal", 
V_2022_1, {"CPU"}}); no_dimension_supported_.push_back({"Equal", V_2023_0, {"GPU"}}); + no_dimension_supported_.push_back({"Expand", V_2023_3, {"CPU"}}); no_dimension_supported_.push_back({"Floor", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Gather", V_2020_4, {"All"}}); - no_dimension_supported_.push_back({"Greater", V_2023_0, {"NPU"}}); no_dimension_supported_.push_back({"Identity", V_2023_0, {"All"}}); no_dimension_supported_.push_back({"Less", V_2022_1, {"CPU"}}); no_dimension_supported_.push_back({"Loop", V_2021_4, {"All"}}); - no_dimension_supported_.push_back({"Max", V_2023_0, {"NPU"}}); no_dimension_supported_.push_back({"Min", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Mul", V_2020_4, {"All"}}); no_dimension_supported_.push_back({"Neg", V_2023_0, {"CPU", "GPU"}}); @@ -476,9 +362,8 @@ void DataOps::populate_op_mode_supported() { { UnsupportedOpMode obj = {{V_2022_1, V_2022_2, V_2022_3}, [this](const Node* node, const InitializedTensorSet&) { - // Abs is not supproted with INT8 or INT32 as input data type on GPU and NPU - if ((device_id_.find("GPU") != std::string::npos) || - (device_id_.find("NPU") != std::string::npos)) { + // Abs is not supproted with INT8 or INT32 as input data type on GPU + if ((device_id_.find("GPU") != std::string::npos)) { for (size_t i = 0; i < node->InputDefs().size(); i++) { if (node->InputDefs()[i]->TypeAsProto()->tensor_type().elem_type() == ONNX_NAMESPACE::TensorProto_DataType::TensorProto_DataType_INT8 || @@ -706,7 +591,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"PRelu", obj}); } { - UnsupportedOpMode obj = {{V_2023_0, V_2023_1, V_2023_2, V_2023_3}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0}, [this](const Node* node, const InitializedTensorSet&) { const auto& input_arg = node->InputDefs()[1]; auto shape = input_arg->Shape(); @@ -821,7 +706,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Squeeze", obj}); } { - UnsupportedOpMode obj = 
{{V_2023_0, V_2023_1, V_2023_2, V_2023_3}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0}, [this](const Node* node, const InitializedTensorSet&) { // If the operator is unsqueeze // If axes is an input, then we cannot produce a static graph. @@ -836,7 +721,7 @@ void DataOps::populate_op_mode_supported() { op_list_.insert({"Unsqueeze", obj}); } { - UnsupportedOpMode obj = {{V_2023_0, V_2023_1, V_2023_2, V_2023_3}, + UnsupportedOpMode obj = {{V_2023_1, V_2023_2, V_2023_3, V_2024_0}, [this](const Node* node, const InitializedTensorSet&) { // check for attributes auto& upsample_attr = node->GetAttributes(); @@ -961,7 +846,7 @@ bool DataOps::type_is_supported(const NodeArg* node_arg, bool is_initializer) { } else { auto dtype = type_proto->tensor_type().elem_type(); - if (device_id_.find("NPU") != std::string::npos || device_id_.find("HETERO") != std::string::npos || + if (device_id_.find("HETERO") != std::string::npos || device_id_.find("MULTI") != std::string::npos || device_id_.find("AUTO") != std::string::npos) { for (auto const& var : supported_types_npu_) { if ((var.first <= version_id_) && @@ -1063,8 +948,7 @@ bool DataOps::dimension_unsupported(const Node* node) { return true; } -bool DataOps::node_is_supported(const std::map>& op_map, - const NodeIndex node_idx) { +bool DataOps::node_is_supported(const NodeIndex node_idx) { const auto& node = graph_viewer_.GetNode(node_idx); const auto& optype = node->OpType(); @@ -1174,37 +1058,14 @@ bool DataOps::node_is_supported(const std::mapOpType()); - if (opset == op_map.end()) { -#ifndef NDEBUG - if (openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "Failed in Unsupported onnx model domain" << std::endl; - } -#endif - return false; - } - if (opset->second.find(optype) == opset->second.end() && op_fun == ops_supported_as_function.end()) { -#ifndef NDEBUG - if (openvino_ep::backend_utils::IsDebugEnabled()) { - std::cout << "The operator is not available in OpenVINO ngraph operators list" 
- << "nor the operator is a special ONNX function" - << std::endl; - } -#endif - return false; - } return true; } std::vector DataOps::GetUnsupportedNodeIndices(std::unordered_set& ng_required_initializers) { - const auto ng_supported_ops = GetNgSupportedOps(GetOnnxOpSet(graph_viewer_)); - std::vector unsupported_nodes_idx; for (const auto& node_idx : graph_viewer_.GetNodesInTopologicalOrder()) { - if (node_is_supported(ng_supported_ops, node_idx)) { + if (node_is_supported(node_idx)) { // Collect inputs that are initializers graph_viewer_.GetNode(node_idx)->ForEachDef([&ng_required_initializers, this](const NodeArg& node_arg, bool is_input) { diff --git a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h index 87688601ad692..0990904908111 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/data_ops.h +++ b/onnxruntime/core/providers/openvino/ov_versions/data_ops.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once @@ -26,7 +26,8 @@ enum versionNum { V_2023_0, V_2023_1, V_2023_2, - V_2023_3 + V_2023_3, + V_2024_0 }; using VersionNum = enum versionNum; @@ -67,9 +68,7 @@ class DataOps { bool dimension_unsupported(const Node* node); bool unsupported_op_mode(const Node* node); bool type_is_supported(const NodeArg* node_arg, bool is_initializer); - bool node_is_supported(const std::map>& op_map, - const NodeIndex node_idx); + bool node_is_supported(const NodeIndex node_idx); public: DataOps(const GraphViewer& graph_viewer_param, VersionNum ver, const std::string dev_id, const std::string device_precision) diff --git a/onnxruntime/core/providers/openvino/ov_versions/utils.cc b/onnxruntime/core/providers/openvino/ov_versions/utils.cc index ee0bfddb7dc83..c5ed29df487b4 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/utils.cc +++ b/onnxruntime/core/providers/openvino/ov_versions/utils.cc @@ 
-1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #include "core/providers/shared_library/provider_api.h" @@ -11,14 +11,6 @@ #pragma GCC diagnostic ignored "-Wunused-parameter" #endif -#include "openvino/core/deprecated.hpp" -#define IN_OV_COMPONENT -#define NGRAPH_LEGACY_HEADER_INCLUDED -#include - -#undef NGRAPH_LEGACY_HEADER_INCLUDED -#undef IN_OV_COMPONENT - #if defined(_MSC_VER) #pragma warning(default : 4244 4245) #elif __GNUC__ @@ -95,20 +87,6 @@ int GetOnnxOpSet(const GraphViewer& graph_viewer) { return dm_to_ver.at(kOnnxDomain); } -std::map> GetNgSupportedOps(const int onnx_opset) { - std::map> ng_supported_ops; - OPENVINO_SUPPRESS_DEPRECATED_START - ng_supported_ops.emplace(kOnnxDomain, ngraph::onnx_import::get_supported_operators(onnx_opset, kOnnxDomain)); - - const std::set ng_disabled_ops = {"LSTM"}; // Place-holder for ops not supported. - - for (const auto& disabled_op : ng_disabled_ops) { - ng_supported_ops.at(kOnnxDomain).erase(disabled_op); - } - OPENVINO_SUPPRESS_DEPRECATED_END - return ng_supported_ops; -} - /** * Returns a vector clusters(or node_idx). For each unsupported node, the graph is split into 3 parts. * supported_cluster + (UNsupported_node + rest_of_the_graph). 
This functions returns vector of all supported_clusters by nGraph diff --git a/onnxruntime/core/providers/openvino/ov_versions/utils.h b/onnxruntime/core/providers/openvino/ov_versions/utils.h index b3edeef88dfec..34aa762ba9b67 100644 --- a/onnxruntime/core/providers/openvino/ov_versions/utils.h +++ b/onnxruntime/core/providers/openvino/ov_versions/utils.h @@ -1,4 +1,4 @@ -// Copyright (C) 2019-2022 Intel Corporation +// Copyright (C) Intel Corporation // Licensed under the MIT License #pragma once diff --git a/onnxruntime/python/onnxruntime_pybind_state.cc b/onnxruntime/python/onnxruntime_pybind_state.cc index e5e0e81cb7da8..7b56f0c68427a 100644 --- a/onnxruntime/python/onnxruntime_pybind_state.cc +++ b/onnxruntime/python/onnxruntime_pybind_state.cc @@ -937,6 +937,20 @@ std::unique_ptr CreateExecutionProviderInstance( ORT_THROW("Invalid value passed for disable_dynamic_shapes: ", option.second); } OV_provider_options_map[option.first] = option.second; + } else if (option.first == "enable_dynamic_shapes") { + LOGS_DEFAULT(WARNING) << " Deprecation notice - 'enable_dynamic_shapes' is Deprected. Upgrade the API to disable_dynamic_shapes parameter." 
+ "Please refer https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements to ensure all dependencies are met."; + std::string value; + if (!(option.second == "True" || option.second == "true" || + option.second == "False" || option.second == "false")) { + ORT_THROW("Invalid value passed for enable_dynamic_shapes: ", option.second); + } + if (option.second == "True" || option.second == "true") { + value = "false"; + } else { + value = "true"; + } + OV_provider_options_map["disable_dynamic_shapes"] = value; } else if (option.first == "device_id") { OV_provider_options_map[option.first] = option.second; continue; @@ -967,7 +981,7 @@ std::unique_ptr CreateExecutionProviderInstance( if (!Env::Default().GetEnvironmentVar("INTEL_OPENVINO_DIR").empty()) { ORT_THROW("INTEL_OPENVINO_DIR is set but OpenVINO library wasn't able to be loaded. Please install a supported version of OpenVINO as mentioned in the requirements page (https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements), ensure dependency libraries are in the PATH and your hardware is supported."); } else { - LOGS_DEFAULT(WARNING) << "Failed to create " << type << ". Please reference https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements to ensure all dependencies are met."; + LOGS_DEFAULT(WARNING) << "Failed to create " << type << ". 
Please refer https://onnxruntime.ai/docs/execution-providers/OpenVINO-ExecutionProvider.html#requirements to ensure all dependencies are met."; } } #endif diff --git a/onnxruntime/python/onnxruntime_pybind_state_common.h b/onnxruntime/python/onnxruntime_pybind_state_common.h index 6827f2c9dfd91..22314610dbee9 100644 --- a/onnxruntime/python/onnxruntime_pybind_state_common.h +++ b/onnxruntime/python/onnxruntime_pybind_state_common.h @@ -60,11 +60,8 @@ struct OrtStatus { #elif OPENVINO_CONFIG_GPU_FP16 #define BACKEND_OPENVINO "-OPENVINO_GPU_FP16" -#elif OPENVINO_CONFIG_NPU_FP16 -#define BACKEND_OPENVINO "-OPENVINO_NPU_FP16" - -#elif OPENVINO_CONFIG_NPU_U8 -#define BACKEND_OPENVINO "-OPENVINO_NPU_U8" +#elif OPENVINO_CONFIG_NPU +#define BACKEND_OPENVINO "-OPENVINO_NPU" #elif OPENVINO_CONFIG_MULTI #define BACKEND_OPENVINO "-OPENVINO_MULTI" diff --git a/onnxruntime/test/contrib_ops/activation_op_test.cc b/onnxruntime/test/contrib_ops/activation_op_test.cc index 2a56991ec5af4..061fffa572be2 100644 --- a/onnxruntime/test/contrib_ops/activation_op_test.cc +++ b/onnxruntime/test/contrib_ops/activation_op_test.cc @@ -50,11 +50,15 @@ TEST_F(ActivationOpTest, ParametricSoftplus) { {{"alpha", alpha}, {"beta", beta}}, {}, false); // Disable TensorRT due to result mismatch } +// [TODO] Temporarily ignore this test for OpenVINO +// Fails due to accuracy mismatch +#if !defined(USE_OPENVINO) TEST_F(ActivationOpTest, Gelu) { TestActivationOp( "Gelu", input_values, [](float x) { return x * 0.5f * (1.0f + std::erf(x * static_cast(M_SQRT1_2))); }, {}, {}, false, 1, kMSDomain); } +#endif #if defined(USE_DNNL) std::vector expected_output_bfloat16(const std::vector& input_data) { diff --git a/onnxruntime/test/perftest/ort_test_session.cc b/onnxruntime/test/perftest/ort_test_session.cc index 6e10763d8f293..9743ed18a6cc0 100644 --- a/onnxruntime/test/perftest/ort_test_session.cc +++ b/onnxruntime/test/perftest/ort_test_session.cc @@ -247,7 +247,7 @@ 
OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device if (key == "device_type") { std::set ov_supported_device_types = {"CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU.0_FP32", "GPU.1_FP32", "GPU_FP16", - "GPU.0_FP16", "GPU.1_FP16"}; + "GPU.0_FP16", "GPU.1_FP16", "NPU"}; if (ov_supported_device_types.find(value) != ov_supported_device_types.end()) { ov_options[key] = value; } else if (value.find("HETERO:") == 0) { @@ -260,7 +260,7 @@ OnnxRuntimeTestSession::OnnxRuntimeTestSession(Ort::Env& env, std::random_device ORT_THROW( "[ERROR] [OpenVINO] You have selected a wrong configuration value for the key 'device_type'. " "Select from 'CPU_FP32', 'CPU_FP16', 'GPU_FP32', 'GPU.0_FP32', 'GPU.1_FP32', 'GPU_FP16', " - "'GPU.0_FP16', 'GPU.1_FP16' or from" + "'GPU.0_FP16', 'GPU.1_FP16', 'NPU' or from" " HETERO/MULTI/AUTO options available. \n"); } } else if (key == "device_id") { diff --git a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc index acd513172f95d..d2e883331acd4 100644 --- a/onnxruntime/test/providers/cpu/activation/activation_op_test.cc +++ b/onnxruntime/test/providers/cpu/activation/activation_op_test.cc @@ -697,7 +697,9 @@ TEST(LeakyReluGradInferenceTest, Basic) { // Remove DNNL from running this test because DNNL Gelu op seems not check domain for kernel implementation. // It will run the DNNL Gelu op which only be part of standard of Gelu-20 op. -#if !defined(USE_DNNL) && !defined(USE_QNN) +// [TODO] Temporarily ignore this test for OpenVINO to avoid an exception due to mishandling of the +// approximate parameter. 
Re-enable it later when the issue is fixed +#if !defined(USE_DNNL) && !defined(USE_QNN) && !defined(USE_OPENVINO) TEST_F(ActivationOpTest, ONNX_Gelu) { TestActivationOp( "Gelu", diff --git a/onnxruntime/test/python/onnx_backend_test_series.py b/onnxruntime/test/python/onnx_backend_test_series.py index c48b07422d452..e441230537410 100644 --- a/onnxruntime/test/python/onnx_backend_test_series.py +++ b/onnxruntime/test/python/onnx_backend_test_series.py @@ -140,8 +140,8 @@ def create_backend_test(test_name=None): if backend.supports_device("OPENVINO_CPU_FP16"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_CPU_FP16") - if backend.supports_device("OPENVINO_NPU_FP16"): - current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_NPU_FP16") + if backend.supports_device("OPENVINO_NPU"): + current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_NPU") if backend.supports_device("OPENVINO"): current_failing_tests += apply_filters(filters, "current_failing_tests_OPENVINO_opset18") diff --git a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc index ca089c42032b1..f120bf9968558 100644 --- a/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc +++ b/onnxruntime/test/testdata/onnx_backend_test_series_filters.jsonc @@ -493,9 +493,12 @@ "test_range_float_type_positive_delta_expanded_cpu", // Error but not a failure. "test_scan_sum_cpu", // Disabled due to output mismatch with tolerance. "test_scan9_sum_cpu", // Disabled due to output mismatch with tolerance. 
- "test_reduce_max_bool_inputs_cpu" + "test_reduce_max_bool_inputs_cpu", + "test_gelu_default_1_cpu", // Disabled due to accuracy mismatch + "test_gelu_default_2_cpu" + ], - "current_failing_tests_OPENVINO_NPU_FP16": [ + "current_failing_tests_OPENVINO_NPU": [ "^test_prelu_broadcast", "test_loop11_cpu" ], diff --git a/tools/ci_build/build.py b/tools/ci_build/build.py index fd9f106f7ad9b..3c1bdfc54c12e 100644 --- a/tools/ci_build/build.py +++ b/tools/ci_build/build.py @@ -73,13 +73,14 @@ def _str_to_bool(s): def _openvino_verify_device_type(device_read): - choices = ["CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16"] + choices = ["CPU_FP32", "CPU_FP16", "GPU_FP32", "GPU_FP16", "NPU"] choices1 = [ "CPU_FP32_NO_PARTITION", "CPU_FP16_NO_PARTITION", "GPU_FP32_NO_PARTITION", "GPU_FP16_NO_PARTITION", + "NPU_NO_PARTITION", ] status_hetero = True res = False @@ -94,7 +95,7 @@ def _openvino_verify_device_type(device_read): if len(comma_separated_devices) < 2: print("At least two devices required in Hetero/Multi/Auto Mode") status_hetero = False - dev_options = ["CPU", "GPU"] + dev_options = ["CPU", "GPU", "NPU"] for dev in comma_separated_devices: if dev not in dev_options: status_hetero = False @@ -105,7 +106,7 @@ def invalid_hetero_build(): print("specify the keyword HETERO or MULTI or AUTO followed by the devices ") print("in the order of priority you want to build\n") print("The different hardware devices that can be added in HETERO or MULTI or AUTO") - print("are ['CPU','GPU'] \n") + print("are ['CPU','GPU','NPU'] \n") print("An example of how to specify the hetero build type. Ex: HETERO:GPU,CPU \n") print("An example of how to specify the MULTI build type. Ex: MULTI:GPU,CPU \n") print("An example of how to specify the AUTO build type. 
Ex: AUTO:GPU,CPU \n") @@ -1226,6 +1227,7 @@ def generate_build_tree( "-Donnxruntime_USE_OPENVINO_GPU_FP16=" + ("ON" if args.use_openvino == "GPU_FP16" else "OFF"), "-Donnxruntime_USE_OPENVINO_CPU_FP32=" + ("ON" if args.use_openvino == "CPU_FP32" else "OFF"), "-Donnxruntime_USE_OPENVINO_CPU_FP16=" + ("ON" if args.use_openvino == "CPU_FP16" else "OFF"), + "-Donnxruntime_USE_OPENVINO_NPU=" + ("ON" if args.use_openvino == "NPU" else "OFF"), "-Donnxruntime_USE_OPENVINO_GPU_FP32_NP=" + ("ON" if args.use_openvino == "GPU_FP32_NO_PARTITION" else "OFF"), "-Donnxruntime_USE_OPENVINO_GPU_FP16_NP=" @@ -1234,6 +1236,7 @@ def generate_build_tree( + ("ON" if args.use_openvino == "CPU_FP32_NO_PARTITION" else "OFF"), "-Donnxruntime_USE_OPENVINO_CPU_FP16_NP=" + ("ON" if args.use_openvino == "CPU_FP16_NO_PARTITION" else "OFF"), + "-Donnxruntime_USE_OPENVINO_NPU_NP=" + ("ON" if args.use_openvino == "NPU_NO_PARTITION" else "OFF"), "-Donnxruntime_USE_OPENVINO_HETERO=" + ("ON" if args.use_openvino.startswith("HETERO") else "OFF"), "-Donnxruntime_USE_OPENVINO_DEVICE=" + (args.use_openvino), "-Donnxruntime_USE_OPENVINO_MULTI=" + ("ON" if args.use_openvino.startswith("MULTI") else "OFF"), diff --git a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml index c92fc93abba37..03e0274fc198a 100644 --- a/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml +++ b/tools/ci_build/github/azure-pipelines/linux-openvino-ci-pipeline.yml @@ -32,5 +32,5 @@ jobs: parameters: AgentPool : 'Linux-CPU-2019' JobName: 'Linux_CI_Dev' - RunDockerBuildArgs: '-o ubuntu20.04 -d openvino -v 2023.0.0 -x "--use_openvino CPU_FP32 --build_wheel"' + RunDockerBuildArgs: '-o ubuntu20.04 -d openvino -v 2024.0.0 -x "--use_openvino CPU_FP32 --build_wheel"' TimeoutInMinutes: 120 diff --git a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino index 
a0ba5ea232ca3..45682c797bbb8 100644 --- a/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino +++ b/tools/ci_build/github/linux/docker/Dockerfile.ubuntu_openvino @@ -1,8 +1,8 @@ ARG UBUNTU_VERSION=20.04 FROM ubuntu:${UBUNTU_VERSION} -ARG OPENVINO_VERSION=2023.0.0 -ARG PYTHON_VERSION=3.8 +ARG OPENVINO_VERSION=2024.0.0 +ARG PYTHON_VERSION=3.9 ADD scripts /tmp/scripts RUN /tmp/scripts/install_ubuntu.sh -p ${PYTHON_VERSION} -d EdgeDevice && \ @@ -14,15 +14,14 @@ RUN apt update && apt install -y libnuma1 ocl-icd-libopencl1 && \ ENV INTEL_OPENVINO_DIR /opt/intel/openvino_${OPENVINO_VERSION} ENV LD_LIBRARY_PATH $INTEL_OPENVINO_DIR/runtime/lib/intel64:$INTEL_OPENVINO_DIR/runtime/3rdparty/tbb/lib:/usr/local/openblas/lib:$LD_LIBRARY_PATH -ENV InferenceEngine_DIR $INTEL_OPENVINO_DIR/runtime/cmake -ENV ngraph_DIR $INTEL_OPENVINO_DIR/runtime/cmake +ENV OpenVINO_DIR $INTEL_OPENVINO_DIR/runtime/cmake ENV IE_PLUGINS_PATH $INTEL_OPENVINO_DIR/runtime/lib/intel64 ENV DEBIAN_FRONTEND=noninteractive RUN cd /opt && mkdir -p intel && cd intel && \ - wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2023.0/linux/l_openvino_toolkit_ubuntu20_2023.0.0.10926.b4452d56304_x86_64.tgz && \ - tar xzf l_openvino_toolkit_ubuntu20_2023.0.0.10926.b4452d56304_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu20_2023.0.0.10926.b4452d56304_x86_64.tgz && \ - mv l_openvino_toolkit_ubuntu20_2023.0.0.10926.b4452d56304_x86_64 openvino_2023.0.0 && \ + wget https://storage.openvinotoolkit.org/repositories/openvino/packages/2024.0/linux/l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64.tgz && \ + tar xzf l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64.tgz && rm -rf l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64.tgz && \ + mv l_openvino_toolkit_ubuntu20_2024.0.0.14509.34caeefd078_x86_64 openvino_2024.0.0 && \ cd $INTEL_OPENVINO_DIR/install_dependencies && ./install_openvino_dependencies.sh -y WORKDIR /root diff --git 
a/tools/ci_build/github/linux/docker/Dockerfile_manylinux2014_openvino_multipython b/tools/ci_build/github/linux/docker/Dockerfile_manylinux2014_openvino_multipython deleted file mode 100644 index bc0b412773286..0000000000000 --- a/tools/ci_build/github/linux/docker/Dockerfile_manylinux2014_openvino_multipython +++ /dev/null @@ -1,83 +0,0 @@ -FROM quay.io/pypa/manylinux2014_x86_64:latest - -ENV PATH /opt/rh/devtoolset-10/root/usr/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin -ADD scripts /tmp/scripts -RUN cd /tmp/scripts && /tmp/scripts/manylinux/install_centos.sh && /tmp/scripts/manylinux/install_deps.sh && rm -rf /tmp/scripts - -ARG PYTHON_VER_PATH="cp38-cp38" -ARG PYTHON_VERSION="3.8" -ARG BUILD_UID=1001 -ARG BUILD_USER=onnxruntimedev -ARG OV_DEVICE_PRECISION="CPU_FP32" -ARG ENABLE_TRAINING=true -ARG ORT_BRANCH="rel-1.13.1" -ARG OV_VERSION="2022.2.0" -RUN adduser --uid $BUILD_UID $BUILD_USER -WORKDIR /home/$BUILD_USER -ENV PYTHON_EXE="/opt/python/$PYTHON_VER_PATH/bin/python$PYTHON_VERSION" - -RUN yum -y install wget git - -# libusb1.0.22 -RUN cd /home/ && wget https://github.com/libusb/libusb/archive/v1.0.22.zip && \ - unzip v1.0.22.zip && rm -rf v1.0.22.zip && cd /home/libusb-1.0.22 && \ -# bootstrap steps - ./bootstrap.sh && \ - ./configure --disable-udev --enable-shared && \ - make -j4 && \ -# configure libusb1.0.22 - cd /home/libusb-1.0.22/libusb && \ - /bin/mkdir -p '/usr/local/lib' && \ - /bin/bash ../libtool --mode=install /usr/bin/install -c libusb-1.0.la '/usr/local/lib' && \ - /bin/mkdir -p '/usr/local/include/libusb-1.0' && \ - /usr/bin/install -c -m 644 libusb.h '/usr/local/include/libusb-1.0' && \ - /bin/mkdir -p '/usr/local/lib/pkgconfig' - -RUN ${PYTHON_EXE} -m pip install onnx numpy wheel -USER $BUILD_USER -RUN cd $WORKDIR && git clone https://github.com/openvinotoolkit/openvino.git && \ - cd openvino && \ - git checkout $OV_VERSION && \ - git submodule init && \ - git submodule update --recursive - -RUN cd $WORKDIR && cd 
openvino && mkdir build && cd build && \ - cmake .. -DCMAKE_BUILD_TYPE=Release -DCMAKE_CXX_FLAGS=-D_GLIBCXX_USE_CXX11_ABI=0 -DENABLE_PYTHON=ON -DPYTHON_EXECUTABLE=$PYTHON_EXE -DCMAKE_INSTALL_PREFIX=/home/onnxruntimedev/openvino_$OV_VERSION && \ - make -j8 && make install - -ENV INTEL_OPENVINO_DIR /home/onnxruntimedev/openvino_$OV_VERSION -ENV LD_LIBRARY_PATH $INTEL_OPENVINO_DIR/runtime/lib/intel64:$INTEL_OPENVINO_DIR/runtime/3rdparty/tbb/lib:/usr/local/openblas/lib:$LD_LIBRARY_PATH -ENV TBB_LIBS $INTEL_OPENVINO_DIR/runtime/3rdparty/tbb/lib -ENV InferenceEngine_DIR $INTEL_OPENVINO_DIR/runtime/cmake -ENV ngraph_DIR $INTEL_OPENVINO_DIR/runtime/cmake -ENV IE_PLUGINS_PATH $INTEL_OPENVINO_DIR/runtime/lib/intel64 -ENV OPENVINO_MANYLINUX 1 - -RUN cd $WORKDIR && \ - git clone --recursive -b $ORT_BRANCH https://github.com/intel/onnxruntime.git -RUN cd onnxruntime/onnxruntime/core/providers/openvino && mkdir scripts - -RUN cp ${IE_PLUGINS_PATH}/libopenvino.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_c.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_onnx_frontend.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_intel_cpu_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_intel_gpu_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_intel_myriad_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_hetero_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/libopenvino_auto_plugin.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp 
${IE_PLUGINS_PATH}/plugins.xml /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${IE_PLUGINS_PATH}/usb-ma2x8x.mvcmd /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${TBB_LIBS}/libtbb.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${TBB_LIBS}/libtbb.so.2 /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${TBB_LIBS}/libtbbmalloc.so /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cp ${TBB_LIBS}/libtbbmalloc.so.2 /home/onnxruntimedev/onnxruntime/onnxruntime/core/providers/openvino/scripts/ -RUN cd /home/onnxruntimedev/onnxruntime && git pull -RUN if $ENABLE_TRAINING; then \ - ${PYTHON_EXE} ./onnxruntime/tools/ci_build/build.py \ - --build_dir ./onnxruntime/build --use_openvino $(OV_DEVICE_PRECISION) --build_shared_lib \ - --config Release --build_wheel --skip_tests --enable_training ; \ - else \ - ${PYTHON_EXE} ./onnxruntime/tools/ci_build/build.py \ - --build_dir ./onnxruntime/build --use_openvino $(OV_DEVICE_PRECISION) --build_shared_lib \ - --config Release --build_wheel --skip_tests ;\ - fi diff --git a/tools/nuget/generate_nuspec_for_native_nuget.py b/tools/nuget/generate_nuspec_for_native_nuget.py index d5139f00e2f04..31c920c6e4438 100644 --- a/tools/nuget/generate_nuspec_for_native_nuget.py +++ b/tools/nuget/generate_nuspec_for_native_nuget.py @@ -734,7 +734,7 @@ def generate_files(line_list, args): ) if args.execution_provider == "openvino": - openvino_path = get_env_var("INTEL_OPENVINO_DIR") + get_env_var("INTEL_OPENVINO_DIR") files_list.append( "' ) - if is_windows(): - dll_list_path = os.path.join(openvino_path, "runtime\\bin\\intel64\\Release\\") - tbb_list_path = os.path.join(openvino_path, "runtime\\3rdparty\\tbb\\bin\\") - - for dll_element in os.listdir(dll_list_path): - if dll_element.endswith("dll"): - files_list.append( - "' - ) - - for tbb_element in 
os.listdir(tbb_list_path): - if tbb_element.endswith("dll"): - files_list.append( - "' - ) - if args.execution_provider == "cuda" or is_cuda_gpu_win_sub_package and not is_ado_packaging_build: files_list.append( " actual prompt's length, the benchmark E2E tool will automatically trim the actual prompt's length so that {prompt_length} = actual prompt's length. + 2) You can add a new key-value entry in '{args.prompts_file}' of the form '{prompt_length}': 'your prompt goes here'. + """ + ) + ) + prompt = [size_to_prompt[prompt_length]] * batch_size + csv_metrics = [batch_size, prompt_length] + + try: + # Measure prompt processing + logger.info("Measuring prompt processing...") + inputs, outputs = prepare_model_for_inference(args, model, config, tokenizer, prompt_length, prompt) + accelerator_prompt_latency_s, outputs = run_inference(args, model, args.num_runs, inputs, outputs) + + # Calculate prompt metrics + accelerator_prompt_latency_ms = accelerator_prompt_latency_s * 1000 + accelerator_prompt_thrpt = batch_size * (prompt_length / accelerator_prompt_latency_s) + logger.info(f"Average Latency of Prompt Processing: {accelerator_prompt_latency_ms} ms") + logger.info( + f"Average Throughput of Prompt Processing: {batch_size * (prompt_length / accelerator_prompt_latency_s)} tps" + ) + csv_metrics.extend([accelerator_prompt_latency_ms, accelerator_prompt_thrpt]) + + # Measure token generation + logger.info("Measuring token generation...") + clear_cache() + inputs, outputs = prepare_model_for_inference(args, model, config, tokenizer, prompt_length, prompt) + + all_token_ids = inputs["input_ids"].clone() + current_length = all_token_ids.shape[-1] + num_heads = config.num_key_value_heads + head_size = ( + config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads + ) + + has_eos = torch.zeros(batch_size, device=args.target_device, dtype=torch.bool) + + # 0th entry will have prompt accelerator time, 1st entry onwards will have token 
generation accelerator time + accelerator_times = [] + sampling_times = [] # cost to sample after each model run + + wall_clock_start_time = time.perf_counter() + while current_length <= max_length: + # Run inference + accelerator_time_latency_s, outputs = run_inference(args, model, 1, inputs, outputs) + accelerator_times.append(accelerator_time_latency_s) + + # Sample with argmax (greedy search) + sampling_start_time = time.perf_counter() + if outputs["logits"].shape[1] > 1: + prompt_end_indices = inputs["attention_mask"].sum(1) - 1 + idxs = ( + prompt_end_indices.unsqueeze(dim=1) + .repeat(1, config.vocab_size) + .view(batch_size, 1, config.vocab_size) + ) + next_token_logits = torch.gather(outputs["logits"], 1, idxs).squeeze() + else: + next_token_logits = outputs["logits"][:, -1, :] + next_tokens = torch.argmax(next_token_logits, dim=-1) + + # Check if we previously reached EOS token id or if generated token id is EOS token id + has_eos = has_eos | next_tokens == tokenizer.eos_token_id + + # Determine which new tokens to add to list of all token ids + # Add EOS token ids for batch entries that ended early (ragged batching scenario where some batch entries ended early and some haven't) + tokens_to_add = next_tokens.masked_fill(has_eos, tokenizer.eos_token_id).reshape([batch_size, 1]) + sampling_end_time = time.perf_counter() + sampling_times.append(sampling_end_time - sampling_start_time) + + all_token_ids = torch.cat([all_token_ids, tokens_to_add], dim=-1) + + # Return early if all batch entries have reached EOS token id + current_length += 1 + if torch.all(has_eos) or current_length > max_length: + break + + # Update inputs for next inference run + inputs["input_ids"] = tokens_to_add + inputs["attention_mask"] = torch.cat( + [inputs["attention_mask"], (~has_eos).to(torch.int64).reshape(batch_size, 1)], 1 + ) + inputs["position_ids"] = ( + None + if "position_ids" not in inputs + else torch.max(inputs["position_ids"], dim=1)[0].reshape(batch_size, 1) + 1 + ) + 
+ # Set logits to zeros for next inference run and re-use memory buffer + if outputs["logits"].shape[1] != 1: + outputs["logits"] = outputs["logits"][:, :1, :].contiguous() + outputs["logits"].zero_() + + # Update KV caches for next inference run + if args.engine == "pt": + # Update KV caches for PyTorch + inputs["past_key_values"] = outputs["past_key_values"] + elif not args.use_buffer_share: + # Update KV caches for ONNX Runtime if buffer sharing is not used + for i in range(config.num_hidden_layers): + inputs[f"past_key_values.{i}.key"] = outputs[f"present.{i}.key"] + inputs[f"past_key_values.{i}.value"] = outputs[f"present.{i}.value"] + + new_sequence_length = inputs["attention_mask"].shape[1] + for i in range(config.num_hidden_layers): + present_key = torch.zeros( + batch_size, + num_heads, + new_sequence_length, + head_size, + device=args.target_device, + dtype=args.torch_dtype, + ) + present_value = torch.zeros( + batch_size, + num_heads, + new_sequence_length, + head_size, + device=args.target_device, + dtype=args.torch_dtype, + ) + outputs.update( + { + f"present.{i}.key": present_key.contiguous(), + f"present.{i}.value": present_value.contiguous(), + } + ) + + wall_clock_end_time = time.perf_counter() + + # Filter out any anomaly accelerator times (e.g. for `torch.compile`) + accelerator_times.pop(0) # Remove prompt processing time + if args.anomaly_filtering: + anomaly_threshold_factor = 10 + min_time_s = min(accelerator_times) + orig_size = len(accelerator_times) + accelerator_times = list( + filter(lambda acc_time: acc_time < anomaly_threshold_factor * min_time_s, accelerator_times) + ) + new_size = len(accelerator_times) + logger.info( + f"Filtered out {orig_size - new_size} anomaly accelerator times that are {anomaly_threshold_factor}x greater than {min_time_s * 1000} ms..." 
+ ) + + ####################################################### + # Calculate sampling and first token generated metrics + ####################################################### + + # Calculate sampling metrics + avg_sampling_latency_s = sum(sampling_times) / len(sampling_times) + avg_sampling_latency_ms = avg_sampling_latency_s * 1000 + avg_sampling_thrpt = batch_size * (1 / avg_sampling_latency_s) + logger.info(f"Average Latency of Sampling: {avg_sampling_latency_ms} ms") + logger.info(f"Average Throughput of Sampling: {avg_sampling_thrpt} tps") + + # Calculate first token generated metrics + first_token_latency_s = accelerator_times[0] + first_token_latency_ms = first_token_latency_s * 1000 + first_token_thrpt = batch_size * (1 / first_token_latency_s) + logger.info(f"Latency of First Token Generated: {first_token_latency_ms} ms") + logger.info(f"Throughput of First Token Generated: {first_token_thrpt} tps") + + #################################################### + # Calculate first `halfway` token generated metrics + #################################################### + + halfway = args.generation_length // 2 + halfway_token_latency_s = sum(accelerator_times[:halfway]) / len(accelerator_times[:halfway]) + halfway_token_latency_ms = halfway_token_latency_s * 1000 + halfway_token_thrpt = batch_size * (1 / halfway_token_latency_s) + logger.info(f"Average Latency of First {halfway} Tokens Generated: {halfway_token_latency_ms} ms") + logger.info(f"Average Throughput of First {halfway} Tokens Generated: {halfway_token_thrpt} tps") + + ######################################### + # Calculate all tokens generated metrics + ######################################### + + all_token_latency_s = sum(accelerator_times) / len(accelerator_times) + all_token_latency_ms = all_token_latency_s * 1000 + all_token_thrpt = batch_size * (1 / all_token_latency_s) + logger.info( + f"Average Latency of First {args.generation_length} Tokens Generated: {all_token_latency_ms} ms" + ) + 
logger.info(f"Average Throughput of First {args.generation_length} Tokens Generated: {all_token_thrpt} tps") + + ############################### + # Calculate wall clock metrics + ############################### + + wall_clock_latency_s = wall_clock_end_time - wall_clock_start_time + wall_clock_thrpt = batch_size * ((prompt_length + args.generation_length) / wall_clock_latency_s) + logger.info(f"Wall-Clock Latency: {wall_clock_latency_s} s") + logger.info( + f"Wall-Clock Throughput: {batch_size * ((prompt_length + args.generation_length) / wall_clock_latency_s)} tps" + ) + + # Add metrics to CSV + logger.info("Adding results to CSV") + csv_metrics.extend( + [ + avg_sampling_latency_ms, + avg_sampling_thrpt, + first_token_latency_ms, + first_token_thrpt, + halfway_token_latency_ms, + halfway_token_thrpt, + all_token_latency_ms, + all_token_thrpt, + wall_clock_latency_s, + wall_clock_thrpt, + ] + ) + all_csv_metrics.append(csv_metrics) + + except: # noqa: E722 + logger.info(f"Could not benchmark at batch size = {batch_size}, prompt length = {prompt_length}") + + filename = f"benchmark_{args.engine}_e2e_{datetime.datetime.now():%Y-%m-%d_%H:%M:%S}.csv" + save_results(all_csv_metrics, filename, args.generation_length) + + +if __name__ == "__main__": + main() diff --git a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py index 1ad58327b7fc2..b649f7ab65049 100644 --- a/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py +++ b/onnxruntime/python/tools/transformers/models/llama/convert_to_onnx.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- from __future__ import annotations import argparse diff --git a/onnxruntime/python/tools/transformers/models/llama/dist_settings.py b/onnxruntime/python/tools/transformers/models/llama/dist_settings.py index 72192ce8d8c63..3b53f60758b27 100644 --- a/onnxruntime/python/tools/transformers/models/llama/dist_settings.py +++ b/onnxruntime/python/tools/transformers/models/llama/dist_settings.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- import os import torch.distributed as dist diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py index 18202f4b81c0f..5aed55c12f38f 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_inputs.py @@ -1,8 +1,13 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- from __future__ import annotations import numpy as np import torch -from transformers import AutoConfig +from transformers import AutoConfig, AutoTokenizer from onnxruntime import InferenceSession, OrtValue @@ -269,6 +274,8 @@ def convert_inputs_for_ort( return ort_inputs +# Re-allocate KV caches from (batch_size, num_heads, past_sequence_length, head_size) to +# (batch_size, num_heads, max_sequence_length, head_size) for past-present buffer sharing def enable_past_present_share_buffer(ort_inputs: dict, past_seq_len: int, max_seq_len: int): for k, v in ort_inputs.items(): # Allocate new buffers with max_sequence_length for GQA @@ -281,8 +288,29 @@ def enable_past_present_share_buffer(ort_inputs: dict, past_seq_len: int, max_se return ort_inputs -# Add IO bindings for execution providers -def add_io_bindings( +# Verify ONNX Runtime inputs with model +def verify_ort_inputs(model: InferenceSession, ort_inputs: dict): + # Check that all model inputs will be provided + model_inputs = set(map(lambda model_input: model_input.name, model.get_inputs())) + user_inputs = set(ort_inputs.keys()) + missing_inputs = model_inputs - user_inputs + if len(missing_inputs): + print(f"The following model inputs are missing: {missing_inputs}") + raise Exception("There are missing inputs to the model. 
Please add them and try again.") + + # Remove unnecessary inputs from model inputs + unnecessary_inputs = user_inputs - model_inputs + if len(unnecessary_inputs): + for unnecessary_input in unnecessary_inputs: + print(f"Removing unnecessary input '{unnecessary_input}' from user provided inputs") + del ort_inputs[unnecessary_input] + + return ort_inputs + + +# Add IO bindings for execution providers using OrtValue +# Use when you need to run inference once or twice to save memory +def add_io_bindings_as_ortvalues( model: InferenceSession, ort_inputs: dict, device: str, device_id: int, use_gqa: bool, kv_cache_ortvalues: dict ): io_binding = model.io_binding() @@ -318,3 +346,163 @@ def add_io_bindings( io_binding.bind_output(name, device_type=device, device_id=device_id) return io_binding, kv_cache_ortvalues + + +# Add IO bindings for execution providers using PyTorch tensors +# Use when you need to run inference many times +def add_io_bindings_as_tensors( + model: InferenceSession, inputs: dict, outputs: dict, use_fp16: bool, use_buffer_share: bool +): + # Verify model inputs + inputs = verify_ort_inputs(model, inputs) + + device = None + pt_to_np = { + "torch.int32": np.int32, + "torch.int64": np.int64, + "torch.float16": np.float16, + "torch.float32": np.float32, + } + + # Bind inputs/outputs to IO binding + io_binding = model.io_binding() + for k, v in inputs.items(): + io_binding.bind_input( + name=k, + device_type=v.device.type, + device_id=0 if v.device.type == "cpu" else v.device.index, + element_type=pt_to_np[repr(v.dtype)], + shape=tuple(v.shape), + buffer_ptr=v.data_ptr(), + ) + device = v.device + + for output in model.get_outputs(): + name = output.name + if use_buffer_share and "present" in name: + # Bind KV cache outputs to KV cache inputs + v = inputs[name.replace("present", "past_key_values")] + io_binding.bind_output( + name=name, + device_type=v.device.type, + device_id=v.device.index, + element_type=np.float16, + shape=tuple(v.shape), + 
buffer_ptr=v.data_ptr(), + ) + else: + v = outputs[name] + io_binding.bind_output( + name=name, + device_type=device.type, + device_id=0 if device.type == "cpu" else device.index, + element_type=(np.float16 if use_fp16 else np.float32), + shape=tuple(v.shape), + buffer_ptr=v.data_ptr(), + ) + + return io_binding + + +# Get actual inputs when using real data (instead of sample data) and initialize outputs +def get_initial_inputs_and_outputs( + config: AutoConfig, + tokenizer: AutoTokenizer, + requested_length: int, + prompt: list[str], + device: torch.device, + use_fp16: bool, + use_buffer_share: bool, + engine: str, +): + tokenizer.pad_token = "[PAD]" + encodings_dict = tokenizer.batch_encode_plus(prompt, padding=True) + torch_dtype = torch.float16 if use_fp16 else torch.float32 + + # input_ids: pad token id is 0 + # attention_mask: pad token id is 0 + # position_ids: pad token id is 1 + input_ids = torch.tensor(encodings_dict["input_ids"], device=device, dtype=torch.int64) + attention_mask = torch.tensor(encodings_dict["attention_mask"], device=device, dtype=torch.int64) + position_ids = get_position_ids(attention_mask, use_past_kv=False) + + # Check if tokenized prompt length matches the requested prompt length + tokenized_length = input_ids.shape[-1] + if tokenized_length > requested_length: + # Shorten the inputs from (batch_size, tokenized_length) to (batch_size, requested_length) + input_ids = input_ids[:, :requested_length] + attention_mask = attention_mask[:, :requested_length] + position_ids = get_position_ids(attention_mask, use_past_kv=False) + elif tokenized_length < requested_length: + # Lengthen the inputs from (batch_size, tokenized_length) to (batch_size, requested_length) + input_ids_first_col = input_ids[:, 0].unsqueeze(0).T + attention_mask_first_col = attention_mask[:, 0].unsqueeze(0).T + for _ in range(requested_length - tokenized_length): + input_ids = torch.hstack((input_ids_first_col, input_ids)) + attention_mask = 
torch.hstack((attention_mask_first_col, attention_mask)) + position_ids = get_position_ids(attention_mask, use_past_kv=False) + + tokenized_length = input_ids.shape[-1] + assert tokenized_length == requested_length + + # Create inputs + inputs = { + "input_ids": input_ids.contiguous() if engine == "ort" else input_ids, + "attention_mask": attention_mask.contiguous() if engine == "ort" else attention_mask, + "position_ids": position_ids.contiguous() if engine == "ort" else position_ids, + } + if engine != "ort": + inputs["past_key_values"] = [] + + # Get shape of KV cache inputs + batch_size, sequence_length = input_ids.shape + max_sequence_length = config.max_position_embeddings + num_heads = config.num_key_value_heads + head_size = config.head_dim if hasattr(config, "head_dim") else config.hidden_size // config.num_attention_heads + + # Create KV cache inputs + for i in range(config.num_hidden_layers): + past_key = torch.zeros( + batch_size, + num_heads, + max_sequence_length if use_buffer_share else 0, + head_size, + device=device, + dtype=torch_dtype, + ) + past_value = torch.zeros( + batch_size, + num_heads, + max_sequence_length if use_buffer_share else 0, + head_size, + device=device, + dtype=torch_dtype, + ) + if engine == "ort": + inputs.update( + { + f"past_key_values.{i}.key": past_key.contiguous(), + f"past_key_values.{i}.value": past_value.contiguous(), + } + ) + else: + inputs["past_key_values"].append((past_key, past_value)) + + outputs = None + if engine == "ort": + # Create outputs + logits = torch.zeros(batch_size, sequence_length, config.vocab_size, device=device, dtype=torch_dtype) + outputs = {"logits": logits.contiguous()} + if not use_buffer_share: + for i in range(config.num_hidden_layers): + present_key = torch.zeros( + batch_size, num_heads, sequence_length, head_size, device=device, dtype=torch_dtype + ) + present_value = torch.zeros( + batch_size, num_heads, sequence_length, head_size, device=device, dtype=torch_dtype + ) + 
outputs.update( + {f"present.{i}.key": present_key.contiguous(), f"present.{i}.value": present_value.contiguous()} + ) + + return inputs, outputs diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py index f41a90208c51b..9cbc9af7fe9b5 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_parity.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_parity.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- from __future__ import annotations import argparse @@ -10,7 +15,7 @@ from benchmark_helper import setup_logger from dist_settings import get_rank, get_size from llama_inputs import ( - add_io_bindings, + add_io_bindings_as_ortvalues, convert_inputs_for_ort, get_merged_sample_with_past_kv_inputs, get_sample_inputs, @@ -123,7 +128,7 @@ def verify_parity( # Add IO bindings for non-CPU execution providers if args.execution_provider != "cpu": - io_binding, kv_cache_ortvalues = add_io_bindings( + io_binding, kv_cache_ortvalues = add_io_bindings_as_ortvalues( ort_model, inputs, args.execution_provider, diff --git a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py index 89b459c80beec..d570e2d7ee086 100644 --- a/onnxruntime/python/tools/transformers/models/llama/llama_torch.py +++ b/onnxruntime/python/tools/transformers/models/llama/llama_torch.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. 
See License.txt in the project root for +# license information. +# -------------------------------------------------------------------------- import logging import os diff --git a/onnxruntime/python/tools/transformers/models/llama/prompts.json b/onnxruntime/python/tools/transformers/models/llama/prompts.json new file mode 100644 index 0000000000000..5d8fae99dbc7e --- /dev/null +++ b/onnxruntime/python/tools/transformers/models/llama/prompts.json @@ -0,0 +1,11 @@ +{ + "16": "How are astronauts launched into space quickly on those rockets? ", + "64": "Today, we will learn how to bake a chocolate cake. First, you need to have all of the ingredients to bake. Otherwise, the chocolate cake won't be tasty. You will also need a large baking pan to hold the batter. ", + "256": "Risk Management and Insurance (RMI) is a field that focuses on the identification, assessment and financial mitigation of risk. It's about insurance but also more than that. For example, insurance companies look at risk factors such as age, gender and medical history to determine how much they will charge for life insurance coverage. However, RMI is not just about buying insurance (although it is a big part of this). It is also about taking steps to reduce the likelihood that something bad happens in the first place. For example, you may think twice before crossing a busy road if there's a high risk of being hit by a car or getting injured. In addition to insurance companies and financial services firms, RMI professionals work with individuals (customers), businesses and other entities (clients). Their job is to identify potential risks and help mitigate them before they become problems for their clients. This can include helping people prepare financially for unexpected events like losing a job or being injured in an accident, as well as assisting businesses with managing risk exposure from things like natural disasters or cyber attacks. 
Insurance companies use RMI to ", + "1024": "Risk Management and Insurance (RMI) is a field that focuses on the identification, assessment and financial mitigation of risk. It's about insurance but also more than that. For example, insurance companies look at risk factors such as age, gender and medical history to determine how much they will charge for life insurance coverage. However, RMI is not just about buying insurance (although it is a big part of this). It is also about taking steps to reduce the likelihood that something bad happens in the first place. For example, you may think twice before crossing a busy road if there's a high risk of being hit by a car or getting injured. In addition to insurance companies and financial services firms, RMI professionals work with individuals (customers), businesses and other entities (clients). Their job is to identify potential risks and help mitigate them before they become problems for their clients. This can include helping people prepare financially for unexpected events like losing a job or being injured in an accident, as well as assisting businesses with managing risk exposure from things like natural disasters or cyber attacks. Insurance companies use RMI to assess the level of risk associated with potential customers and determine how much they should charge them for coverage. For example, if you are a healthy 25-year old male who doesn't smoke and has never been in an accident, your insurance premiums will likely be lower than those of someone else who fits into one or more of these categories (or all three). Risk Management & Insurance is the process by which you can protect yourself from financial loss. It's about taking control of your money and making sure that it's safe, secure and accessible to you when you need it most. 
The first step in risk management is understanding what risks are important to you as an individual or a family member who may depend on the income generated by these investments for their livelihood. Once you have identified these key risk factors, then we can help identify how best to manage them through various strategies such as setting up automatic payments into savings accounts so that money is always available when needed most; setting aside emergency funds in case something unexpected happens (e.g., illness); investing wisely so that returns outpace inflation over time; diversifying portfolios by adding stocks and bonds which will help reduce volatility while still providing growth potential through dividends/interest payments over longer periods of time than if invested solely into one type of asset class alone etc. The field of risk management and insurance is growing rapidly, as more people become aware of the potential dangers that can arise from an unforeseen event or accident. As a result, there are many different careers within this field that you may want to consider if you're interested in working with risks and helping others protect themselves from them.One common career path in risk management is as an insurance agent/broker. This person would work for an insurance company or brokerage firm, selling policies to clients who need coverage against things like car accidents or home damage caused by natural disasters such as fires or floods. Insurance agents typically work on commission (i.e., they receive a percentage of every sale). This is important because it means that the more successful an agent is at selling policies, the higher his/her income will be. Another career option within risk management is working for an insurance company itself rather than as an external broker or salesperson. 
In this case, you'd help manage claims made by policyholders who have been injured through no fault of their own (for example after being hit by another driver). You can also work in risk analysis, a field that involves analyzing the potential risks associated with various investments and projects. This is done to determine whether or not an opportunity has enough upside to justify taking on any related risks. In addition, you might also be responsible for developing strategies to minimize those risks so they don't result in big losses if something goes wrong down the road. If your goal is to work as a broker or agent, then there are some prerequisites that will need to be met before beginning this career path: You must have an associate's degree from an accredited college; pass an exam administered by state regulators (the Series 6) and/or complete additional training offered by professional organizations such as NAFA, which stands for National Association of Financial Advisors. After meeting these requirements, you'll then need to find employment at one or more insurance companies where they offer positions that allow new hires some flexibility when starting out their careers.Risk management and insurance is a broad field that includes many different types of jobs. ", + "2048": "Artificial Intelligence (AI) is a transformative technology that has the potential to revolutionize society in many ways. AI can be used to enhance the accuracy and efficiency of decision-making, improve lives through new apps and services, and solve some of the thorny policy problems of climate change, infrastructure, and healthcare. In this essay, I will discuss some of the ways AI can benefit society. One of the most significant benefits of AI is its ability to improve healthcare. AI can assist doctors, nurses, and other healthcare professionals in making better diagnoses and faster decisions on a course of treatment, based on the large amount of data that currently exists. 
AI allows doctors to pinpoint effective drugs that may have otherwise been overlooked and can identify higher-risk individuals before any human can. AI can also help relieve the burden on healthcare professionals by taking care of routine data collection and filing, freeing up time for other higher-value activities. Another area where AI can benefit society is in the fight against climate change. AI can be used to analyze vast amounts of data, identify patterns, and provide accurate predictions. It can help us forecast what further spread of pandemics is going to look like, and track their development around the world. AI can also help us predict the impact of climate change on our planet and develop strategies to mitigate its effects. For example, AI can be used to optimize energy consumption, reduce waste, and improve the efficiency of transportation systems. AI can also benefit society by improving education. AI-powered educational tools can help students learn more effectively by providing personalized learning experiences tailored to their individual needs. AI can also help teachers by automating routine tasks such as grading and providing feedback on student work. This can free up time for teachers to focus on more important tasks such as lesson planning and student engagement. AI can also benefit society by improving public safety. AI-powered surveillance systems can help law enforcement agencies detect and prevent crime more effectively. AI can also be used to analyze social media data to identify potential threats and prevent them before they occur. For example, AI can be used to detect hate speech and other forms of online harassment, which can help prevent cyberbullying and other forms of online abuse. Finally, AI can benefit society by improving the economy. AI can help businesses become more efficient by automating routine tasks and providing insights into customer behavior. This can help businesses make better decisions and improve their bottom line. 
AI can also help create new jobs by enabling the development of new products and services that were previously impossible. In conclusion, AI has the potential to benefit society in many ways. From improving healthcare and education to fighting climate change and improving public safety, AI can help us solve some of the most pressing problems facing our world today. As we continue to develop and refine this transformative technology, it is important that we do so in an ethical and responsible manner, ensuring that the benefits of AI are shared by all members of society. AI has been a topic of discussion for many years, and while it has brought many benefits to society, there are also concerns about its impact. In this essay, I will discuss some of the reasons why AI may not help society. Firstly, AI can be biased. AI systems are designed by humans, and they can be infused with the biases of their creators. This can lead to discrimination against certain groups of people and can perpetuate existing inequalities in society. Additionally, AI can lack transparency, making it difficult to understand how decisions are being made. This can lead to mistrust of AI systems and can hinder their adoption. Secondly, AI can be used to automate jobs, which can lead to unemployment. While AI can increase productivity and efficiency, it can also lead to job displacement, particularly in industries that rely heavily on manual labor. This can have a negative impact on individuals and communities, particularly those that are already marginalized. Thirdly, AI can be used to create fake content, such as deepfakes, which can be used to spread misinformation and propaganda. This can have serious consequences for democracy and can undermine trust in institutions. Fourthly, AI can be used to create autonomous weapons, which can have devastating consequences. These weapons can make decisions without human intervention, which can lead to unintended consequences and can be difficult to control. 
Fifthly, AI can be used to create surveillance systems that infringe on privacy rights. These systems can be used to monitor individuals without their knowledge or consent, which can have serious consequences for civil liberties. In conclusion, while AI has many potential benefits, there are also concerns about its impact on society. It is important to consider these concerns and to ensure that AI is developed and used in a responsible and ethical manner. Within AI, there are also many subfields. Reinforcement learning is a type of machine learning algorithm that focuses on training models to make decisions in an environment in order to maximize a reward. This is typically done through trial and error, as the algorithm receives feedback in the form of rewards or punishments for its actions. Reinforcement learning has many potential benefits for society, some of which are discussed below. Firstly, reinforcement learning can be used to improve industrial automation and robotics. By training robots to learn from their own experiences, they can gain the skills necessary to perform complex tasks without human intervention. This can lead to increased efficiency and productivity in industries such as manufacturing and logistics. Secondly, reinforcement learning can be used to optimize traffic control systems. By training models to make real-time decisions based on traffic patterns and other data, traffic flow can be improved, reducing congestion and travel times. Thirdly, reinforcement learning can be used to improve healthcare. By training models to make decisions based on patient data, doctors can make more accurate diagnoses and develop more effective treatment plans. This can lead to better health outcomes for patients and can reduce healthcare costs. Fourthly, reinforcement learning can be used to improve education. 
By training models to adapt to individual student needs, personalized learning experiences can be created that are tailored to each student\u2019s strengths and weaknesses. This can lead to improved academic performance and can help to close the achievement gap. Finally, reinforcement learning can be used to improve environmental sustainability. By training models to make decisions based on environmental data, such as weather patterns and pollution levels, more effective policies can be developed to reduce carbon emissions and protect natural resources. In conclusion, reinforcement learning has many potential benefits for society. By training models to make decisions based on feedback from their environment, we can create more efficient and effective systems in a wide range of fields. However, it is important to consider the ethical implications of these technologies and to ensure that they are developed and used in a responsible and ethical manner. Multi-modal models are another type of machine learning that can process and find relationships between different types of data, such as images, video, audio, and text. They have the potential to revolutionize many aspects of our lives, from healthcare to transportation to education. In this essay, I will discuss how multi-modal models can help society in various ways. One of the most significant benefits of multi-modal models is their ability to transform unstructured data into structured data that can be analyzed. For example, a company could use a multi-modal model to extract data from images or PDFs of invoices or receipts. This would enable them to analyze the data more efficiently and make better-informed decisions. Another benefit of multi-modal models is their ability to cater to various learning styles. Blended and multi-modal learning can reach people who benefit from different learning styles. 
By understanding their individual learning styles, employees can leverage resources that are compatible with how they process information most effectively. Multi-modal models can also help improve healthcare. For example, they can be used to analyze medical images and identify patterns that might be difficult for human doctors to detect. This can lead to earlier diagnoses and more effective treatments. In addition, multi-modal models can help improve transportation. For example, they can be used to analyze traffic patterns and optimize traffic flow. This can help reduce congestion and improve safety on the roads. Finally, multi-modal models can help improve education. For example, they can be used to create personalized learning experiences for students based on their individual learning styles. This can help students learn more effectively and efficiently. In conclusion, multi-modal models have the potential to help society in many ways. They can transform unstructured data into structured data, cater to various learning styles, improve healthcare, transportation, and education. However, like any new technology, it is important to approach it with caution and consider the potential risks and benefits. I hope this essay has provided some insight into the potential benefits of multi-modal models. Throughout this essay, I have demonstrated the numerous benefits that artificial intelligence will bring to our society. I have also shown some examples of various categories within artificial intelligence that have varying purposes. It is important to consider that each category has its own purpose and has its own pros and cons to it. In conclusion, we must use AI responsibly. ", + "3840": "Artificial Intelligence (AI) is a transformative technology that has the potential to revolutionize society in many ways. 
AI can be used to enhance the accuracy and efficiency of decision-making, improve lives through new apps and services, and solve some of the thorny policy problems of climate change, infrastructure, and healthcare. In this essay, I will discuss some of the ways AI can benefit society. One of the most significant benefits of AI is its ability to improve healthcare. AI can assist doctors, nurses, and other healthcare professionals in making better diagnoses and faster decisions on a course of treatment, based on the large amount of data that currently exists. AI allows doctors to pinpoint effective drugs that may have otherwise been overlooked and can identify higher-risk individuals before any human can. AI can also help relieve the burden on healthcare professionals by taking care of routine data collection and filing, freeing up time for other higher-value activities. Another area where AI can benefit society is in the fight against climate change. AI can be used to analyze vast amounts of data, identify patterns, and provide accurate predictions. It can help us forecast what further spread of pandemics is going to look like, and track their development around the world. AI can also help us predict the impact of climate change on our planet and develop strategies to mitigate its effects. For example, AI can be used to optimize energy consumption, reduce waste, and improve the efficiency of transportation systems. AI can also benefit society by improving education. AI-powered educational tools can help students learn more effectively by providing personalized learning experiences tailored to their individual needs. AI can also help teachers by automating routine tasks such as grading and providing feedback on student work. This can free up time for teachers to focus on more important tasks such as lesson planning and student engagement. AI can also benefit society by improving public safety. 
AI-powered surveillance systems can help law enforcement agencies detect and prevent crime more effectively. AI can also be used to analyze social media data to identify potential threats and prevent them before they occur. For example, AI can be used to detect hate speech and other forms of online harassment, which can help prevent cyberbullying and other forms of online abuse. Finally, AI can benefit society by improving the economy. AI can help businesses become more efficient by automating routine tasks and providing insights into customer behavior. This can help businesses make better decisions and improve their bottom line. AI can also help create new jobs by enabling the development of new products and services that were previously impossible. In conclusion, AI has the potential to benefit society in many ways. From improving healthcare and education to fighting climate change and improving public safety, AI can help us solve some of the most pressing problems facing our world today. As we continue to develop and refine this transformative technology, it is important that we do so in an ethical and responsible manner, ensuring that the benefits of AI are shared by all members of society. AI has been a topic of discussion for many years, and while it has brought many benefits to society, there are also concerns about its impact. In this essay, I will discuss some of the reasons why AI may not help society. Firstly, AI can be biased. AI systems are designed by humans, and they can be infused with the biases of their creators. This can lead to discrimination against certain groups of people and can perpetuate existing inequalities in society. Additionally, AI can lack transparency, making it difficult to understand how decisions are being made. This can lead to mistrust of AI systems and can hinder their adoption. Secondly, AI can be used to automate jobs, which can lead to unemployment. 
While AI can increase productivity and efficiency, it can also lead to job displacement, particularly in industries that rely heavily on manual labor. This can have a negative impact on individuals and communities, particularly those that are already marginalized. Thirdly, AI can be used to create fake content, such as deepfakes, which can be used to spread misinformation and propaganda. This can have serious consequences for democracy and can undermine trust in institutions. Fourthly, AI can be used to create autonomous weapons, which can have devastating consequences. These weapons can make decisions without human intervention, which can lead to unintended consequences and can be difficult to control. Fifthly, AI can be used to create surveillance systems that infringe on privacy rights. These systems can be used to monitor individuals without their knowledge or consent, which can have serious consequences for civil liberties. In conclusion, while AI has many potential benefits, there are also concerns about its impact on society. It is important to consider these concerns and to ensure that AI is developed and used in a responsible and ethical manner. Within AI, there are also many subfields. Reinforcement learning is a type of machine learning algorithm that focuses on training models to make decisions in an environment in order to maximize a reward. This is typically done through trial and error, as the algorithm receives feedback in the form of rewards or punishments for its actions. Reinforcement learning has many potential benefits for society, some of which are discussed below. Firstly, reinforcement learning can be used to improve industrial automation and robotics. By training robots to learn from their own experiences, they can gain the skills necessary to perform complex tasks without human intervention. This can lead to increased efficiency and productivity in industries such as manufacturing and logistics. 
Secondly, reinforcement learning can be used to optimize traffic control systems. By training models to make real-time decisions based on traffic patterns and other data, traffic flow can be improved, reducing congestion and travel times. Thirdly, reinforcement learning can be used to improve healthcare. By training models to make decisions based on patient data, doctors can make more accurate diagnoses and develop more effective treatment plans. This can lead to better health outcomes for patients and can reduce healthcare costs. Fourthly, reinforcement learning can be used to improve education. By training models to adapt to individual student needs, personalized learning experiences can be created that are tailored to each student\u2019s strengths and weaknesses. This can lead to improved academic performance and can help to close the achievement gap. Finally, reinforcement learning can be used to improve environmental sustainability. By training models to make decisions based on environmental data, such as weather patterns and pollution levels, more effective policies can be developed to reduce carbon emissions and protect natural resources. In conclusion, reinforcement learning has many potential benefits for society. By training models to make decisions based on feedback from their environment, we can create more efficient and effective systems in a wide range of fields. However, it is important to consider the ethical implications of these technologies and to ensure that they are developed and used in a responsible and ethical manner. Multi-modal models are another type of machine learning that can process and find relationships between different types of data, such as images, video, audio, and text. They have the potential to revolutionize many aspects of our lives, from healthcare to transportation to education. In this essay, I will discuss how multi-modal models can help society in various ways. 
One of the most significant benefits of multi-modal models is their ability to transform unstructured data into structured data that can be analyzed. For example, a company could use a multi-modal model to extract data from images or PDFs of invoices or receipts. This would enable them to analyze the data more efficiently and make better-informed decisions. Another benefit of multi-modal models is their ability to cater to various learning styles. Blended and multi-modal learning can reach people who benefit from different learning styles. By understanding their individual learning styles, employees can leverage resources that are compatible with how they process information most effectively. Multi-modal models can also help improve healthcare. For example, they can be used to analyze medical images and identify patterns that might be difficult for human doctors to detect. This can lead to earlier diagnoses and more effective treatments. In addition, multi-modal models can help improve transportation. For example, they can be used to analyze traffic patterns and optimize traffic flow. This can help reduce congestion and improve safety on the roads. Finally, multi-modal models can help improve education. For example, they can be used to create personalized learning experiences for students based on their individual learning styles. This can help students learn more effectively and efficiently. In conclusion, multi-modal models have the potential to help society in many ways. They can transform unstructured data into structured data, cater to various learning styles, improve healthcare, transportation, and education. However, like any new technology, it is important to approach it with caution and consider the potential risks and benefits. I hope this essay has provided some insight into the potential benefits of multi-modal models. Semi-supervised learning is a type of machine learning that falls in between supervised and unsupervised learning. 
It is a method that uses a small amount of labeled data and a large amount of unlabeled data to train a model. The goal of semi-supervised learning is to learn a function that can accurately predict the output variable based on the input variables, similar to supervised learning. However, unlike supervised learning, the algorithm is trained on a dataset that contains both labeled and unlabeled data. Semi-supervised learning is particularly useful when there is a large amount of unlabeled data available, but it\u2019s too expensive or difficult to label all of it. The primary advantage of semi-supervised learning is that it can reduce the amount of annotated data used. This is particularly useful when labeled data is scarce or expensive to obtain. By using a small amount of labeled data and a large amount of unlabeled data, semi-supervised learning algorithms can learn from both types of data and improve their accuracy. Semi-supervised learning algorithms are also capable of consolidating overfitting tendencies, which is a common problem in supervised learning. Another advantage of semi-supervised learning is that it is versatile. It can be applied in various situations, from image recognition to crawlers. For example, in text classification, the goal is to classify a given text into one or more predefined categories. Semi-supervised learning can be used to train a text classification model using a small amount of labeled data and a large amount of unlabeled text data. In image classification, the goal is to classify a given image into one or more predefined categories. Semi-supervised learning can be used to train an image classification model using a small amount of labeled data and a large amount of unlabeled image data. In anomaly detection, the goal is to detect patterns or observations that are unusual or different from the norm. Semi-supervised learning can be used to detect anomalies using a small amount of labeled data and a large amount of unlabeled data. 
Semi-supervised learning algorithms are also stable and simple. They have high efficiency and can be used to improve the performance and generalization of models. However, semi-supervised learning algorithms also have some disadvantages. One of the main disadvantages is that they require a large amount of unlabeled data to be effective. If there is not enough unlabeled data available, the algorithm may not be able to learn effectively. Additionally, semi-supervised learning algorithms can be sensitive to the quality of the labeled data. If the labeled data is noisy or incorrect, the algorithm may not be able to learn effectively. In conclusion, semi-supervised learning is a powerful tool that can be used to improve the accuracy and generalization of machine learning models. It is particularly useful when labeled data is scarce or expensive to obtain. Semi-supervised learning algorithms can learn from both labeled and unlabeled data, which makes them versatile and capable of consolidating overfitting tendencies. However, semi-supervised learning algorithms also have some disadvantages, such as requiring a large amount of unlabeled data to be effective and being sensitive to the quality of the labeled data. Despite these disadvantages, semi-supervised learning is a valuable technique that can be used to improve the performance of machine learning models. Supervised learning is a type of machine learning that involves training a model on labeled data. The goal of supervised learning is to learn a function that can accurately predict the output variable based on the input variables. Supervised learning is widely used in various fields, including image recognition, speech recognition, natural language processing, and more. One of the primary advantages of supervised learning is that it allows for accurate predictions. Supervised learning models can provide highly accurate predictions or classifications when trained on a diverse and representative dataset. 
This makes supervised learning particularly useful in situations where accuracy is critical, such as in medical diagnosis or fraud detection. Another advantage of supervised learning is that it is easy to understand and implement. Supervised learning algorithms are relatively simple and can be implemented using a variety of programming languages and libraries. This makes it accessible to a wide range of developers and data scientists. Supervised learning is also versatile. It can be applied to a wide range of problem domains, making it a flexible approach for various industries and applications. For example, in image classification, the goal is to classify a given image into one or more predefined categories. Supervised learning can be used to train an image classification model using a labeled dataset of images and their corresponding categories. In speech recognition, the goal is to transcribe spoken words into text. Supervised learning can be used to train a speech recognition model using a labeled dataset of audio recordings and their corresponding transcriptions. Supervised learning algorithms are also capable of handling missing data. If there is missing data in the labeled dataset, supervised learning algorithms can still learn from the available data and make accurate predictions. This is particularly useful in situations where data is incomplete or noisy. However, supervised learning algorithms also have some disadvantages. One of the main disadvantages is that they require a large amount of labeled data to be effective. If there is not enough labeled data available, the algorithm may not be able to learn effectively. Additionally, supervised learning algorithms can be sensitive to the quality of the labeled data. If the labeled data is noisy or incorrect, the algorithm may not be able to learn effectively. In conclusion, supervised learning is a powerful tool that can be used to make accurate predictions and classifications. 
It is easy to understand and implement, and it is versatile enough to be applied to a wide range of problem domains. However, supervised learning algorithms also have some disadvantages, such as requiring a large amount of labeled data to be effective and being sensitive to the quality of the labeled data. Despite these disadvantages, supervised learning is a valuable technique that can be used to improve the performance of machine learning models. Unsupervised learning is a type of machine learning that involves training a model on unlabeled data. The goal of unsupervised learning is to learn the underlying structure of the data, without any prior knowledge of the output variable. Unsupervised learning is widely used in various fields, including image recognition, natural language processing, and more. One of the primary advantages of unsupervised learning is that it can handle large amounts of unlabeled and unstructured data. This makes unsupervised learning particularly useful in situations where labeled data is scarce or expensive to obtain. By using unsupervised learning algorithms, we can learn from the available data and make accurate predictions. Another advantage of unsupervised learning is that it can identify previously undetected patterns in data. Unsupervised learning algorithms can be used to cluster data points into groups based on their similarities. This can be useful in various applications, such as customer segmentation, anomaly detection, and more. Unsupervised learning algorithms are also capable of dimensionality reduction. This is particularly useful when dealing with high-dimensional data, such as images or text. By reducing the dimensionality of the data, unsupervised learning algorithms can improve the efficiency and accuracy of the model. Unsupervised learning algorithms are also capable of feature learning. Feature learning is the process of automatically learning features from the input data. 
This can be useful in various applications, such as image recognition, where the algorithm can learn features such as edges, corners, and more. However, unsupervised learning algorithms also have some disadvantages. One of the main disadvantages is that they require a large amount of unlabeled data to be effective. If there is not enough unlabeled data available, the algorithm may not be able to learn effectively. Additionally, unsupervised learning algorithms can be sensitive to the quality of the data. If the data is noisy or incorrect, the algorithm may not be able to learn effectively. As you can see, artificial intelligence (AI) is a wide-ranging field that encompasses various sub-fields. Some of the sub-fields that we have previously discussed include reinforcement learning, multi-modal learning, semi-supervised learning, supervised learning, unsupervised learning, and much more. There are also many application domains for artificial intelligence (AI) that can utilize it. Throughout this essay, I have demonstrated the numerous benefits that artificial intelligence (AI) will bring to our society. I have also shown some examples of various categories within artificial intelligence that have varying purposes. It is important to consider that each category has its own purpose and has its own pros and cons to it. What do you think artificial intelligence will bring to our society? Will it be used in a responsible manner? ", + "4096": "In the heart of Eldoria, where ancient forests whispered secrets and rivers sang forgotten melodies, lay the Enchanted Labyrinth. Its walls, adorned with shimmering runes, concealed a portal to realms unknown. Few dared to venture inside, for the labyrinth was said to twist time and reality. Evelyn, a curious young mage, stood before the labyrinth's entrance. Her emerald eyes sparkled with determination. She clutched a cracked map, its ink fading like memories lost to the wind. 
Legends spoke of a treasure hidden deep within - a relic capable of granting any wish. As Evelyn stepped across the threshold, the air thickened. The walls shifted, rearranging themselves. She followed the faint glow of her lantern, each step echoing through eternity. Shadows danced, whispering forgotten names. Was this a dream or a nightmare? Deeper into the labyrinth, Evelyn encountered Aelar, the Guardian of Time. His silver hair flowed like moonlight, and his eyes held the weight of centuries. Aelar barred her path, his staff crackling with energy. 'Seeker,' he intoned, 'answer my riddle, and the way shall open.' Evelyn's heart raced. 'Ask, Guardian.' 'What has roots as old as time, yet dances with the wind?' She pondered, memories of her grandmother's tales flooding her mind. 'A tree,' she replied. Aelar smiled, and the walls shifted once more. 'Proceed, Seeker.' The labyrinth twisted, revealing a moonlit grove. Trees hummed ancient lullabies, and fireflies wove constellations in the air. At the center stood a weeping willow, its branches brushing the ground like a grieving widow's veil. Evelyn approached, her fingers tracing the bark. 'Why do you weep?' The willow's voice, soft as falling petals, answered, 'I guard the Tear of Eternity.' Evelyn's breath caught. The Tear - a gem said to hold memories of lost civilizations. She plucked it from a low branch, its facets reflecting forgotten faces. As Evelyn pressed onward, the labyrinth tightened its grip. She faced illusions - lovers lost, friends betrayed. Doubt gnawed at her resolve. Was the treasure worth the cost? At the labyrinth's heart, she found a mirror. Her reflection wavered, revealing her deepest desire: her sister, Lysandra, who vanished years ago. Tears blurred the glass. 'Speak your wish,' the mirror whispered. Evelyn's voice trembled. 'Bring Lysandra back.' The mirror shattered, and reality fractured. Lysandra stepped through, eyes wide with wonder. 'Evelyn?' 
Lysandra's return came at a cost - the labyrinth demanded balance. For every wish granted, a memory faded. Evelyn watched as her childhood laughter dissolved like mist. Together, they exited the labyrinth, the Tear pulsing in Evelyn's palm. She gazed at her sister, both joy and sorrow in her eyes. 'Was it worth it?' Lysandra asked. Evelyn smiled. 'In Eldoria, every choice we make becomes a story. And ours, dear sister, is woven in stardust and sacrifice.' And so, the Enchanted Labyrinth whispered its final secret: Wishes are threads, and memories their loom. In the land of Aetherfall, where mist-clad mountains touched the heavens and rivers whispered forgotten spells, a prophecy echoed through time. It spoke of the Starstone, a gem said to hold the universe's secrets - the key to creation and destruction. Eldric, a humble blacksmith with eyes like storm clouds, stumbled upon an ancient map. Its ink had faded, but the constellations remained. Guided by fate, he set forth, leaving his forge behind. Eldric's journey led him to the Whispering Forest, where trees conversed in hushed tones. Their leaves whispered of hidden paths and treacherous guardians. Eldric's heart pounded as he stepped into the shadows. There, he met Lyria, a forest nymph with silver hair and eyes like moonlit pools. She guarded the first clue - a riddle etched into a petal: 'In the heart of the forest, where time bends, seek the Wellspring of Echoes. There, the Starstone awaits.' Eldric followed Lyria's guidance. The Wellspring lay within a moon-kissed glade. Its waters shimmered, reflecting memories of lost lovers, ancient battles, and forgotten oaths. Eldric dipped his hand, and the riddle unfolded: 'To find the Starstone, seek the Three Keys: the tear of a fallen star, the breath of a dragon, and the song of a forgotten bard.' Eldric climbed the Stardust Peaks, where fallen stars lay embedded in the rock. Each tear held a fragment of cosmic sorrow. 
He found one - a sapphire gem pulsing with celestial fire. But it was guarded by Drakor, the last of the star dragons. Drakor's scales shimmered like galaxies. His eyes held eons of wisdom. 'Why seek the Tear, mortal?' 'To save Aetherfall,' Eldric replied. 'To restore balance.' Drakor nodded, and with a breath, he shattered the gem. Eldric caught the falling tear - a shard of eternity. Next, Eldric sailed to the Isle of Shadows, where the void whispered secrets. There, he faced Nyxia, the ancient shadow dragon. Her wings spanned continents, and her breath could devour stars. 'Why seek my breath?' Nyxia hissed. 'To awaken the Starstone,' Eldric said. 'To mend the rifts.' Nyxia's eyes glowed. She exhaled - a stream of darkness. Eldric captured it in a crystal vial - the Breath of the Void. The final key lay in the Bard's Hollow, where echoes of lost melodies lingered. Eldric met Silvan, a ghostly minstrel who strummed a lute of moonwood. 'Sing,' Silvan urged. 'The Song of the Forgotten.' Eldric sang of battles, love, and sacrifice. The hollow trembled, and from the mist, a spectral harp appeared. Its strings hummed - the Song of Ages. Eldric plucked the notes, and they merged into a silver key - the Song of the Forgotten. At the Nexus of Worlds, Eldric assembled the keys - the Tear, the Breath, and the Song. The ground quaked, and the Starstone emerged - a gem of cosmic hues. Its light wove reality, mending fractures in Aetherfall. But the prophecy held a twist: the Starstone demanded a choice. Eldric could use it to reshape the world or sacrifice it to heal the void. He gazed at Lyria, Drakor, Nyxia, and Silvan - their fates intertwined. With a heavy heart, he whispered, 'Balance.' And so, the Starstone shattered, its fragments seeding new constellations. Eldric returned to his forge, but his hammer now shaped more than iron - it forged destiny. Lyria, the Forest Nymph Lyria, with her silver hair and eyes like moonlit pools, remained in the Whispering Forest. 
She became its guardian, weaving spells to protect the ancient trees. Her laughter echoed through the glades, and travelers whispered of a nymph who danced with moonbeams. Lyria's heart held a secret - the memory of Eldric's touch, the warmth of their shared quest. She tended to the Wellspring of Echoes, ensuring its waters flowed through time, carrying whispers of forgotten tales. Drakor, the Last Star Dragon Drakor, the last of the star dragons, retreated to the highest peak of the Stardust Peaks. There, he curled his immense form around the shattered Tear of the Fallen. His scales absorbed its cosmic fire, and he became a living constellation - a beacon for lost souls. Drakor's breath no longer consumed stars; instead, it birthed new constellations. Travelers gazed at the night sky, seeking guidance in his patterns. Drakor's eyes held both sorrow and hope, for he knew that balance required sacrifice. Nyxia, the Ancient Shadow Dragon Nyxia, with wings spanning continents, chose a different path. She descended to the Isle of Shadows, where the void whispered secrets. There, she guarded the Abyss of Remembrance - a rift between worlds. Nyxia's breath no longer devoured stars; it sealed the rifts. She became a bridge, allowing souls to traverse realms. Those who sought lost loved ones or glimpses of forgotten memories found solace in her shadowed embrace. Nyxia's eyes held the weight of choices made and unmade, and she vowed to keep the balance intact. Silvan, the Ghostly Minstrel Silvan, the spectral minstrel, wandered the Bard's Hollow. His lute of moonwood sang melodies of love, loss, and courage. Silvan's song echoed through time, touching hearts across Aetherfall. He became the keeper of memories - the forgotten bard who whispered forgotten names. When travelers stumbled upon the hollow, Silvan strummed his lute, and their own stories surfaced. He wove their experiences into the Song of Ages, ensuring that no tale would fade into oblivion. 
Silvan's translucent form danced in moonlight, a bridge between the living and the departed. Eldric, the Blacksmith As for Eldric, the humble blacksmith, he returned to his forge in the village of Hearthstone. His hammer now shaped more than iron - it forged destiny. Eldric crafted talismans from the Tear of the Fallen, the Breath of the Void, and the Song of the Forgotten. These talismans healed rifts, mended broken hearts, and ignited hope. Eldric's eyes held the wisdom of realms explored, and he knew that Aetherfall's balance rested on the choices of ordinary souls. He continued to tell the tale of the Starstone, passing it down through generations, ensuring that the magic endured. And so, dear reader, the threads of fate intertwined - a forest nymph, a star dragon, a shadow, and a minstrel - all bound by the echoes of a forgotten song. The Chronicles of the Celestial Weaver In the forgotten village of Astralis, where the night sky wept silver tears, lived a young girl named Elara. Her eyes held the secrets of constellations, and her fingers danced like stardust. But Astralis suffered - a curse had befallen the heavens. The stars dimmed, their brilliance fading. Elara's grandmother, Lyris, whispered of an ancient prophecy: 'When the stars falter, seek the Celestial Weaver.' Elara vowed to unravel the mystery and save her village. Guided by Lyris's map, Elara ventured into the Veiled Forest, where moonlight wove through ancient oaks. There, she met Silas, the enigmatic weaver. His loom hummed with cosmic threads - the Loom of Eternity. 'Seek the lost constellations,' Silas said. 'Weave them anew.' Elara's heart raced. She plucked a silver thread - the remnants of Orion - and began to weave. The loom responded, stars rekindling. But the cost was memory - Elara forgot her childhood laughter. Elara's journey spanned realms: The Nebula Caves: She retrieved the Pleiades, their sisterhood echoing through time. 
The Comet's Trail: She chased Halley's Comet, capturing its fiery tail. The Abyss of Lyra: There, Vega's song echoed - a melody of love and longing. Each constellation restored, Elara's memories faded. She forgot her first kiss, her mother's lullabies. Yet Astralis glimmered - the stars brightened. In the Celestial Citadel, Elara faced Draco, the fallen dragon. His scales bore scars - the price of rebellion. He guarded the final constellation - the Serpent. 'Why weave the stars?' Draco hissed. 'They betrayed me.' Elara's fingers trembled. 'To save my village.' Draco's eyes softened. 'We were once kin. We'll share this memory.' As Elara wove the Serpent, she glimpsed Draco's love for Lyris - their forbidden bond. The constellation blazed, and Elara remembered both love and sacrifice. Back in Astralis, the stars blazed anew. Villagers rejoiced, but Elara's memories were fragile threads. Lyris embraced her. 'You've woven fate,' Lyris said. 'But the Loom demands balance.' Elara faced Silas. 'What price?' He smiled - a constellation of wrinkles. 'Your memories or the stars.' Elara hesitated. She remembered her grandmother's stories, her stolen kisses. She chose the stars. Elara became the new Celestial Weaver. Her memories - her life - wove into the cosmos. Astralis thrived, but Elara forgot her name, her laughter, her love. Lyris whispered, 'Weavers are forgotten, but their constellations endure.' And so, Elara wove - the forgotten girl who stitched eternity. Elara, now the Celestial Weaver, wove constellations with threads of memory. Astralis thrived - the villagers danced under starlit skies, unaware of their forgotten histories. Lyris watched her granddaughter, her eyes both proud and sorrowful. 'Elara,' Lyris whispered, 'the Loom demands more than memories.' Elara's fingers trembled. She glimpsed her own reflection in the cosmic threads - the girl who once dreamed of love and laughter. But now, her past was a constellation of faded stars. 
Silas, the former weaver, lingered in the shadows. His form blurred - a specter between realms. He spoke of the Whispering Veil, a boundary separating memory from oblivion. Beyond it lay forgotten worlds, lost loves, and forbidden truths. 'Cross the Veil,' Silas urged. 'Retrieve what was sacrificed.' Elara hesitated. She yearned for her stolen memories - the taste of strawberries, the warmth of a lover's touch. But the Veil was treacherous - a labyrinth of half-remembered echoes. Elara stepped into the Veil. Its mist clung to her skin, whispering secrets. She glimpsed fragments of her past - a stolen kiss, a tear shed for a fallen friend. The path forked: The Garden of Remembrance: Blooming with forgotten faces, this garden promised reunion. Elara could reclaim her lost memories, but at a cost - the stars would dim once more. The Abyss of Oblivion: A chasm of emptiness. Here, Elara could sever her ties to Astralis, becoming a true Celestial Weaver. The stars would blaze forever, but her existence would be a threadless void. Elara hesitated. She remembered Lyris's lullabies, Silas's enigmatic smile, and Draco's love for her grandmother. She yearned for her stolen laughter - the taste of strawberries, the warmth of a lover's touch. But the stars - Astralis - called to her. The village thrived, its people dancing under constellations she had rekindled. Elara's choice would echo across eternity. She faced the Veil's center - a mirror reflecting her fragmented self. Her fingers trembled. 'Balance,' she whispered. And so, Elara wove anew. She plucked threads from the Garden of Remembrance, reclaiming stolen moments. The stars dimmed, but Astralis glowed with forgotten love. Silas nodded. 'You've chosen well, Weaver.' Elara's memories returned - the taste of strawberries, the warmth of a lover's touch. She kissed Lyris's forehead, whispered Draco's name, and stepped back into Astralis. The stars blazed - the legacy of a girl who stitched eternity. 
Short stories like these are great to listen to and read because they allow us to explore our creative minds and broaden our imaginations. They also inspire us to learn from others and can become culturally impactful. The themes of these stories can also dive deep into philosophical questions and raise awareness for important issues. The plots for these stories are sometimes based on real-life events as well and can have deep emotional impact.", + "7936": "The Effects of Airplanes: A Closer Look Airplanes have revolutionized the way we travel, connect, and explore the world. From short domestic flights to transcontinental journeys, these metal birds have become an integral part of our lives. However, their impact extends beyond convenience and adventure. Let's delve into the effects of airplanes from various angles. Environmental Impact Fuel Consumption and Emissions Airplanes consume vast amounts of fuel during flight. For instance, a Boeing 747, with a gas tank capacity of 63,500 gallons, burns approximately five gallons of jet fuel per mile traveled. On a 4,000-mile flight, this translates to 20,000 gallons of fuel. However, when we consider the number of passengers (around 400), the fuel efficiency per traveler is surprisingly better than that of cars. A Honda Civic, which gets 30 miles per gallon, would need 133 gallons of fuel for the same distance. Even an RV, which moves just seven miles on a gallon of gasoline, would require about 285 gallons per traveler. Greenhouse Gas Emissions Airplanes emit greenhouse gases directly into the upper atmosphere, where they can linger longer and cause more damage than the same gases at lower altitudes. While air travel contributes to climate change, it's essential to recognize that other forms of transportation, such as cars and ships, also emit greenhouse gases. The challenge lies in finding ways to reduce aviation emissions without compromising connectivity and mobility.
Ozone Depletion and Contrails Planes affect the concentration of other gases and pollutants in the atmosphere. They lead to a short-term increase in ozone (O3) but a long-term decrease. Contrails - those white streaks left behind by planes - can contribute to cloud formation and impact local weather patterns. Balancing the benefits of air travel with environmental concerns remains a critical challenge. Human Health Implications Jet Lag and Sleep Disruption Frequent flyers are no strangers to jet lag. Crossing time zones disrupts our circadian rhythms, affecting sleep patterns, mood, and overall well-being. Pilots, flight attendants, and passengers alike experience the effects of rapid travel across time zones. Dehydration and Blood Pressure Changes The low humidity in airplane cabins can lead to dehydration. Additionally, changes in cabin pressure affect blood pressure, especially during takeoff and landing. Staying hydrated and moving around during long flights can mitigate these effects. Risk of Contagious Diseases Airplanes put passengers in close proximity to one another. Recirculated air, shared surfaces, and confined spaces create an environment conducive to the spread of infections. While airlines take precautions, travelers should remain vigilant, especially during flu seasons. The Perspective Shift: Seeing Earth from Above Beyond the environmental and health impacts, airplanes have transformed our worldview. Before the Wright brothers' epochal breakthrough, humans were grounded, limited to terrestrial views. The advent of flight not only boosted our power of movement but also enhanced our vision. From above, we witness the curvature of the Earth, the vastness of oceans, and the intricate patterns of landscapes. Airplanes have made us global citizens, connecting us to distant lands and cultures. In conclusion, airplanes are a double-edged sword. 
They offer unparalleled mobility and exploration but come with environmental consequences and health considerations. As we continue to innovate and improve aviation technology, let's strive for a balance - a world where we soar through the skies while safeguarding our planet and well-being. Economic Impact Air Travel Industry The aviation industry is a significant contributor to the global economy. Airlines, airports, manufacturers, and associated services generate substantial revenue and employment. Air travel facilitates international trade, tourism, and business interactions. However, it also faces challenges such as fuel price fluctuations, competition, and regulatory complexities. Supply Chain and Cargo Transport Airplanes play a crucial role in transporting goods across continents. High-value and time-sensitive cargo, including perishable items, pharmaceuticals, and electronics, rely on air freight. The efficiency of supply chains owes much to the speed and reach of airplanes. Tourism and Local Economies Tourism heavily depends on air travel. Popular destinations thrive due to the influx of visitors arriving by plane. Local economies benefit from tourism-related activities, including hospitality, restaurants, and souvenir shops. Conversely, overreliance on tourism can strain natural resources and cultural heritage. Technological Advancements Aerospace Engineering The development of airplanes has driven advancements in aerospace engineering. Innovations in materials, aerodynamics, and propulsion systems have led to more efficient and safer aircraft. Research in areas like supersonic flight, electric planes, and autonomous drones continues to shape the industry. Navigation and Communication Airplanes rely on sophisticated navigation systems, including GPS, radar, and inertial guidance. These technologies enhance safety, accuracy, and efficiency. Communication networks allow pilots to stay connected with air traffic control, other planes, and ground stations. 
Social and Cultural Effects Global Connectivity Airplanes have transformed our perception of distance. What once took weeks by ship or months by land can now be accomplished in hours. Families separated by oceans reunite, students study abroad, and cultural exchange flourishes. The world feels smaller, and our interconnectedness grows. Iconic Symbols Airplanes evoke a sense of wonder and adventure. The iconic silhouettes of jumbo jets, fighter planes, and vintage biplanes symbolize human achievement and exploration. Airshows, aviation museums, and historical flights celebrate this legacy. Challenges and Future Prospects Sustainability The aviation industry faces the challenge of reducing its environmental impact. Researchers explore alternative fuels, electric propulsion, and lightweight materials. Balancing growth with sustainability remains critical. Airspace Congestion As air travel becomes more accessible, airspace congestion intensifies. Efficient air traffic management, improved routes, and next-generation air traffic control systems are essential to prevent gridlock. Security and Safety Ensuring the safety of passengers, crew, and cargo remains paramount. Rigorous security protocols, maintenance standards, and emergency preparedness are vital. In conclusion, airplanes are more than mere vessels of transportation. They shape economies, connect cultures, and inspire innovation. As we soar into the future, let's navigate the skies responsibly, appreciating both the marvels and challenges of flight. The Effects of Space Travel on the Human Body Space travel, with its awe-inspiring vistas and boundless possibilities, has captivated humanity for decades. However, venturing beyond our home planet comes with a price - a price paid not only in technological challenges but also in the toll it takes on the human body. Let us explore the effects of space travel, from radiation exposure to altered gravity, and how astronauts adapt to these extreme conditions. 
Space Radiation: A Silent Threat Radiation Exposure On Earth, our protective magnetic field and atmosphere shield us from the majority of space radiation. However, in space, astronauts face direct exposure to cosmic rays and solar particles. These high-energy particles can penetrate the body, damaging cells and DNA. Increased risk of cancer and degenerative diseases, such as heart disease and cataracts, has been observed in human populations exposed to radiation on Earth. In space, health risks from radiation are mainly driven by long-term impacts. Altered Gravity: A Weighty Matter Microgravity and Muscle Atrophy Astronauts aboard the International Space Station (ISS) experience microgravity, where their bodies float freely. While this weightlessness allows for breathtaking experiments and observations, it wreaks havoc on muscles and bones. Without the constant pull of gravity, muscles weaken, and bones lose density. Astronauts must engage in rigorous exercise routines to counteract muscle atrophy and maintain bone health. Fluid Redistribution and Swollen Faces In microgravity, bodily fluids shift upward, causing facial puffiness and fluid retention. Astronauts often joke about their 'moon faces.' This fluid redistribution can also affect vision, leading to a condition known as spaceflight-associated neuro-ocular syndrome (SANS). Isolation and Confinement: The Mental Strain Psychological Challenges Space missions involve prolonged isolation and confinement. Astronauts live in tight quarters, cut off from the natural world. The absence of familiar sights, sounds, and smells can lead to feelings of loneliness and anxiety. Coping mechanisms, communication with loved ones, and psychological support are crucial to maintaining mental well-being. Distance from Earth: A Cosmic Solitude Emotional Impact The vastness of space can evoke existential thoughts. Astronauts gaze back at Earth - a tiny blue dot suspended in the cosmic void - and grapple with their insignificance.
The emotional weight of being far from home, family, and friends can be profound. Hostile and Closed Environments: Surviving in the Void Spacecraft Living Conditions Spacecraft are marvels of engineering, but they are also confined capsules. Astronauts adapt to tight spaces, recycled air, and limited privacy. The constant hum of machinery and the absence of natural light can wear on their senses. Risk of Infection In closed environments, microbes thrive. Astronauts must maintain strict hygiene to prevent infections. The immune system faces unique challenges, especially during extended missions. The Resilience of Astronauts Adaptation and Innovation Astronauts are remarkable in their ability to adapt. They learn to navigate microgravity, perform complex tasks, and troubleshoot technical glitches. Their resilience drives innovation, leading to better spacecraft design and life support systems. The Twins Study: Scott and Mark Kelly Scott Kelly and his identical twin brother, Mark Kelly, participated in the unique Twins Study. Scott spent nearly a year aboard the ISS, while Mark remained on Earth. By comparing their physiological and psychological changes, researchers gained valuable insights into the effects of space travel. Looking Ahead: Mars and Beyond Challenges for Deep Space Missions As we plan for Mars missions and beyond, we face the RIDGE of space travel: Space Radiation: Shielding astronauts from cosmic rays. Isolation and Confinement: Maintaining mental health during long journeys. Distance from Earth: Coping with cosmic solitude. Gravity Fields: Addressing muscle and bone health. Hostile/Closed Environments: Ensuring safety and hygiene. In conclusion, space travel is a delicate balance between exploration and preservation. As we venture farther into the cosmos, we must safeguard both our scientific curiosity and the well-being of those who dare to explore the final frontier.
The Environmental Impact of Airplanes and Spaceships Airplanes and spaceships have transformed the way we explore our planet and beyond. However, their operations come with significant environmental consequences. Let's delve into the effects of these flying machines on our delicate ecosystem. Climate Change Air travel is a major contributor to climate change due to greenhouse gas emissions. Jet engines burn fossil fuels (mostly aviation gasoline or jet fuel), releasing carbon dioxide (CO2), nitrogen oxides (NOx), and water vapor into the atmosphere. These emissions trap heat, leading to global warming. Although aviation accounts for about 3.5 percent of human-induced climate change, its impact is disproportionately high due to emissions at high altitudes. Air Quality Airplanes emit pollutants such as sulfur dioxide (SO2), particulate matter (PM), and volatile organic compounds (VOCs). These pollutants degrade air quality near airports and along flight paths. Ground-level ozone formation, which harms human health and ecosystems, is also influenced by aviation emissions. Noise Pollution The roar of jet engines disrupts communities around airports. Noise pollution affects sleep patterns, stress levels, and overall well-being. Efforts to reduce noise include quieter engine designs and flight path adjustments. Spaceships: Earth's Atmospheric Guardians Rocket Launches and Pollution Rocket launches, essential for space exploration, release pollutants into the atmosphere. The fuel used - such as unsymmetrical dimethylhydrazine (UDMH) - can be highly carcinogenic and ecologically damaging. For instance, the Baikonur Cosmodrome in Kazakhstan, the world's oldest spaceport, has left a large zone of pollution due to toxic rocket fuel seeping into the soil. Carbon Particles and Geo-Engineering Recent research highlights the impact of rocket emissions on the atmosphere. Black carbon (soot) particles from rockets can absorb heat, acting as a form of geo-engineering. 
As commercial space launches increase, so does the concern about their environmental effects. Balancing Exploration and Preservation Space Tourism The rise of space tourism introduces new challenges. As more people venture beyond Earth, we must consider the cumulative impact of rocket emissions. Balancing our curiosity with environmental stewardship is crucial. Sustainable Practices Efforts are underway to develop cleaner propulsion technologies, use alternative fuels, and minimize space debris. Innovations like reusable rockets and electric propulsion aim to reduce the environmental footprint of space travel. Looking Ahead: A Cosmic Responsibility Mars and Beyond As we dream of Mars colonies and interstellar travel, we must tread carefully. The RIDGE of space exploration - Radiation, Isolation, Distance, Gravity, and Environment - requires sustainable solutions. Let's explore the cosmos while safeguarding our home planet. In conclusion, airplanes and spaceships propel us toward the stars, but their effects ripple through our atmosphere and ecosystems. As stewards of both Earth and space, we must navigate the skies responsibly, seeking harmony between exploration and preservation. From the ground to the sky, dining experiences have transcended traditional restaurant settings. Imagine savoring gourmet meals while suspended high above the earth, with breathtaking views stretching as far as the eye can see. Welcome to the world of aerial dining, where culinary delights meet gravity-defying elegance. Dinner in the Sky: Elevating Gastronomy The Original Concept Dinner in the Sky, born in 2006, is the epitome of dining with a twist. Picture a massive table - more like a platform - hoisted almost 200 feet into the air by a sturdy crane. Guests, chefs, and waitstaff don their white hats as they ascend to the skies. The setting? A floating dinner table, surrounded by nothing but open air and panoramic vistas. 
The Experience As you settle into your seat, the anticipation builds. The restaurant staff orchestrates a three-course fine dining experience, all while suspended in midair. The menu features carefully crafted dishes, often prepared beforehand and finished in a convection oven right there in the sky. Each bite is accompanied by awe-inspiring views - city skylines, rolling landscapes, or even the vastness of the ocean. Safety First Before you ascend, a safety briefing ensures that you're securely strapped in. The thrill of being airborne mingles with the elegance of haute cuisine. Whether it's a romantic date night or a corporate event, Dinner in the Sky promises an unforgettable meal. Sky-High Restaurants Around the World Dubai Marina: A Feast Above the Waters Situated in Dubai Marina, this dining concept boasts some of the best views of the city skyline, surrounding waters, and the iconic Palm Jumeirah. Imagine floating above the ground while you dine - a one-of-a-kind experience you simply cannot miss. After the safety briefing near Skydive Dubai, you're hoisted 50 meters into the air, suspended over the bustling marina. The fusion of flavors meets the fusion of horizons. Las Vegas: Unparalleled Views of the Strip In the entertainment capital of the world, Dinner in the Sky Las Vegas takes fine dining to new heights - literally. As the sun sets, you ascend, and the glittering lights of the Las Vegas Strip come alive. The most unforgettable dinner you'll ever have awaits, with the cityscape stretching out beneath you. It's a feast for the senses, where culinary artistry meets architectural marvels. The Future of Aerial Gastronomy Sustainability and Innovation As we look ahead, the challenge lies in balancing indulgence with environmental responsibility. How can we minimize the carbon footprint of these lofty dining experiences? 
Innovations like electric-powered cranes, locally sourced ingredients, and waste reduction strategies are steps toward a more sustainable future. Beyond Earth: Space Tourism and Cosmic Cuisine With the rise of space tourism, could we soon dine among the stars? Imagine a celestial restaurant aboard a spacecraft, overlooking Earth from orbit. Cosmic cuisine - crafted by zero-gravity chefs - might become the ultimate bucket-list experience. As we explore the cosmos, let's ensure that our gastronomic adventures leave no trace behind. In conclusion, dining in the air transcends mere sustenance. It's a celebration of human ingenuity, a fusion of flavors and vistas, and a reminder that our appetite for exploration knows no bounds. So, raise your glass (carefully!) to the skies and savor the magic of dining aloft. Dining in the Sky is a unique and exhilarating culinary experience that elevates traditional dining to new heights - literally. Here are the key aspects of this extraordinary concept: The Setting: Up, Up, and Away! Imagine being seated at a massive table suspended high above the ground, often hundreds of feet in the air. The dining platform is typically hoisted by a sturdy crane or other mechanical means. Guests, chefs, and waitstaff ascend together, creating an unforgettable communal experience. The Experience: A Feast with a View As you settle into your seat, anticipation builds. The thrill of being airborne mingles with the elegance of haute cuisine. The menu features carefully crafted dishes, often prepared beforehand and finished on-site. Whether it's breakfast, lunch, or dinner, each course is served against a backdrop of breathtaking views - city skylines, rolling landscapes, or even the vastness of the ocean. The floating table becomes a stage for culinary artistry, where flavors dance amidst the clouds. Safety First: Buckle Up! Before ascending, guests receive a safety briefing. Straps secure them to their seats, ensuring a worry-free dining experience. 
The focus shifts from gravity to gastronomy as the platform rises, leaving the ground far below. Locations Around the World: Where the Sky Meets the Plate Dubai Marina: Suspended above the bustling marina, diners enjoy views of the city skyline and the iconic Palm Jumeirah. Las Vegas: As the sun sets, guests ascend over the glittering lights of the Las Vegas Strip, creating an unparalleled dining spectacle. The Future: Sustainability and Cosmic Cuisine Balancing indulgence with environmental responsibility is crucial. Innovations like electric-powered cranes and locally sourced ingredients aim to reduce the carbon footprint. Could cosmic cuisine be next? With the rise of space tourism, imagine dining aboard a spacecraft, overlooking Earth from orbit. Zero-gravity chefs crafting celestial dishes - it's a tantalizing prospect. Introduction The sky, our celestial canvas, is a dynamic theater where cosmic phenomena unfold. From twinkling stars to majestic planets, the sky offers a mesmerizing display that captivates astronomers and dreamers alike. In this essay, we'll explore the various elements of celestial weather, from meteor showers to planetary alignments. Stars and Constellations Stellar Climates Stars, like earthly weather patterns, exhibit their own 'climates.' Some stars burn fiercely, radiating intense heat, while others are cooler and more temperate. The constellations, those celestial neighborhoods, form intricate patterns across the night sky. Imagine them as cosmic weather maps, guiding our eyes to distant realms. Meteor Showers: Celestial Rainfall Meteor showers are cosmic storms, where Earth passes through debris left behind by comets. As these tiny particles collide with our atmosphere, they ignite, creating streaks of light - the meteors. The Perseids in August and the Geminids in December are celestial fireworks, painting the sky with ephemeral beauty. 
Planets and Their Dance Planetary Weather Systems Our solar system hosts a diverse range of planets, each with its own atmospheric conditions. Venus, shrouded in thick clouds of sulfuric acid, experiences hurricane-force winds. Mars, with its rusty surface, battles dust storms that engulf the entire planet. Jupiter's Great Red Spot - a colossal storm - has raged for centuries. Conjunctions and Oppositions Planets engage in a cosmic ballet. Conjunctions occur when two planets appear close together in the sky, as if sharing a celestial embrace. Oppositions, on the other hand, position a planet directly opposite the Sun, making it visible all night. Witnessing Mars during opposition feels like meeting an old friend. Lunar Weather Phases of the Moon The Moon, Earth's faithful companion, cycles through its phases. New Moon, First Quarter, Full Moon - the lunar weather changes predictably. During a lunar eclipse, our planet casts a shadow on the Moon, turning it coppery red. It's a cosmic reminder of our place in the grand celestial drama. Tides: The Ocean's Cosmic Response The Moon's gravitational pull orchestrates tides on Earth. High tides and low tides ebb and flow, responding to lunar cues. The celestial dance between Earth, Moon, and Sun shapes our oceans, affecting coastlines and marine life. Celestial Events Comets: Cosmic Visitors Comets, celestial vagabonds, journey through our solar system. Their icy cores release gas and dust, forming magnificent tails. Halley's Comet, a recurring visitor, graces our skies once every 76 years. Its return is a cosmic homecoming. Supernovae: Stellar Explosions When massive stars reach the end of their lives, they explode in brilliant supernovae. These cosmic fireworks outshine entire galaxies. Witnessing a supernova - a rare event - is like glimpsing the universe's raw power. Conclusion As we gaze upward, let's remember that the sky is not merely a backdrop but a living, breathing entity. 
Its weather - both familiar and otherworldly - shapes our cosmic experience. So, next time you look up, consider the celestial forecast: a blend of stardust, wonder, and infinite possibilities. In the words of Carl Sagan, 'The cosmos is within us. We are made of star-stuff.' Cosmic Mysteries Dark Matter and Dark Energy The sky harbors secrets beyond our comprehension. Among them are dark matter and dark energy. Dark matter, invisible and elusive, exerts gravitational influence on galaxies, holding them together. Imagine it as the cosmic glue binding the universe. Dark energy, on the other hand, accelerates the universe's expansion, pushing galaxies apart. These cosmic enigmas remain shrouded in mystery, awaiting discovery. Auroras: Celestial Light Shows When charged particles from the Sun collide with Earth's magnetic field, they create auroras - the ethereal dance of light near the poles. The Northern Lights (Aurora Borealis) and Southern Lights (Aurora Australis) paint the night sky with hues of green, pink, and purple. These celestial ballets remind us of our interconnectedness with the solar system. Celestial Timekeeping Stellar Clocks The sky serves as humanity's oldest timekeeper. Ancient civilizations relied on celestial events for calendars. The sidereal day, based on Earth's rotation relative to distant stars, is approximately 23 hours, 56 minutes, and 4 seconds. Constellations rise and set, marking the passage of time - a cosmic heartbeat. Eclipses: Celestial Alignments Solar and lunar eclipses are cosmic alignments. During a solar eclipse, the Moon obscures the Sun, casting a shadow on Earth. The eerie twilight and the diamond ring effect evoke awe. Lunar eclipses, when Earth's shadow engulfs the Moon, transform it into a reddish orb - an astronomical spectacle witnessed by civilizations across millennia. Cosmic Harmony Music of the Spheres Ancient philosophers believed in the 'music of the spheres.' 
They imagined celestial bodies - planets, stars, and moons - emitting harmonious vibrations. Each celestial note contributed to a cosmic symphony. While we no longer hear this celestial music, its metaphorical resonance persists - a reminder that the universe hums with hidden melodies. Galactic Weather Patterns Galaxies, like weather systems, evolve. Spiral galaxies, with their graceful arms, resemble cosmic hurricanes. Elliptical galaxies, shaped like celestial footballs, harbor dormant black holes at their cores. Colliding galaxies create celestial tempests, birthing new stars. The cosmic weather forecast predicts galactic collisions, stellar births, and cosmic winds. Conclusion: Our Cosmic Home As we conclude our cosmic odyssey, remember that the sky is not an abstract canvas - it's our celestial home. Whether you're stargazing from a mountaintop or contemplating the Moon's craters, you participate in the grand cosmic narrative. The sky whispers tales of creation, destruction, and eternity. So, dear reader, look up. Embrace the celestial weather - the storms and serenades. For in the vastness of space, we find wonder, humility, and a shared cosmic kinship. As Carl Sagan eloquently put it, 'We are a way for the cosmos to know itself.' Introduction The universe is a symphony, and planets are its celestial notes. These enigmatic orbs dance around stars, weaving tales of creation, destruction, and cosmic balance. In this essay, we embark on a cosmic journey to explore the eight planets of our solar system and their profound significance. Mercury: The Swift Messenger Mercury, the swiftest planet, orbits closest to the Sun. Its surface is a rugged landscape of craters and cliffs, baked by scorching temperatures during the day and chilled at night. Named after the Roman messenger god, Mercury shuttles between extremes, delivering cosmic messages across the solar system. Venus: Earth's Fiery Twin Venus, Earth's twin sister, hides behind thick clouds of sulfuric acid. 
Its surface resembles a volcanic inferno, with temperatures hot enough to melt lead. Yet, its beauty lies in its radiant glow - the Morning and Evening Star - illuminating our dawn and dusk. Earth: Our Blue Gem Earth, our precious home, teems with life. Its oceans, forests, and deserts form a delicate biosphere. From the icy poles to the equatorial rainforests, Earth's diverse climates sustain a symphony of ecosystems. We are its guardians, entrusted with its care. Mars: The Red Planet's Mysteries Mars, the Red Planet, beckons explorers. Its rusty surface bears ancient river valleys and polar ice caps. Could Mars harbor hidden reservoirs of life? Robotic rovers traverse its deserts, seeking answers beneath its crimson skies. Jupiter: King of the Gas Giants Jupiter, the colossal gas giant, boasts a mesmerizing tapestry of bands and storms. Its Great Red Spot - a tempest larger than Earth - has raged for centuries. Jupiter's gravitational pull shapes the solar system, protecting inner planets from cosmic debris. Saturn: Jewel of the Rings Saturn, adorned with majestic rings, is a cosmic jewel. These icy hoops, composed of countless particles, create a celestial ballet. Saturn's moons - Titan, Enceladus, and others - beckon us to explore their icy landscapes. Uranus: The Original Ice Giant Uranus, tipped on its side, spins like a cosmic top. Its icy blue hue conceals turbulent storms. Uranus remains a mystery, awaiting further study by future missions. Neptune: The Farthest Wanderer Neptune, shrouded in azure clouds, is the outermost planet. Its winds whip at supersonic speeds, and its icy heart harbors storms that rival Jupiter's. Voyager 2, our interstellar traveler, captured Neptune's beauty as it sailed past. Conclusion: Cosmic Harmony Planets are cosmic harmonizers. Their gravitational dances sculpt orbits, stir tides, and guide comets. They remind us of our place in the grand cosmic orchestra. 
As we gaze at the night sky, let us cherish these celestial companions - the guardians of harmony. In the words of Carl Sagan, 'We are made of star-stuff.' Our existence echoes the cosmic rhythm, and planets are our celestial partners in this cosmic waltz. Pluto, once considered our ninth planet, now holds the title of a dwarf planet. The International Astronomical Union (IAU) made this reclassification in 2006. Pluto didn't meet one of the three criteria the IAU uses to define a full-sized planet: it has not cleared its neighboring region of other objects. Despite its demotion, Pluto remains a fascinating member of the Kuiper belt, a ring of bodies beyond Neptune's orbit. It is the ninth-largest and tenth-most-massive known object to directly orbit the Sun. Although smaller than Earth's moon, Pluto's icy and rocky composition continues to intrigue astronomers and stargazers alike. NASA's New Horizons mission is a remarkable endeavor that has expanded our understanding of the outer reaches of our solar system. Let's delve into the details of this pioneering spacecraft: Objective: New Horizons was designed to study the dwarf planet Pluto, its moons, and other objects in the Kuiper Belt. Launch Date: On January 19, 2006, New Horizons embarked on its epic journey. Spacecraft Mass: Weighing 1,054 pounds (478 kilograms), it carried a suite of scientific instruments. Mission Design and Management: The mission was led by NASA in collaboration with the Johns Hopkins University Applied Physics Laboratory (APL). Historic Flyby: On July 14, 2015, New Horizons made history by becoming the first spacecraft to explore Pluto up close. It captured stunning images of Pluto's diverse geological features, including its icy plains, rugged mountains, and frozen canyons. Moons of Pluto: During the flyby, New Horizons also studied Pluto's five moons, including the intriguing Charon. 
Arrokoth Flyby: In early 2019, New Horizons achieved another milestone by flying past Arrokoth (2014 MU69). Arrokoth is a Kuiper Belt Object, making it the most distant object ever explored up close. Kuiper Belt: This region extends from about 30 AU (near Neptune's orbit) to about 50 AU from the Sun. New Horizons ventured into this uncharted territory. New Horizons carried an impressive array of instruments, including: Ralph: A visible and infrared imager/spectrometer. Alice: An ultraviolet imaging spectrometer. Radio-Science Experiment (REX): Studied radio signals. Long-Range Reconnaissance Imager (LORRI): Captured high-resolution images. Solar Wind and Plasma Spectrometer (SWAP): Analyzed solar wind. Pluto Energetic Particle Spectrometer Science Investigation (PEPSSI): Studied particles around Pluto. Student Dust Counter (SDC): Measured dust impacts. New Horizons provided insights into Pluto's atmosphere, surface, and geology. It revealed icy mountains, glaciers, and mysterious dark regions. The spacecraft also observed Jupiter's moons (Io, Europa, and Ganymede) during its long journey. As of 2023, New Horizons continues to explore the outer solar system, contributing to our understanding of distant bodies. In summary, New Horizons has been a trailblazer, revealing the secrets of Pluto and venturing into the cosmic frontier. Its legacy inspires future missions and fuels our curiosity about the cosmos. ", + "8192": "Once upon a time, in a quaint little village nestled amidst rolling hills, there existed an old teapot. But this was no ordinary teapot; it was a magical one. Its handle curved just so, and its spout seemed to whisper secrets to the wind. The villagers called it 'Elara,' and they believed it held the power to grant wishes. Elara sat on the windowsill of Mrs. Abernathy's cozy cottage. Mrs. Abernathy was a kind-hearted woman with twinkling eyes and a penchant for herbal teas. 
She'd inherited the teapot from her grandmother, who, in turn, had received it from a mysterious traveler. One chilly evening, as the sun dipped below the horizon, Mrs. Abernathy brewed her favorite chamomile tea. She poured the fragrant liquid into Elara, and to her astonishment, the teapot began to glow. The room filled with a soft, golden light, and Mrs. Abernathy felt a tingle in her fingertips. 'Make a wish,' whispered Elara, its spout quivering. Mrs. Abernathy hesitated. She'd heard tales of wishes gone awry - of greedy desires leading to unintended consequences. But her heart yearned for something simple: a garden filled with blooming roses. So, she closed her eyes and wished for just that. The next morning, Mrs. Abernathy stepped outside, and her breath caught. The air smelled of roses - sweet and heady. But when she looked around, she gasped. Her modest garden had transformed into a riot of colors. Roses of every hue - crimson, ivory, apricot - bloomed in profusion. They climbed the walls, twined around the picket fence, and even spilled onto the cobblestone path. Word spread throughout the village, and soon everyone wanted a turn with Elara. The baker wished for the perfect sourdough loaf, and it appeared in his oven. The blacksmith wished for strength, and his arms bulged with newfound muscle. The schoolteacher wished for wisdom, and her lectures became captivating tales. But as wishes multiplied, so did the consequences. The baker's sourdough grew sentient and demanded to be called 'Doughbert.' The blacksmith's strength made him accidentally crush his anvil. And the schoolteacher's wisdom led her to question the very fabric of reality. Mrs. Abernathy watched with a mix of amusement and concern. Elara seemed to thrive on granting wishes, but its porcelain surface bore faint cracks. Was it growing weaker? One day, a young girl named Lily approached Elara. Her eyes sparkled with innocence, and she clutched a dandelion in her hand. 
'Teapot,' she said, 'I wish for a friend.' Elara hesitated. It sensed the purity of Lily's heart, but it also knew the weight of loneliness. With a shudder, it granted the wish. And so, Lily's dandelion transformed into a giggling sprite named Petal. They danced through meadows, shared secrets, and became inseparable. Elara's cracks deepened, but it didn't mind. As seasons passed, Mrs. Abernathy sat by the window, watching Elara fade. Yet, she felt no regret. For in granting wishes, the teapot had found purpose. And perhaps, just perhaps, it had one final wish left - to be remembered. And so, when Mrs. Abernathy's time came, she whispered to Elara, 'Thank you.' The teapot glowed one last time, and Mrs. Abernathy drifted away, leaving behind a garden of roses and a village full of stories. And that, my dear reader, is how the enchanted teapot became a legend - a vessel of magic, love, and wishes granted with a fragile heart. As the seasons changed, so did the village. The once-sleepy hamlet now buzzed with visitors from distant lands. They came seeking Elara, the legendary teapot that granted wishes. Some sought riches, others fame, but most yearned for something deeper - a connection to the mystical. Among the newcomers was a weary traveler named Ezra. His cloak was tattered, and his boots bore the marks of countless miles. He'd heard whispers of Elara's magic and hoped it could mend his broken heart. For Ezra had lost his beloved, and grief weighed upon him like an anchor. Mrs. Abernathy, now an old woman with silver hair, welcomed Ezra into her cottage. Elara sat on the windowsill, its porcelain surface etched with memories. Mrs. Abernathy poured chamomile tea into the teapot, and it glowed faintly, as if recognizing an old friend. 'Make a wish,' Mrs. Abernathy said, her voice soft. Ezra hesitated. His wish was simple yet profound: to see his love once more, if only in a dream. He closed his eyes and whispered, 'I wish for a single night with her.' 
Elara trembled, its spout quivering. It understood the ache of lost love - the longing that transcended time. And so, it granted Ezra's wish. That night, as the moon hung low in the sky, Ezra lay on Mrs. Abernathy's creaky bed. Elara sat beside him, its glow illuminating the room. He drifted into slumber, and there, in the realm between wakefulness and dreams, he found himself in a moonlit garden. His love, Isolde, stood before him. Her eyes were the color of forget-me-nots, and her laughter echoed like wind chimes. They danced beneath a silver canopy, twirling through memories - their first kiss, stolen moments by the river, promises whispered under ancient oaks. But dreams are fragile, and dawn approached. Isolde's form wavered, and Ezra clung to her. 'Stay,' he pleaded. 'Just a little longer.' Isolde smiled, her touch like a butterfly's kiss. 'Time bends here,' she said. 'But you must wake, my love.' As the sun peeked over the horizon, Ezra opened his eyes. Elara sat on the windowsill, its glow fading. Mrs. Abernathy watched him, her gaze knowing. 'Did you see her?' she asked. Ezra nodded, tears glistening. 'She was real, Mrs. Abernathy. I held her again.' The village marveled at Ezra's tale - the man who danced with a ghost. They flocked to Elara, each with their wishes. The blacksmith wished for forgiveness, the baker for inspiration, and the schoolteacher for courage. Elara obliged, its cracks deepening, but it never complained. One day, as winter painted the landscape white, Mrs. Abernathy grew frail. She called Ezra to her bedside. 'Elara's magic wanes,' she whispered. 'But it has one final wish.' Ezra knelt beside her. 'What is it?' 'Take Elara beyond the hills,' Mrs. Abernathy said. 'To the ancient oak where Isolde and I carved our initials. There, bury the teapot. It will become part of the earth, and its magic will seep into the roots.' And so, on a frost-kissed morning, Ezra carried Elara to the oak. 
He dug a small hole, placed the teapot inside, and covered it with soil. As he patted the ground, he felt a tremor - a farewell. The next spring, the oak bloomed with roses - crimson, ivory, apricot. And in its shade, a dandelion sprouted. Its petals glowed like moonlight, and when the wind whispered, it carried echoes of laughter. Ezra knew then that Elara's wish had come true. It had become part of the land, woven into the fabric of stories. And perhaps, just perhaps, it still listened, granting silent wishes to those who believed. And so, the legend of Elara lived on - a teapot turned earth, a vessel of love, and a bridge between worlds. In the heart of the Whispering Forest, where ancient trees leaned close and their leaves murmured secrets, lived a young girl named Evelyn. She had eyes the color of moss and hair that tangled like wild vines. Evelyn was no ordinary child; she could hear the forest's whispers - the soft rustle of leaves, the creaking of branches, and the laughter of unseen creatures. The villagers feared the Whispering Forest. They said it was cursed - a place where time flowed differently, where shadows danced with mischief, and where lost souls wandered forever. But Evelyn felt drawn to its heart. She believed the forest held answers - about her missing parents, about the world beyond the village. One moonlit night, when the forest beckoned with silver fingers, Evelyn slipped away from her tiny cottage. She wore a cloak spun from spider silk and carried a lantern that glowed like a captured star. The trees leaned in, their bark etched with ancient runes. They whispered her name - Evelyn, Evelyn - as if they knew her purpose. Deeper she ventured, past gnarled roots and dew-kissed ferns. The air smelled of moss and memories. The lantern's light flickered, casting eerie shadows on the forest floor. And then, she heard it - the melody of the Whispering Forest. It was a haunting tune, sung by unseen lips, and it tugged at her heart. 'Who are you?' 
Evelyn whispered. The forest answered - a chorus of voices, overlapping and harmonizing. 'We are the echoes of forgotten dreams, the guardians of lost paths. Seek what you desire, but beware the price.' Evelyn pressed on. She reached a clearing where moonflowers bloomed - a sea of pale petals that glowed like fallen stars. In their midst stood a stone pedestal, and atop it rested a silver key. It was unlike any key she'd seen - twisted and delicate, with a single emerald set in its bow. The whispers intensified. 'Take the key,' they urged. 'Unlock the door to your destiny.' Evelyn hesitated. What door? What destiny? She thought of her parents - their laughter, their scent of pine and adventure. They'd vanished when she was a baby, leaving only a crumpled map with cryptic symbols. With trembling fingers, she picked up the key. It felt warm, alive. And then, she saw it - a door, half-hidden behind an ancient oak. Its wood was etched with constellations, and its handle bore the same emerald as the key. Evelyn inserted the key into the lock. The door groaned open, revealing a tunnel - a ribbon of darkness that wound deeper into the forest. The whispers grew urgent. 'Step through, Evelyn. Find your truth.' She stepped into the tunnel, and the world shifted. Time blurred, and she glimpsed her parents - laughing, dancing, fading like smoke. The tunnel led to a chamber - a celestial cavern where stars swirled in liquid patterns. And there, on a stone pedestal, lay a crystal vial. The whispers crescendoed. 'Drink,' they urged. 'Remember.' Evelyn uncorked the vial. Memories flooded her - the scent of pine, her parents' laughter, the taste of adventure. Tears blurred her vision. She drank, and the forest embraced her - a cocoon of whispers, of love, of belonging. When Evelyn emerged, the Whispering Forest had changed. It no longer whispered of curses but sang of hope. She carried her parents' memories - their legacy - and vowed to protect the forest's secrets. 
And so, Evelyn became the new guardian. She tended the moonflowers, listened to the trees, and sang the haunting melody. The villagers no longer feared the forest; they sought its solace, its magic. And every night, as the moon rose, Evelyn stood by the ancient oak. She whispered her parents' names, and the forest whispered back - a lullaby woven from stardust and love. Beyond the Whispering Forest, where the moonflowers bloomed and the stars whispered secrets, lay a forgotten path. It was a narrow trail, overgrown with moss and guarded by ancient stones. Few dared to tread there, for it led to the Compass Grove. Lysander, a young cartographer with ink-stained fingers and a heart full of wanderlust, stumbled upon this path one misty morning. His boots sank into damp earth, and the air smelled of pine and possibility. He carried a tattered map - a relic passed down through generations. Its edges bore cryptic symbols, and its center held a blank space - an uncharted territory. The Compass Grove was said to house a mystical compass - the Wayfinder's Compass - forged by the first explorers. It was no ordinary instrument; it pointed not to north, but to one's true desire. Legends whispered that whoever held the compass could navigate not only the physical world but also the labyrinth of their own heart. Lysander's pulse quickened. He yearned for adventure - to map uncharted lands, to unravel mysteries. His parents had vanished during an expedition, leaving behind a single clue: the blank space on the map. Perhaps the Compass Grove held answers. As he pushed through brambles and ferns, the forest seemed to guide him. Sunlight filtered through leaves, casting dappled patterns on the ground. And then, he saw it - a circle of ancient stones, their surfaces etched with symbols. At the center stood a pedestal, and atop it rested the Wayfinder's Compass. Lysander's breath caught. The compass was unlike any he'd seen. 
Its needle shimmered like a captured star, and its dial bore not cardinal directions but enigmatic words: Dreams, Regret, Destiny, and Hope. He touched the compass, and it hummed - a vibration that resonated in his bones. The whispers began - the voices of long-lost explorers, of forgotten dreams. 'Choose,' they urged. 'Choose your path.' Lysander hesitated. Dreams? Regret? Destiny? Hope? Each word held a promise, a peril. He thought of his parents - their laughter, their courage. He thought of the blank space on the map - the uncharted territory that beckoned. And so, he turned the dial to Dreams. The needle quivered, then settled - a path leading deeper into the forest. Lysander followed, lantern in hand, heart pounding. The compass guided him past silver streams and ancient oaks. It led him to a hidden waterfall - a curtain of moonlight that shimmered like stardust. There, he glimpsed a figure - a woman with eyes like forgotten constellations. She wore a cloak spun from spider silk, and her hair flowed like a river. 'Lysander,' she said, her voice a melody. 'You seek dreams.' He nodded. 'I seek answers. About my parents.' The woman touched his forehead, and memories flooded him - the scent of pine, his parents' laughter, the taste of adventure. 'Dreams are maps,' she said. 'They guide us beyond what we see.' Lysander understood. Dreams were compasses of the soul. His parents had followed theirs, and now he would follow his. He stepped through the waterfall, and the world shifted. He found himself on a cliff overlooking a vast sea - a sea of blank parchment. Islands floated in the distance, waiting to be charted. Lysander unrolled his map - the one with the blank space - and dipped his quill. He drew coastlines, marked mountains, and named each land. And as he mapped, the compass glowed - a beacon of dreams fulfilled. Lysander knew then that he was not merely a cartographer; he was a dreamweaver. 
His parents' legacy flowed through him - their courage, their laughter, their love. And so, Lysander sailed the uncharted seas, guided by the Wayfinder's Compass. He discovered islands of forgotten myths, forests of whispered tales, and cities where stars danced in the streets. He wrote his own story - a cartography of dreams. And in the Compass Grove, the ancient stones whispered his name - Lysander, Lysander - as if they knew he'd found his true north. In the heart of the city, where cobblestone streets wound like forgotten memories, stood an abandoned mansion. Its windows were boarded up, and ivy clung to its crumbling walls. But within those decaying walls lay a secret - a clockwork garden. Evelyn, a curious girl with eyes like rain-kissed petals, discovered the mansion one rainy afternoon. She wore mismatched socks and carried a notebook filled with sketches - a testament to her love for hidden wonders. The mansion's gate creaked open, and Evelyn stepped into a world frozen in time. The clockwork garden was unlike any other. Its flowers were made of gears and springs, their petals unfolding with precise clicks. The roses ticked, the daffodils whirred, and the tulips chimed. And at the center stood a colossal mechanical tree - its branches reaching toward the sky, its leaves spinning like miniature windmills. Evelyn gasped. She'd read about clockwork wonders - the automatons that danced at royal balls, the pocket watches that whispered secrets. But this garden was alive - a symphony of metal and magic. As she explored, she noticed a silver key embedded in the tree's trunk. It gleamed, beckoning her. Evelyn hesitated. What did the key unlock? And why had the clockwork garden been abandoned? The flowers seemed to whisper. 'Unlock the tree,' they urged. 'Discover its heart.' Evelyn turned the key. The tree shuddered, and its branches parted, revealing a hidden chamber. Inside, a mechanical heart pulsed - a delicate contraption of brass and crystal. 
It hummed, resonating with the rhythm of forgotten time. And then, she heard it - the voice of the tree. 'I am Chronos,' it said. 'Guardian of moments.' Evelyn's heart raced. 'Moments?' 'Every petal, every leaf,' Chronos explained. 'They hold memories - the laughter of lovers, the tears of parting, the whispers of dreams. But time has fractured. The clockwork garden is frozen, and I am fading.' Evelyn understood. The mansion's former owner - a clockmaker named Lysander - had built this garden to capture fleeting moments. But Lysander had vanished, leaving Chronos incomplete. 'I can mend you,' Evelyn said. 'But why was the garden abandoned?' Chronos sighed - a sound like winding gears. 'Lysander sought eternity. He believed that by freezing time, he could preserve love, prevent loss. But he forgot that life thrives in impermanence.' Evelyn touched the mechanical heart. 'Can we fix it?' Chronos nodded. 'You must find Lysander's final creation - the Celestial Gear. It lies beyond the city, where the river meets the stars.' And so, Evelyn embarked on her quest. She followed the river, past moonlit bridges and forgotten docks. The Celestial Gear awaited - a constellation of interlocking wheels, its center a pulsing light. As she placed the gear in Chronos's heart, the clockwork garden stirred. Flowers bloomed, petals unfurling with joy. The mechanical tree's leaves spun faster, and time flowed once more. But Chronos grew weaker. 'I am bound to this place,' it said. 'My purpose fulfilled.' Evelyn wept. 'Can't you come with me?' Chronos smiled - a clockwork smile. 'I am part of the garden now. But you, dear Evelyn, carry its memory.' And so, she returned to the mansion, where the clockwork garden thrived. She sketched its wonders, capturing gears and petals on paper. And when she closed her eyes, she heard the whispers - the laughter of lovers, the tears of parting, the echoes of dreams. Evelyn became the new guardian. 
She tended the flowers, wound the tree, and listened to Chronos's fading heartbeat. And every night, as the stars wheeled overhead, she whispered her thanks. For in the heart of the clockwork garden, time danced - a fragile waltz of moments, preserved and cherished. In the heart of the Astronomer's Quarter, where cobblestone streets wound like celestial paths, stood an ancient observatory. Its domed roof bore the scars of countless meteor showers, and its telescopes whispered secrets to the night sky. But within those hallowed walls lay a mystery - a forgotten constellation. Aria, a young stargazer with eyes like distant galaxies, discovered the observatory one moonless night. She wore a cloak spun from stardust and carried a pocket-sized atlas - a testament to her love for the heavens. The observatory's door creaked open, and Aria stepped into a world woven with cosmic threads. The forgotten constellation was unlike any other. Its stars were elusive, their positions shifting with each passing century. Astronomers had once mapped it - a celestial tapestry of myth and memory - but over time, its name faded, its stories lost. As Aria explored, she noticed a silver quill resting on an ancient star chart. Its nib gleamed, beckoning her. Aria hesitated. What secrets did the quill hold? And why had the forgotten constellation slipped from memory? The stars seemed to whisper. 'Write,' they urged. 'Illuminate the night.' Aria dipped the quill in ink. The constellations above shifted - a celestial dance awaiting completion. She traced the forgotten lines - the Hunter's Bow, the Weaver's Loom, the Lost Lyre. And then, she saw it - a gap in the sky, a void where a constellation once blazed. The quill hummed - a vibration that resonated in her bones. The whispers intensified. 'Remember,' they urged. 'Remember the story.' And so, Aria wrote - a tale woven from stardust and longing. She penned the forgotten constellation's name: Lyra's Veil. 
Its stars had once guided lovers across oceans, inspired poets to verses, and cradled dreams in their luminous arms. But Lyra's Veil had vanished - a casualty of time's relentless march. Its stories faded, its purpose lost. Aria vowed to restore it - to stitch the celestial fabric, thread by thread. She climbed to the observatory's rooftop, where telescopes pointed toward infinity. Aria gazed at the sky, her breath mingling with the Milky Way. And there, in the gap, she saw it - the faint glimmer of Lyra's Veil. The quill guided her. She drew the missing lines - the Weaver's Loom reconnected, the Lost Lyre's melody restored. And as she wrote, the stars responded. Lyra's Veil emerged - a constellation reborn. But Aria felt a pull - a cosmic yearning. She touched the quill to her heart, and memories flooded her - the scent of stardust, her grandmother's bedtime stories, the taste of wonder. 'Guard it,' whispered the stars. 'Guard Lyra's Veil.' And so, Aria became the new guardian. She tended the observatory, charted the skies, and whispered the forgotten stories. The astronomers marveled - the gap was gone, and Lyra's Veil blazed once more. But Aria knew her duty. She would write new tales - of love, of courage, of dreams stitched together. And every night, as the constellations wheeled overhead, she whispered her thanks. For in the heart of the forgotten constellation, time danced - a fragile waltz of memory, preserved and cherished. In the heart of the bustling city, where skyscrapers touched the clouds and neon signs flickered like distant stars, lived a forgotten runner named Evelyn. She wasn't famous like the sprinters on billboards or the marathon champions with their gleaming medals. No, Evelyn was an ordinary woman who ran for the sheer joy of it. Every morning, before the sun peeked over the horizon, Evelyn laced up her worn-out sneakers. 
She followed the same route - a loop around the park, past the fountain where pigeons bathed, and along the riverbank where willow trees whispered secrets. Her pace was steady, her breaths rhythmic. She ran not to win races but to escape the noise of life - to find solace in the rhythm of her footsteps. But the city had forgotten Evelyn. The sports channels didn't broadcast her runs, and the local newspapers didn't write about her achievements. She was a lone figure - a silhouette against the dawn, chasing dreams that no one else cared about. One chilly morning, as Evelyn jogged along the river, she noticed a poster taped to a lamppost. It announced the city's annual marathon - the grand event that drew elite athletes from around the world. Evelyn's heart skipped a beat. She'd never run a marathon, but the idea tugged at her like a distant constellation. She tore off the poster and studied it. The race would wind through the city's streets, past cheering crowds and historic landmarks. The finish line was the grand stadium - the same stadium where she'd watched her heroes cross the tape, their names echoing through the loudspeakers. Evelyn hesitated. She wasn't a professional runner. She didn't have a coach or a team. But something stirred within her - a longing to be part of the marathon, to leave her mark on the city she loved. And so, she trained. She woke earlier, ran farther, and pushed her limits. She practiced pacing, fueled by oatmeal and determination. The other runners didn't notice her - a middle-aged woman with graying hair - but Evelyn didn't mind. She was a comet streaking through the pre-dawn darkness, fueled by her own quiet fire. On marathon day, the city buzzed with excitement. The streets were lined with spectators - families with homemade signs, old couples in folding chairs, children waving tiny flags. The elite runners surged ahead, their strides effortless. But Evelyn was in the middle of the pack - a forgotten runner among thousands. 
As she crossed each mile marker, Evelyn felt a surge of pride. She wasn't breaking records, but she was breaking barriers - the ones she'd built around herself. The cheers of the crowd fueled her - their encouragement like solar winds pushing her forward. And then, at mile 20, exhaustion hit. Evelyn's legs wobbled, her breaths came in ragged gasps. She glanced at the grand stadium - the finish line shimmering like a distant galaxy. But her body rebelled. She wanted to collapse, to fade into anonymity. And that's when she saw him - a young boy with a crumpled sign. It read, 'Go, Evelyn! You're not forgotten.' Tears blurred her vision. She pushed through the pain, her heartbeat a metronome of determination. As Evelyn crossed the finish line, the crowd erupted. The loudspeakers blared her name - Evelyn, Evelyn - and the forgotten runner became a star. She collapsed into the arms of a volunteer, her legs trembling. But she'd done it. She'd run the marathon - the one that mattered to her. The newspapers wrote about her - the woman who defied odds, who ran not for glory but for love. And the city remembered Evelyn - the forgotten runner who'd become a constellation, lighting the way for others. Lysander stood at the finish line of the marathon, his chest heaving, sweat-soaked shirt clinging to his skin. The stadium roared - a symphony of applause and encouragement. But amidst the cheers, he felt a void - an ache that no medal could fill. He'd run the race - the one that mattered to him. Yet, as he caught his breath, Lysander wondered about the blank space on his map. The uncharted territory - the reason his parents had vanished - still haunted him. A shadow fell across the track. It was Evelyn, the forgotten runner. Her eyes sparkled with determination, and her worn-out sneakers bore the marks of countless miles. She'd finished the marathon too, her name echoing through the loudspeakers. 'Evelyn,' Lysander said, his voice hoarse. 'Why do we run?' 
She leaned against the railing, gazing at the city beyond. 'For the same reason we map,' she replied. 'To find what's lost.' Lysander nodded. 'The Compass Grove,' he said. 'The Wayfinder's Compass.' Evelyn's eyes widened. 'You know of it?' He traced the blank space on his map - the gap where the forgotten constellation should be. 'My parents sought it,' Lysander confessed. 'They believed it held answers - about time, about destiny.' Evelyn's fingers brushed the silver quill in her pocket. 'And did they find it?' He shook his head. 'They vanished. But I won't stop searching.' Together, they left the stadium - the forgotten runner and the cartographer. They followed the same path - the one that led beyond the city, into the Whispering Forest. The compass guided them - the needle pointing not to north, but to dreams. As they reached the ancient stones of the Compass Grove, Evelyn gasped. 'Look,' she said, her voice hushed. There, etched into the stones, were symbols - the Weaver's Loom, the Lost Lyre, and the Hunter's Bow. And at the center stood the pedestal - the Wayfinder's Compass. Lysander touched it - the needle quivering. 'What do we seek?' he asked. Evelyn's eyes held galaxies. 'Not just answers,' she said. 'But connection - to the forgotten, to each other.' And so, they turned the dial - to Hope. The compass hummed, and the forest whispered. A path opened - a ribbon of moonlight leading deeper. They stepped through, and the world shifted. Stars swirled - a celestial dance. And there, in the gap, they saw it - the forgotten constellation. Lyra's Veil blazed - a tapestry of memories, stitched by stardust. Its stars guided lovers, inspired poets, and cradled dreams. Lysander and Evelyn held hands - the cartographer and the runner. They traced the lines - the Weaver's Loom reconnected, the Lost Lyre's melody restored. And as they gazed at Lyra's Veil, they felt it - a cosmic yearning. 
Not for fame or medals, but for eternity - the kind woven into forgotten constellations. Together, they whispered their thanks - to the stars, to the forest, to each other. In the small town of Maplewood, basketball was more than a game - it was a way of life. The local high school gym, with its creaky wooden floors and flickering lights, held memories etched into the hearts of generations. Tommy Reynolds, a lanky teenager with dreams as big as the full moon, had grown up shooting hoops in that gym. His father, a former basketball star, had taught him the art of the game - the perfect arc of a jump shot, the rhythm of dribbling, and the magic of teamwork. But Tommy wasn't like his father. He lacked the height and the natural talent. Still, he practiced tirelessly, his sneakers squeaking on the polished floor. He'd stare at the faded championship banners hanging from the rafters - the ones his father had helped win - and imagine his own name there someday. Senior year arrived, and Tommy made the varsity team. He wasn't a star player, but he hustled, diving for loose balls and setting screens. The crowd cheered louder for the flashy slam dunks, but Tommy's heart beat for the fundamentals - the bounce pass, the defensive stance, the pick-and-roll. The state championship game loomed - a David-and-Goliath matchup against the undefeated Oakwood Tigers. They had a towering center, a lightning-fast point guard, and a reputation for crushing opponents. Maplewood was the underdog, the team with heart but not much else. As the final seconds ticked away, the score was tied. Tommy stood at center court, sweat dripping down his face. The gym seemed to hold its breath. He glanced at the banners - the ghosts of champions past urging him on. The ball found its way to Tommy. He dribbled, eyes scanning the court. His father's voice echoed in his mind: 'Trust your instincts, son.' He drove toward the basket, the Tigers' defense closing in. 
But instead of taking the shot, Tommy passed - the perfect bounce pass to his teammate, Danny. Danny leaped, releasing the ball just as the buzzer sounded. The gym erupted. The ball swirled through the net - a miracle shot that defied physics. Maplewood had won - the underdogs had toppled the giants. Tommy's teammates lifted him on their shoulders. The crowd chanted his name. But as he glanced at the banners, he knew the truth. It wasn't just his shot - it was the culmination of every bounce pass, every defensive stance, every pick-and-roll. His father hugged him - a rare display of emotion. 'You did it, Tommy,' he whispered. 'You made your mark.' And there, in the glow of victory, Tommy realized that sometimes the greatest miracles happen at center court - not in the spotlight, but in the quiet moments of practice, persistence, and heart." +} diff --git a/onnxruntime/python/tools/transformers/models/llama/quant_kv_dataloader.py b/onnxruntime/python/tools/transformers/models/llama/quant_kv_dataloader.py index e8b563261001b..33084aec214c2 100644 --- a/onnxruntime/python/tools/transformers/models/llama/quant_kv_dataloader.py +++ b/onnxruntime/python/tools/transformers/models/llama/quant_kv_dataloader.py @@ -1,3 +1,8 @@ +# ------------------------------------------------------------------------- +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License. See License.txt in the project root for +# license information. 
+# -------------------------------------------------------------------------- import argparse import numpy as np From 2bc29244b4b6992667d06446c839426917945a29 Mon Sep 17 00:00:00 2001 From: Baiju Meswani Date: Fri, 22 Mar 2024 10:28:44 -0700 Subject: [PATCH 54/55] Support model with multiple SCE loss nodes (#20016) --- .../orttraining/core/framework/gradient_graph_builder.cc | 5 +++++ .../orttraining/core/optimizer/insert_output_rewriter.cc | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/orttraining/orttraining/core/framework/gradient_graph_builder.cc b/orttraining/orttraining/core/framework/gradient_graph_builder.cc index d66591318d5c7..2ee4b5e1a173d 100644 --- a/orttraining/orttraining/core/framework/gradient_graph_builder.cc +++ b/orttraining/orttraining/core/framework/gradient_graph_builder.cc @@ -210,6 +210,11 @@ NodeSet GradientGraphBuilder::ReverseBFSWithStopGradient(const NodeSet& nodes) c continue; } const NodeArg* node_arg = n->InputDefs()[edge_it->GetDstArgIndex()]; + if (!node_arg) { + LOGS(logger_, VERBOSE) << "Skip building gradient for input_" << edge_it->GetDstArgIndex() + << " of node: " << n->Name() << " because it is not found in the graph."; + continue; + } const auto [is_tensor_type, is_allowed_type_for_grad, type] = IsAllowedForGradient(graph_, node_arg); if (is_tensor_type) { if (!is_allowed_type_for_grad) { diff --git a/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc b/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc index 2aade8c9bc1f9..61fc8d5492c2b 100644 --- a/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc +++ b/orttraining/orttraining/core/optimizer/insert_output_rewriter.cc @@ -44,7 +44,7 @@ Status InsertSoftmaxCrossEntropyLossOutput::Apply(Graph& graph, Node& node, Rewr t.mutable_tensor_type()->mutable_shape()->CopyFrom(*X->Shape()); // log probability should have the same shape as logits. 
} - NodeArg& node_arg = graph.GetOrCreateNodeArg(X->Name() + "_log_prob", &t); + NodeArg& node_arg = graph.GetOrCreateNodeArg(graph.GenerateNodeArgName(X->Name() + "_log_prob"), &t); outputs.push_back(&node_arg); From 7e84ba0ea30f3642c75d8d3fce5626766ce5a20e Mon Sep 17 00:00:00 2001 From: Abhishek Jindal Date: Fri, 22 Mar 2024 10:39:19 -0700 Subject: [PATCH 55/55] remove const cast for DLManagedTensor (#20015) ### Description Removing const_cast as it might lead to unknown behavior. Specifying DLManagedTensor as const doesn't seem to be necessary, and I have tested this by running torch_ort.configure. Not sure what other tests need to be done. Background can be found in this [PR](https://github.com/microsoft/onnxruntime/pull/19982) ### Motivation and Context --- .../torch_cpp_extensions/aten_op_executor/aten_op_executor.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc index 4148e63d58619..f4d2f68d4d8b5 100644 --- a/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc +++ b/onnxruntime/python/torch_cpp_extensions/aten_op_executor/aten_op_executor.cc @@ -36,7 +36,7 @@ struct ATenOperator { size_t return_size; std::vector ret_kinds; - c10::IValue ToIValueArgument(const DLManagedTensor* dlpack, size_t index) const { + c10::IValue ToIValueArgument(DLManagedTensor* dlpack, size_t index) const { TORCH_INTERNAL_ASSERT(index < argument_size); bool is_optional = is_optional_arguments[index]; TORCH_INTERNAL_ASSERT(dlpack || is_optional || default_values[index] || @@ -57,7 +57,7 @@ struct ATenOperator { c10::IValue i_value; // Create the torch tensor from this DLPack no matter we need it or not below, // so that the dlpack's deleter will be triggered when torch tensor is out of scope.
- at::Tensor tensor = at::fromDLPack(const_cast(dlpack)); + at::Tensor tensor = at::fromDLPack(dlpack); switch (elem_kinds[index]) { case c10::TypeKind::TensorType: { i_value = is_optional ? c10::IValue(c10::optional(tensor)) : c10::IValue(tensor);