From 8ae399720fcf32c95130edc307323e255da97ef3 Mon Sep 17 00:00:00 2001
From: Chen Fu <1316708+chenfucn@users.noreply.github.com>
Date: Thu, 7 Mar 2024 17:32:01 +0000
Subject: [PATCH] lint

---
 .../cuda/quantization/matmul_nbits.cc         | 32 ++++----
 .../core/mickey/blk_q4/f16_prepack_sm80.h     | 52 ++++++-------
 onnxruntime/core/optimizer/gpu_ops_prepack.cc | 78 +++++++++----------
 .../test/optimizer/gpu_op_prepack_test.cc     | 31 ++++----
 .../test_cases/blkq4_fp16_gemm_sm80_test.cc   |  2 +-
 5 files changed, 96 insertions(+), 99 deletions(-)

diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc
index 1d1853725aac4..3ff7c84ed8c61 100644
--- a/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc
+++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc
@@ -15,14 +15,14 @@ namespace onnxruntime {
 namespace contrib {
 namespace cuda {
 
-template<>
+template <>
 Status MatMulNBits<MLFloat16>::PrepackedGemm(
-      cudaStream_t stream,
-      const Tensor* a,
-      const Tensor* b,
-      const Tensor* scales,
-      const Tensor* zero_points,
-      Tensor* Y) const {
+    cudaStream_t stream,
+    const Tensor* a,
+    const Tensor* b,
+    const Tensor* scales,
+    const Tensor* zero_points,
+    Tensor* Y) const {
   int64_t M = a->Shape()[0];
   uint8_t const* zero_points_ptr = nullptr;
   size_t zero_points_size = 0;
@@ -32,12 +32,12 @@ Status MatMulNBits<MLFloat16>::PrepackedGemm(
   }
 
   return blkq4_fp16_gemm_sm80_dispatch<MLFloat16>(
-        int(block_size_), column_wise_quant_blk_, int(M), int(N_), int(K_), stream,
-        a->Data<MLFloat16>(), a->Shape().Size(),
-        b->Data<uint8_t>(), b->Shape().Size(),
-        scales->Data<MLFloat16>(), scales->Shape().Size(),
-        zero_points_ptr, zero_points_size,
-        Y->MutableData<MLFloat16>(), Y->Shape().Size());
+      int(block_size_), column_wise_quant_blk_, int(M), int(N_), int(K_), stream,
+      a->Data<MLFloat16>(), a->Shape().Size(),
+      b->Data<uint8_t>(), b->Shape().Size(),
+      scales->Data<MLFloat16>(), scales->Shape().Size(),
+      zero_points_ptr, zero_points_size,
+      Y->MutableData<MLFloat16>(), Y->Shape().Size());
 }
 
 template <typename T>
@@ -59,14 +59,14 @@ Status MatMulNBits<T>::ComputeInternal(OpKernelContext* ctx) const {
   // Bail out early if the output is going to be empty
   if (Y->Shape().Size() == 0) return Status::OK();
 
-  if (prepack_ > 0){
+  if (prepack_ > 0) {
     ORT_RETURN_IF(reorder_idx != nullptr,
                   "Internal Error: Prepacked gemm does not support reorder index. Fix the prepacking logic!");
     ORT_RETURN_IF(zero_points != nullptr && zero_points->IsDataType<T>(),
                   "Internal Error: Prepacked gemm does not support zero points of type T. Fix the prepacking logic!");
Fix the prepacking logic!"); return PrepackedGemm( - static_cast(ctx->GetComputeStream()->GetHandle()), - a, b, scales, zero_points, Y); + static_cast(ctx->GetComputeStream()->GetHandle()), + a, b, scales, zero_points, Y); } const auto* a_data = a->Data(); diff --git a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h index a1f9b80d7754b..26645c4785a38 100644 --- a/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h +++ b/onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h @@ -187,8 +187,8 @@ struct BlockwiseQuantization { static constexpr bool ShouldRearrangeMeta = sizeof(ElementT) == 2 && QuantBlocking::kRow == 1; static void prepack_quant_scales( - size_t rows, - size_t columns, + int rows, + int columns, const gsl::span& scales, // <- quant scales, column major layout const gsl::span& scales_prepacked // <- quant scales prepacked, same size buffer ) { @@ -345,8 +345,8 @@ struct BlockwiseQuantization { }; static inline bool IsSm80WithWholeBlocks( - int weight_rows, [[maybe_unused]] int weight_cols, - int major, [[maybe_unused]] int minor) { + int weight_rows, [[maybe_unused]] int weight_cols, + int major, [[maybe_unused]] int minor) { if (major < 8) { return false; } @@ -364,9 +364,8 @@ static inline bool IsSm80WithWholeBlocks( return (weight_rows % 64 == 0); } -template -inline -bool BlkQuantGemmSm80Supported(int weight_rows, int weight_cols, int major, int minor) { +template +inline bool BlkQuantGemmSm80Supported(int weight_rows, int weight_cols, int major, int minor) { using Base = BlockwiseQuantization; if (!Base::weight_dimension_supported(weight_rows, weight_cols)) { return false; @@ -375,26 +374,25 @@ bool BlkQuantGemmSm80Supported(int weight_rows, int weight_cols, int major, int } static inline bool BlkQuantGemmSm80Supported(int block_size, bool col_blocking, int weight_rows, int weight_cols, int major, int minor) { - switch (block_size) - { - case 16: - if (col_blocking) { - return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); - } else { - return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); - } - case 32: - if (col_blocking) { - return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); - } else { - return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); - } - case 64: - if (col_blocking) { - return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); - } else { - return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); - } + switch (block_size) { + case 16: + if (col_blocking) { + return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); + } else { + return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); + } + case 32: + if (col_blocking) { + return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); + } else { + return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); + } + case 64: + if (col_blocking) { + return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); + } else { + return onnxruntime::cuda::BlkQuantGemmSm80Supported(weight_rows, weight_cols, major, minor); + } } return false; } diff --git a/onnxruntime/core/optimizer/gpu_ops_prepack.cc b/onnxruntime/core/optimizer/gpu_ops_prepack.cc index b0219124c13de..a9adeac191428 100644 
--- a/onnxruntime/core/optimizer/gpu_ops_prepack.cc
+++ b/onnxruntime/core/optimizer/gpu_ops_prepack.cc
@@ -24,7 +24,6 @@
 // 3. The logic of prepacking depends on underlying GPU
 //    hardware. Currently this part is hard-coded for SM80.
 
-
 #include "core/graph/graph_utils.h"
 #include "core/optimizer/initializer.h"
 #include "core/optimizer/gpu_ops_prepack.h"
@@ -43,17 +42,17 @@ extern ProviderInfo_CUDA* TryGetProviderInfo_CUDA();
 /**
  * @brief Read initialized tensor from protobuf, and store it in ort_value.
  * Keep in mind that ort_value is the owner of the tensor memory after calling this function.
-*/
+ */
 inline Status GetOrtValue(const NodeArg* arg, const Graph& graph, OrtValue& ort_value) {
   const ONNX_NAMESPACE::TensorProto* tensor_proto;
   ORT_RETURN_IF_NOT(graph.GetInitializedTensor(arg->Name(), tensor_proto),
                     "Missing initializer for ", arg->Name());
 
-  const auto* path_c_str = graph.ModelPath().ToPathString().c_str();
+  const auto path_str = graph.ModelPath().ToPathString();
 
   return utils::TensorProtoToOrtValue(
-        Env::Default(), path_c_str, *tensor_proto,
-        std::make_shared<CPUAllocator>(), ort_value);
+      Env::Default(), path_str.c_str(), *tensor_proto,
+      std::make_shared<CPUAllocator>(), ort_value);
 }
 
 template <typename T>
@@ -65,7 +64,7 @@ inline gsl::span<T> make_span(std::string& str) {
 
 // Prepacking logic specific to MatMulNBits on sm80
 //
-static inline bool IsNodeMatMulNbitsFp16(const Node& node){
+static inline bool IsNodeMatMulNbitsFp16(const Node& node) {
   if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "MatMulNBits", {1}, kMSDomain)) {
     return false;
   }
@@ -78,13 +77,13 @@ static inline bool IsNodeMatMulNbitsFp16(const Node& node){
 
 template <int block_size, bool column_quant_blk>
 void Sm80BlkQ4PrepackT(
-      int rows, int columns,
-      gsl::span<const uint8_t> weights,
-      gsl::span<const MLFloat16> scales,
-      gsl::span<const uint8_t> zp,
-      std::string& packed_w,
-      std::string& packed_scales,
-      std::string& packed_zp) {
+    int rows, int columns,
+    gsl::span<const uint8_t> weights,
+    gsl::span<const MLFloat16> scales,
+    gsl::span<const uint8_t> zp,
+    std::string& packed_w,
+    std::string& packed_scales,
+    std::string& packed_zp) {
   using Base = onnxruntime::cuda::BlockwiseQuantization<
       MLFloat16,
       block_size,
@@ -95,31 +94,31 @@ void Sm80BlkQ4PrepackT(
 
   packed_w.resize(q_weight_shape.product() * sizeof(uint8_t));
   Base::prepack_weights(
-        rows, columns, weights,
-        make_span<uint8_t>(packed_w));
+      rows, columns, weights,
+      make_span<uint8_t>(packed_w));
 
   packed_scales.resize(meta_shape.product() * sizeof(MLFloat16));
   Base::prepack_quant_scales(
-        rows, columns, scales,
-        make_span<MLFloat16>(packed_scales));
+      rows, columns, scales,
+      make_span<MLFloat16>(packed_scales));
 
   if (!zp.empty()) {
     packed_zp.resize(meta_shape.product() * sizeof(uint8_t));
     Base::prepack_quant_offsets(
-          rows, columns, zp,
-          make_span<uint8_t>(packed_zp));
+        rows, columns, zp,
+        make_span<uint8_t>(packed_zp));
   }
 }
 
 void Sm80BlkQ4Prepack(
-      int block_size, bool column_quant_blk,
-      int rows, int columns,
-      gsl::span<const uint8_t> weights,
-      gsl::span<const MLFloat16> scales,
-      gsl::span<const uint8_t> zp,
-      std::string& packed_w,
-      std::string& packed_scales,
-      std::string& packed_zp) {
+    int block_size, bool column_quant_blk,
+    int rows, int columns,
+    gsl::span<const uint8_t> weights,
+    gsl::span<const MLFloat16> scales,
+    gsl::span<const uint8_t> zp,
+    std::string& packed_w,
+    std::string& packed_scales,
+    std::string& packed_zp) {
   switch (block_size) {
     case 16:
       if (column_quant_blk) {
@@ -161,21 +160,23 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
   Status status = graph_utils::TryGetNodeAttribute(node, "prepacked", att_i);
   bool prepacked = status.IsOK() ? att_i != 0 : false;
   if (prepacked) {
-    return Status::OK(); // already prepacked, nothing to do
+    return Status::OK();  // already prepacked, nothing to do
   }
 
   ORT_RETURN_IF_ERROR(graph_utils::TryGetNodeAttribute(node, "bits", att_i));
-  int nbits = static_cast<int>(att_i);
+  int nbits = SafeInt<int>(att_i);
   if (nbits != 4) {
-    return Status::OK(); // only support 4 bits for now
+    return Status::OK();  // only support 4 bits for now
   }
 
+  // A single dimension can not exceed 2G yet.
   ORT_RETURN_IF_ERROR(graph_utils::TryGetNodeAttribute(node, "K", att_i));
-  int k = static_cast<int>(att_i);
+  int k = SafeInt<int>(att_i);
   ORT_RETURN_IF_ERROR(graph_utils::TryGetNodeAttribute(node, "N", att_i));
-  int n = static_cast<int>(att_i);
+  int n = SafeInt<int>(att_i);
+
   ORT_RETURN_IF_ERROR(graph_utils::TryGetNodeAttribute(node, "block_size", att_i));
-  int block_size = static_cast<int>(att_i);
+  int block_size = SafeInt<int>(att_i);
 
   status = graph_utils::TryGetNodeAttribute(node, "column_wise_blocking", att_i);
   bool column_wise_quant_blk = status.IsOK() ? att_i != 0 : true;
@@ -184,10 +185,10 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
   ORT_ENFORCE(provider_info != nullptr, "Failed to query CUDA provider info while prepacking cuda operators.");
   int major, minor;
   ORT_ENFORCE(provider_info->GetCurrentGpuDeviceVersion(&major, &minor) == nullptr,
-                "Failed to query CUDA device version while prepacking cuda operators.");
+              "Failed to query CUDA device version while prepacking cuda operators.");
 
   if (!onnxruntime::cuda::BlkQuantGemmSm80Supported(block_size, column_wise_quant_blk, k, n, major, minor)) {
-    return Status::OK(); // not supported
+    return Status::OK();  // not supported
   }
 
   //
@@ -196,7 +197,7 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
   auto& node_name = node.Name();
   auto& mutable_input_defs = node.MutableInputDefs();
   if (mutable_input_defs.size() < 3 || mutable_input_defs.size() > 4) {
-    return Status::OK(); // not supported
+    return Status::OK();  // not supported
   }
 
   NodeArg* old_weights_arg = mutable_input_defs[1];
@@ -227,7 +228,7 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
     ORT_RETURN_IF_ERROR(GetOrtValue(old_zp_arg, graph, zp_val));
     Tensor* zp_tensor_ptr = zp_val.GetMutable<Tensor>();
     if (!zp_tensor_ptr->IsDataType<uint8_t>()) {
-      return Status::OK(); // not supported
+      return Status::OK();  // not supported
     }
     zp = zp_tensor_ptr->DataAsSpan<uint8_t>();
   }
@@ -289,7 +290,6 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
   return Status::OK();
 }
 
-
 Status GpuOpsPrepack::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
   GraphViewer graph_viewer(graph);
   const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
@@ -304,7 +304,7 @@ Status GpuOpsPrepack::ApplyImpl(Graph& graph, bool& modified, int graph_level, c
     ORT_RETURN_IF_ERROR(Recurse(node, modified, graph_level, logger));
 
     if (node.GetExecutionProviderType() != onnxruntime::kCudaExecutionProvider) {
-      continue; // only interested in CUDA nodes
+      continue;  // only interested in CUDA nodes
     }
 
     // Run prepack if the node is MatMulNBits.
diff --git a/onnxruntime/test/optimizer/gpu_op_prepack_test.cc b/onnxruntime/test/optimizer/gpu_op_prepack_test.cc
index 5fb90c59684f3..0b7c34b71543a 100644
--- a/onnxruntime/test/optimizer/gpu_op_prepack_test.cc
+++ b/onnxruntime/test/optimizer/gpu_op_prepack_test.cc
@@ -41,18 +41,18 @@ std::shared_ptr<IExecutionProvider> LoadCudaEp() {
  * - the addition of cuda execution provider in the session.
  * - a different location for the model checker, right after session initialization
  *   as the initializers will be deleted during session run.
-*/
+ */
 void GpuPrepackTester(
-      const std::shared_ptr<IExecutionProvider>& cuda_ep,
-      const std::function<void(ModelTestBuilder& helper)>& build_test_case,
-      const std::function<void(InferenceSessionWrapper& session)>& check_transformed_graph,
-      TransformerLevel baseline_level,
-      TransformerLevel target_level,
-      int opset_version = 12,
-      double per_sample_tolerance = 0.001,
-      double relative_per_sample_tolerance = 0.001,
-      const std::function<void(SessionOptions&)>& add_session_options = {},
-      const InlinedHashSet<std::string>& disabled_optimizers = {}) {
+    const std::shared_ptr<IExecutionProvider>& cuda_ep,
+    const std::function<void(ModelTestBuilder& helper)>& build_test_case,
+    const std::function<void(InferenceSessionWrapper& session)>& check_transformed_graph,
+    TransformerLevel baseline_level,
+    TransformerLevel target_level,
+    int opset_version = 12,
+    double per_sample_tolerance = 0.001,
+    double relative_per_sample_tolerance = 0.001,
+    const std::function<void(SessionOptions&)>& add_session_options = {},
+    const InlinedHashSet<std::string>& disabled_optimizers = {}) {
   // Build the model for this test.
   std::unordered_map<std::string, int> domain_to_version;
   domain_to_version[kOnnxDomain] = opset_version;
@@ -133,12 +133,12 @@ inline Status GetOrtValue(const NodeArg* arg, const Graph& graph, OrtValue& ort_
   const auto* path_c_str = graph.ModelPath().ToPathString().c_str();
 
   return utils::TensorProtoToOrtValue(
-        Env::Default(), path_c_str, *tensor_proto,
-        std::make_shared<CPUAllocator>(), ort_value);
+      Env::Default(), path_c_str, *tensor_proto,
+      std::make_shared<CPUAllocator>(), ort_value);
 }
 
 template
-void MatMulQ4Test(int M, int N, int K, const std::shared_ptr<IExecutionProvider>& cuda_ep){
+void MatMulQ4Test(int M, int N, int K, const std::shared_ptr<IExecutionProvider>& cuda_ep) {
   //
   // Type definitions
   //
@@ -260,7 +260,7 @@ void MatMulQ4Test(int M, int N, int K, const std::shared_ptr
     for (size_t i = 0; i < packed_w_ref.size(); ++i) {
       int expected = packed_w_ref[i];
       int found = weights_data[i];
-      ASSERT_EQ(expected, found) << "prepacked weight mismatch index i = " << i << " shape[" << K << "," << N/2 << "]";
+      ASSERT_EQ(expected, found) << "prepacked weight mismatch index i = " << i << " shape[" << K << "," << N / 2 << "]";
     }
   }
   {
@@ -298,7 +298,6 @@ void MatMulQ4Test(int M, int N, int K, const std::shared_ptr
       check_graph,
       TransformerLevel::Level2,
       TransformerLevel::Level3);
-
 }
 
 TEST(GpuOpPrepackTests, MatmulNBits) {
diff --git a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
index 0f25769e24e96..6092ba9ff098a 100644
--- a/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
+++ b/onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc
@@ -34,7 +34,7 @@ void testPrepack(int rows, int columns) {
       col_blocking>;
 
   EXPECT_TRUE(Base::weight_dimension_supported(rows, columns))
-    << "Test setup problem, unsupported weight dimension: [" << rows << ", " << columns << "]";
+      << "Test setup problem, unsupported weight dimension: [" << rows << ", " << columns << "]";
 
   using QuantBlocking = typename Base::QuantBlocking;
   using ElementW = typename Base::ElementW;
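
For context, the prepacking flow that Sm80BlkQ4PrepackT in this patch drives can be read as the standalone sketch below. This is illustrative only and not part of the patch: the shape helpers get_quant_weights_shape/get_quant_meta_shape, the local to_span helper, and the exact BlockwiseQuantization template-argument list (element type, block size, 4 bits, column-wise flag) are assumptions inferred from the usage visible above; a fixed fp16, block-size-64, column-wise configuration is hard-coded for brevity.

// Illustrative sketch only -- not part of the patch above.
// Sizes the destination buffers from the prepacked shapes, then rewrites the
// quantized weights and quantization scales into the layout the SM80 kernel expects.
#include <cstdint>
#include <string>

#include "core/framework/float16.h"
#include "core/mickey/blk_q4/f16_prepack_sm80.h"
#include "gsl/gsl"

namespace {

// Hypothetical local helper mirroring make_span in gpu_ops_prepack.cc:
// reinterpret a std::string buffer as a typed span.
template <typename T>
gsl::span<T> to_span(std::string& buffer) {
  return gsl::span<T>(reinterpret_cast<T*>(buffer.data()), buffer.size() / sizeof(T));
}

void Sm80PrepackSketch(int rows, int columns,
                       gsl::span<const uint8_t> weights,                // 4-bit weights, two per byte
                       gsl::span<const onnxruntime::MLFloat16> scales,  // one scale per quant block
                       std::string& packed_w,
                       std::string& packed_scales) {
  // Fixed configuration for the sketch: fp16 element type, block_size 64, 4 bits, column-wise blocking.
  using Base = onnxruntime::cuda::BlockwiseQuantization<onnxruntime::MLFloat16, 64, 4, true>;

  // Destination sizes come from the prepacked shapes (helper names assumed).
  const auto q_weight_shape = Base::get_quant_weights_shape(rows, columns);
  const auto meta_shape = Base::get_quant_meta_shape(rows, columns);

  // Rearrange the packed 4-bit weights into the GPU-friendly layout.
  packed_w.resize(q_weight_shape.product() * sizeof(uint8_t));
  Base::prepack_weights(rows, columns, weights, to_span<uint8_t>(packed_w));

  // Rearrange the per-block scales to match the prepacked weight layout.
  packed_scales.resize(meta_shape.product() * sizeof(onnxruntime::MLFloat16));
  Base::prepack_quant_scales(rows, columns, scales, to_span<onnxruntime::MLFloat16>(packed_scales));
}

}  // namespace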