
Commit

lint
chenfucn committed Mar 7, 2024
1 parent 6bd5c0c commit 8ae3997
Showing 5 changed files with 96 additions and 99 deletions.
32 changes: 16 additions & 16 deletions onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc
@@ -15,14 +15,14 @@ namespace onnxruntime {
namespace contrib {
namespace cuda {

template<>
template <>
Status MatMulNBits<MLFloat16>::PrepackedGemm(
cudaStream_t stream,
const Tensor* a,
const Tensor* b,
const Tensor* scales,
const Tensor* zero_points,
Tensor* Y) const {
cudaStream_t stream,
const Tensor* a,
const Tensor* b,
const Tensor* scales,
const Tensor* zero_points,
Tensor* Y) const {
int64_t M = a->Shape()[0];
uint8_t const* zero_points_ptr = nullptr;
size_t zero_points_size = 0;
@@ -32,12 +32,12 @@ Status MatMulNBits<MLFloat16>::PrepackedGemm(
}

return blkq4_fp16_gemm_sm80_dispatch<MLFloat16>(
int(block_size_), column_wise_quant_blk_, int(M), int(N_), int(K_), stream,
a->Data<MLFloat16>(), a->Shape().Size(),
b->Data<uint8_t>(), b->Shape().Size(),
scales->Data<MLFloat16>(), scales->Shape().Size(),
zero_points_ptr, zero_points_size,
Y->MutableData<MLFloat16>(), Y->Shape().Size());
int(block_size_), column_wise_quant_blk_, int(M), int(N_), int(K_), stream,

⚠ [cpplint] GitHub Actions / Lint C++, line 35 of onnxruntime/contrib_ops/cuda/quantization/matmul_nbits.cc: Using deprecated casting style. Use static_cast<int>(...) instead [readability/casting] [4]
a->Data<MLFloat16>(), a->Shape().Size(),
b->Data<uint8_t>(), b->Shape().Size(),
scales->Data<MLFloat16>(), scales->Shape().Size(),
zero_points_ptr, zero_points_size,
Y->MutableData<MLFloat16>(), Y->Shape().Size());
}
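The cpplint annotation above flags the C-style casts that remain in this call (int(block_size_), int(M), and so on). A minimal sketch of the cast style the linter asks for, shown only for illustration and not as part of the commit:

    // C-style casts such as int(block_size_) trip [readability/casting].
    // static_cast makes each narrowing conversion explicit and searchable.
    return blkq4_fp16_gemm_sm80_dispatch<MLFloat16>(
        static_cast<int>(block_size_), column_wise_quant_blk_,
        static_cast<int>(M), static_cast<int>(N_), static_cast<int>(K_), stream,
        a->Data<MLFloat16>(), a->Shape().Size(),
        b->Data<uint8_t>(), b->Shape().Size(),
        scales->Data<MLFloat16>(), scales->Shape().Size(),
        zero_points_ptr, zero_points_size,
        Y->MutableData<MLFloat16>(), Y->Shape().Size());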

template <typename T>
@@ -59,14 +59,14 @@ Status MatMulNBits<T>::ComputeInternal(OpKernelContext* ctx) const {
// Bail out early if the output is going to be empty
if (Y->Shape().Size() == 0) return Status::OK();

if (prepack_ > 0){
if (prepack_ > 0) {
ORT_RETURN_IF(reorder_idx != nullptr,
"Internal Error: Prepacked gemm does not support reorder index. Fix the prepacking logic!");
ORT_RETURN_IF(zero_points != nullptr && zero_points->IsDataType<T>(),
"Internal Error: Prepacked gemm does not support zero points of type T. Fix the prepacking logic!");
return PrepackedGemm(
static_cast<cudaStream_t>(ctx->GetComputeStream()->GetHandle()),
a, b, scales, zero_points, Y);
static_cast<cudaStream_t>(ctx->GetComputeStream()->GetHandle()),
a, b, scales, zero_points, Y);
}

const auto* a_data = a->Data<T>();
52 changes: 25 additions & 27 deletions onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h
@@ -187,8 +187,8 @@ struct BlockwiseQuantization {
static constexpr bool ShouldRearrangeMeta = sizeof(ElementT) == 2 && QuantBlocking::kRow == 1;

static void prepack_quant_scales(
size_t rows,
size_t columns,
int rows,
int columns,
const gsl::span<ElementT const>& scales, // <- quant scales, column major layout
const gsl::span<ElementT>& scales_prepacked // <- quant scales prepacked, same size buffer
) {
@@ -345,8 +345,8 @@ struct BlockwiseQuantization {
};

static inline bool IsSm80WithWholeBlocks(
int weight_rows, [[maybe_unused]] int weight_cols,
int major, [[maybe_unused]] int minor) {
int weight_rows, [[maybe_unused]] int weight_cols,
int major, [[maybe_unused]] int minor) {
if (major < 8) {
return false;
}
@@ -364,9 +364,8 @@ static inline bool IsSm80WithWholeBlocks(
return (weight_rows % 64 == 0);
}

template<typename ElementT, int block_size, bool col_blocking>
inline
bool BlkQuantGemmSm80Supported(int weight_rows, int weight_cols, int major, int minor) {
template <typename ElementT, int block_size, bool col_blocking>
inline bool BlkQuantGemmSm80Supported(int weight_rows, int weight_cols, int major, int minor) {
using Base = BlockwiseQuantization<ElementT, block_size, 4, col_blocking>;
if (!Base::weight_dimension_supported(weight_rows, weight_cols)) {
return false;
@@ -375,26 +374,25 @@ bool BlkQuantGemmSm80Supported(int weight_rows, int weight_cols, int major, int
}

static inline bool BlkQuantGemmSm80Supported(int block_size, bool col_blocking, int weight_rows, int weight_cols, int major, int minor) {
switch (block_size)
{
case 16:
if (col_blocking) {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 16, true>(weight_rows, weight_cols, major, minor);
} else {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 16, false>(weight_rows, weight_cols, major, minor);
}
case 32:
if (col_blocking) {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 32, true>(weight_rows, weight_cols, major, minor);
} else {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 32, false>(weight_rows, weight_cols, major, minor);
}
case 64:
if (col_blocking) {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 64, true>(weight_rows, weight_cols, major, minor);
} else {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 64, false>(weight_rows, weight_cols, major, minor);
}
switch (block_size) {
case 16:
if (col_blocking) {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 16, true>(weight_rows, weight_cols, major, minor);
} else {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 16, false>(weight_rows, weight_cols, major, minor);
}
case 32:
if (col_blocking) {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 32, true>(weight_rows, weight_cols, major, minor);
} else {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 32, false>(weight_rows, weight_cols, major, minor);
}
case 64:
if (col_blocking) {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 64, true>(weight_rows, weight_cols, major, minor);
} else {
return onnxruntime::cuda::BlkQuantGemmSm80Supported<MLFloat16, 64, false>(weight_rows, weight_cols, major, minor);
}
}
return false;
}
78 changes: 39 additions & 39 deletions onnxruntime/core/optimizer/gpu_ops_prepack.cc
@@ -24,7 +24,6 @@
// 3. The logic of prepacking depends on underlying GPU
// hardware. Currently this part is hard-coded for SM80.


#include "core/graph/graph_utils.h"
#include "core/optimizer/initializer.h"
#include "core/optimizer/gpu_ops_prepack.h"
@@ -43,17 +42,17 @@ extern ProviderInfo_CUDA* TryGetProviderInfo_CUDA();
/**
* @brief Read initialized tensor from protobuf, and store it in ort_value.
* Keep in mind that ort_value is the owner of the tensor memory after calling this function.
*/
*/
inline Status GetOrtValue(const NodeArg* arg, const Graph& graph, OrtValue& ort_value) {
const ONNX_NAMESPACE::TensorProto* tensor_proto;
ORT_RETURN_IF_NOT(graph.GetInitializedTensor(arg->Name(), tensor_proto),
"Missing initializer for ", arg->Name());

const auto* path_c_str = graph.ModelPath().ToPathString().c_str();
const auto path_str = graph.ModelPath().ToPathString();

return utils::TensorProtoToOrtValue(
Env::Default(), path_c_str, *tensor_proto,
std::make_shared<CPUAllocator>(), ort_value);
Env::Default(), path_str.c_str(), *tensor_proto,
std::make_shared<CPUAllocator>(), ort_value);
}
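Besides indentation, this hunk removes a lifetime hazard: ToPathString() returns a temporary string, so taking .c_str() on it (the old path_c_str line) yields a pointer that dangles once the statement ends, whereas the new code keeps path_str alive across the TensorProtoToOrtValue call. A standalone illustration of the general C++ rule (not ORT code; the names below are made up):

    #include <string>

    std::string MakePath() { return "/tmp/model.onnx"; }  // returns by value

    void Example() {
      // Dangling: the temporary returned by MakePath() is destroyed at the end
      // of this statement, so p points at released storage afterwards.
      const char* p = MakePath().c_str();

      // Safe: bind the string to a named object that outlives every use of the pointer.
      const std::string path = MakePath();
      const char* q = path.c_str();

      (void)p;
      (void)q;
    }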

template <typename T>
@@ -65,7 +64,7 @@ inline gsl::span<T> make_span(std::string& str) {
// Prepacking logic specific to MatMulNBits<float16> on sm80
//

static inline bool IsNodeMatMulNbitsFp16(const Node& node){
static inline bool IsNodeMatMulNbitsFp16(const Node& node) {
if (!graph_utils::IsSupportedOptypeVersionAndDomain(node, "MatMulNBits", {1}, kMSDomain)) {
return false;
}
@@ -78,13 +77,13 @@

template <int block_size, bool column_quant_blk>
void Sm80BlkQ4PrepackT(
int rows, int columns,
gsl::span<const uint8_t> weights,
gsl::span<const MLFloat16> scales,
gsl::span<const uint8_t> zp,
std::string& packed_w,
std::string& packed_scales,
std::string& packed_zp) {
int rows, int columns,
gsl::span<const uint8_t> weights,
gsl::span<const MLFloat16> scales,
gsl::span<const uint8_t> zp,
std::string& packed_w,
std::string& packed_scales,
std::string& packed_zp) {
using Base = onnxruntime::cuda::BlockwiseQuantization<
MLFloat16,
block_size,
@@ -95,31 +94,31 @@ void Sm80BlkQ4PrepackT(

packed_w.resize(q_weight_shape.product() * sizeof(uint8_t));
Base::prepack_weights(
rows, columns, weights,
make_span<uint8_t>(packed_w));
rows, columns, weights,
make_span<uint8_t>(packed_w));

packed_scales.resize(meta_shape.product() * sizeof(MLFloat16));
Base::prepack_quant_scales(
rows, columns, scales,
make_span<MLFloat16>(packed_scales));
rows, columns, scales,
make_span<MLFloat16>(packed_scales));

if (!zp.empty()) {
packed_zp.resize(meta_shape.product() * sizeof(uint8_t));
Base::prepack_quant_offsets(
rows, columns, zp,
make_span<uint8_t>(packed_zp));
rows, columns, zp,
make_span<uint8_t>(packed_zp));
}
}
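Sm80BlkQ4PrepackT sizes plain std::string buffers and hands them to the prepack routines through make_span<T>; only its signature, inline gsl::span<T> make_span(std::string& str), is visible in this diff. A plausible sketch of such a helper, offered as an assumption rather than the actual ORT implementation:

    #include <gsl/span>
    #include <string>

    // Assumed behavior: view a std::string used as a raw byte buffer as a typed span of T.
    // Relies on str.size() being a multiple of sizeof(T), which the resize() calls above ensure.
    template <typename T>
    inline gsl::span<T> make_span(std::string& str) {
      return gsl::span<T>(reinterpret_cast<T*>(str.data()), str.size() / sizeof(T));
    }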

void Sm80BlkQ4Prepack(
int block_size, bool column_quant_blk,
int rows, int columns,
gsl::span<const uint8_t> weights,
gsl::span<const MLFloat16> scales,
gsl::span<const uint8_t> zp,
std::string& packed_w,
std::string& packed_scales,
std::string& packed_zp) {
int block_size, bool column_quant_blk,
int rows, int columns,
gsl::span<const uint8_t> weights,
gsl::span<const MLFloat16> scales,
gsl::span<const uint8_t> zp,
std::string& packed_w,
std::string& packed_scales,
std::string& packed_zp) {
switch (block_size) {
case 16:
if (column_quant_blk) {
@@ -161,21 +160,23 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
Status status = graph_utils::TryGetNodeAttribute(node, "prepacked", att_i);
bool prepacked = status.IsOK() ? att_i != 0 : false;
if (prepacked) {
return Status::OK(); // already prepacked, nothing to do
return Status::OK(); // already prepacked, nothing to do
}

ORT_RETURN_IF_ERROR(graph_utils::TryGetNodeAttribute<int64_t>(node, "bits", att_i));
int nbits = static_cast<int>(att_i);
int nbits = SafeInt<int>(att_i);
if (nbits != 4) {
return Status::OK(); // only support 4 bits for now
return Status::OK(); // only support 4 bits for now
}

// A single dimension can not exceed 2G yet.
ORT_RETURN_IF_ERROR(graph_utils::TryGetNodeAttribute<int64_t>(node, "K", att_i));
int k = static_cast<int>(att_i);
int k = SafeInt<int>(att_i);
ORT_RETURN_IF_ERROR(graph_utils::TryGetNodeAttribute<int64_t>(node, "N", att_i));
int n = static_cast<int>(att_i);
int n = SafeInt<int>(att_i);

ORT_RETURN_IF_ERROR(graph_utils::TryGetNodeAttribute<int64_t>(node, "block_size", att_i));
int block_size = static_cast<int>(att_i);
int block_size = SafeInt<int>(att_i);

status = graph_utils::TryGetNodeAttribute(node, "column_wise_blocking", att_i);
bool column_wise_quant_blk = status.IsOK() ? att_i != 0 : true;
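The attribute reads above change from static_cast<int>(att_i) to SafeInt<int>(att_i): instead of silently truncating a 64-bit attribute value, SafeInt throws when the value does not fit in int, which is what the "A single dimension can not exceed 2G yet" comment relies on. A small illustration of the difference, assuming the SafeInt library's default throwing policy and an ORT-style include path:

    #include <cstdint>
    #include "core/common/safeint.h"  // assumed include; ORT wraps the SafeInt library here

    void ReadDims(int64_t att_i) {
      // Silent truncation: 0x100000001 becomes 1 with no diagnostic.
      int k_unchecked = static_cast<int>(att_i);

      // Checked narrowing: throws (SafeIntException by default) when att_i does not
      // fit in int, so a bogus "K", "N", or "block_size" attribute fails loudly
      // instead of producing wrong prepack shapes.
      int k_checked = SafeInt<int>(att_i);

      (void)k_unchecked;
      (void)k_checked;
    }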
@@ -184,10 +185,10 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
ORT_ENFORCE(provider_info != nullptr, "Failed to query CUDA provider info while prepacking cuda operators.");
int major, minor;
ORT_ENFORCE(provider_info->GetCurrentGpuDeviceVersion(&major, &minor) == nullptr,
"Failed to query CUDA device version while prepacking cuda operators.");
"Failed to query CUDA device version while prepacking cuda operators.");

if (!onnxruntime::cuda::BlkQuantGemmSm80Supported(block_size, column_wise_quant_blk, k, n, major, minor)) {
return Status::OK(); // not supported
return Status::OK(); // not supported
}

//
@@ -196,7 +197,7 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
auto& node_name = node.Name();
auto& mutable_input_defs = node.MutableInputDefs();
if (mutable_input_defs.size() < 3 || mutable_input_defs.size() > 4) {
return Status::OK(); // not supported
return Status::OK(); // not supported
}

NodeArg* old_weights_arg = mutable_input_defs[1];
@@ -227,7 +228,7 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
ORT_RETURN_IF_ERROR(GetOrtValue(old_zp_arg, graph, zp_val));
Tensor* zp_tensor_ptr = zp_val.GetMutable<Tensor>();
if (!zp_tensor_ptr->IsDataType<uint8_t>()) {
return Status::OK(); // not supported
return Status::OK(); // not supported
}
zp = zp_tensor_ptr->DataAsSpan<uint8_t>();
}
@@ -289,7 +290,6 @@ Status PackMatMulNBitsFp16(Node& node, Graph& graph, bool& modified) {
return Status::OK();
}


Status GpuOpsPrepack::ApplyImpl(Graph& graph, bool& modified, int graph_level, const logging::Logger& logger) const {
GraphViewer graph_viewer(graph);
const auto& node_topology_list = graph_viewer.GetNodesInTopologicalOrder();
@@ -304,7 +304,7 @@ Status GpuOpsPrepack::ApplyImpl(Graph& graph, bool& modified, int graph_level, c
ORT_RETURN_IF_ERROR(Recurse(node, modified, graph_level, logger));

if (node.GetExecutionProviderType() != onnxruntime::kCudaExecutionProvider) {
continue; // only interested in CUDA nodes
continue; // only interested in CUDA nodes
}

// Run prepack if the node is MatMulNBits<float16>.
31 changes: 15 additions & 16 deletions onnxruntime/test/optimizer/gpu_op_prepack_test.cc
@@ -41,18 +41,18 @@ std::shared_ptr<IExecutionProvider> LoadCudaEp() {
* - the addition of cuda execution provider in the session.
* - a different location for the model checker, right after session initialization
* as the initializers will be deleted during session run.
*/
*/
void GpuPrepackTester(
const std::shared_ptr<IExecutionProvider>& cuda_ep,
const std::function<void(ModelTestBuilder& helper)>& build_test_case,
const std::function<void(InferenceSessionWrapper& session)>& check_transformed_graph,
TransformerLevel baseline_level,
TransformerLevel target_level,
int opset_version = 12,
double per_sample_tolerance = 0.001,
double relative_per_sample_tolerance = 0.001,
const std::function<void(SessionOptions&)>& add_session_options = {},
const InlinedHashSet<std::string>& disabled_optimizers = {}) {
const std::shared_ptr<IExecutionProvider>& cuda_ep,
const std::function<void(ModelTestBuilder& helper)>& build_test_case,
const std::function<void(InferenceSessionWrapper& session)>& check_transformed_graph,
TransformerLevel baseline_level,
TransformerLevel target_level,
int opset_version = 12,
double per_sample_tolerance = 0.001,
double relative_per_sample_tolerance = 0.001,
const std::function<void(SessionOptions&)>& add_session_options = {},
const InlinedHashSet<std::string>& disabled_optimizers = {}) {
// Build the model for this test.
std::unordered_map<std::string, int> domain_to_version;
domain_to_version[kOnnxDomain] = opset_version;
@@ -133,12 +133,12 @@ inline Status GetOrtValue(const NodeArg* arg, const Graph& graph, OrtValue& ort_
const auto* path_c_str = graph.ModelPath().ToPathString().c_str();

return utils::TensorProtoToOrtValue(
Env::Default(), path_c_str, *tensor_proto,
std::make_shared<CPUAllocator>(), ort_value);
Env::Default(), path_c_str, *tensor_proto,
std::make_shared<CPUAllocator>(), ort_value);
}

template <int block_size, bool columnwise_blocking, bool has_offsets>
void MatMulQ4Test(int M, int N, int K, const std::shared_ptr<IExecutionProvider>& cuda_ep){
void MatMulQ4Test(int M, int N, int K, const std::shared_ptr<IExecutionProvider>& cuda_ep) {
//
// Type definitions
//
@@ -260,7 +260,7 @@ void MatMulQ4Test(int M, int N, int K, const std::shared_ptr<IExecutionProvider>
for (size_t i = 0; i < packed_w_ref.size(); ++i) {
int expected = packed_w_ref[i];
int found = weights_data[i];
ASSERT_EQ(expected, found) << "prepacked weight mismatch index i = " << i << " shape[" << K << "," << N/2 << "]";
ASSERT_EQ(expected, found) << "prepacked weight mismatch index i = " << i << " shape[" << K << "," << N / 2 << "]";
}
}
{
Expand Down Expand Up @@ -298,7 +298,6 @@ void MatMulQ4Test(int M, int N, int K, const std::shared_ptr<IExecutionProvider>
check_graph,
TransformerLevel::Level2,
TransformerLevel::Level3);

}

TEST(GpuOpPrepackTests, MatmulNBits) {
(fifth changed file; file name not shown in this excerpt)
@@ -34,7 +34,7 @@ void testPrepack(int rows, int columns) {
col_blocking>;

EXPECT_TRUE(Base::weight_dimension_supported(rows, columns))
<< "Test setup problem, unsupported weight dimension: [" << rows << ", " << columns << "]";
<< "Test setup problem, unsupported weight dimension: [" << rows << ", " << columns << "]";

using QuantBlocking = typename Base::QuantBlocking;
using ElementW = typename Base::ElementW;
