Skip to content

Commit

Permalink
Build issues
Browse files Browse the repository at this point in the history
Fix cuda_test_provider gtest argument stack corruption
  • Loading branch information
yuslepukhin committed Mar 22, 2024
1 parent 3076b56 commit 516f780
Show file tree
Hide file tree
Showing 14 changed files with 153 additions and 145 deletions.
2 changes: 1 addition & 1 deletion cmake/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,7 @@ option(onnxruntime_USE_CUDA "Build with CUDA support" OFF)
# Enable ONNX Runtime CUDA EP's internal unit tests that directly access the EP's internal functions instead of through
# OpKernels. When the option is ON, we will have two copies of GTest library in the same process. It is not a typical
# use. If you hit any problem with that, please do not report it to GTest. Turn OFF the following build option instead.
cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS;LINUX" OFF)
cmake_dependent_option(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS "Build with CUDA unit tests" OFF "onnxruntime_USE_CUDA;onnxruntime_BUILD_UNIT_TESTS" OFF)

option(onnxruntime_USE_CUDA_NHWC_OPS "Build CUDA with NHWC op support" OFF)
option(onnxruntime_CUDA_MINIMAL "Build CUDA without any operations apart from memcpy ops. Useful for a very minimal TRT build" OFF)
Expand Down
2 changes: 1 addition & 1 deletion cmake/onnxruntime_providers_cuda.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@
endif()
if(onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
# cuda_provider_interface.cc is removed from the object target: onnxruntime_providers_cuda_obj and
# add to the lib onnxruntime_providers_cuda separately.
# added to the lib onnxruntime_providers_cuda separately.

Check warning on line 125 in cmake/onnxruntime_providers_cuda.cmake

View workflow job for this annotation

GitHub Actions / Optional Lint

[misspell] reported by reviewdog 🐶 "separatedly" is a misspelling of "separately" Raw Output: ./cmake/onnxruntime_providers_cuda.cmake:125:50: "separatedly" is a misspelling of "separately"
# onnxruntime_providers_cuda_ut can share all the object files with onnxruntime_providers_cuda except cuda_provider_interface.cc.
set(cuda_provider_interface_src ${ONNXRUNTIME_ROOT}/core/providers/cuda/cuda_provider_interface.cc)
list(REMOVE_ITEM onnxruntime_providers_cuda_src ${cuda_provider_interface_src})
Expand Down
7 changes: 7 additions & 0 deletions cmake/onnxruntime_unittests.cmake
Original file line number Diff line number Diff line change
Expand Up @@ -779,6 +779,13 @@ if (onnxruntime_ENABLE_CUDA_EP_INTERNAL_TESTS)
onnxruntime_add_include_to_target(onnxruntime_providers_cuda_ut GTest::gtest GTest::gmock)
target_include_directories(onnxruntime_providers_cuda_ut PRIVATE ${ONNXRUNTIME_ROOT}/core/mickey)
target_link_libraries(onnxruntime_providers_cuda_ut PRIVATE GTest::gtest GTest::gmock ${ONNXRUNTIME_MLAS_LIBS} onnxruntime_common)
if (MSVC)
# Cutlass code has an issue with the following:
# warning C4100: 'magic': unreferenced formal parameter
target_compile_options(onnxruntime_providers_cuda_ut PRIVATE "$<$<COMPILE_LANGUAGE:CUDA>:SHELL:--compiler-options /wd4100>"
"$<$<NOT:$<COMPILE_LANGUAGE:CUDA>>:/wd4100>")
endif()

list(APPEND onnxruntime_test_providers_dependencies onnxruntime_providers_cuda_ut)
endif()

Expand Down
2 changes: 1 addition & 1 deletion include/onnxruntime/core/framework/execution_provider.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ struct NodeComputeInfo {
DestroyFunctionStateFunc release_state_func;
};

using RunOptions = OrtRunOptions;
using RunOptions = ::OrtRunOptions;

enum class DataLayout {
NCHW,
Expand Down
2 changes: 1 addition & 1 deletion include/onnxruntime/core/framework/run_options.h
Original file line number Diff line number Diff line change
Expand Up @@ -45,5 +45,5 @@ struct OrtRunOptions {
};

namespace onnxruntime {
using RunOptions = OrtRunOptions;
using RunOptions = ::OrtRunOptions;
} // namespace onnxruntime
16 changes: 8 additions & 8 deletions onnxruntime/core/mickey/blk_q4/f16_prepack_sm80.h
Original file line number Diff line number Diff line change
Expand Up @@ -110,8 +110,8 @@ struct BlockwiseQuantization {
static void prepack_weights(
int rows,
int columns,
const gsl::span<uint8_t const>& weights, // <- int4 weights, column major
const gsl::span<uint8_t>& weights_prepacked // <- int4 prepacked weights tensor, same size buffer
gsl::span<uint8_t const> weights, // <- int4 weights, column major
gsl::span<uint8_t> weights_prepacked // <- int4 prepacked weights tensor, same size buffer
) {
ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0 &&
(rows % QuantBlocking::kRow) == 0 &&
Expand Down Expand Up @@ -171,10 +171,10 @@ struct BlockwiseQuantization {
static void prepack_quant_scales(
size_t rows,
size_t columns,
const gsl::span<ElementT const>& scales, // <- quant scales, column major layout
const gsl::span<ElementT>& scales_prepacked // <- quant scales prepacked, same size buffer
gsl::span<ElementT const> scales, // <- quant scales, column major layout
gsl::span<ElementT> scales_prepacked // <- quant scales prepacked, same size buffer
) {
auto meta_shape = get_quant_meta_shape(rows, columns);
auto meta_shape = get_quant_meta_shape(static_cast<int>(rows), static_cast<int>(columns));
ORT_ENFORCE(scales.size() == size_t(meta_shape.product()),
"Quantization scale tensor shape mismatch!");
ORT_ENFORCE(scales_prepacked.size() == size_t(meta_shape.product()),
Expand Down Expand Up @@ -241,10 +241,10 @@ struct BlockwiseQuantization {
static void prepack_quant_offsets(
size_t rows,
size_t columns,
const gsl::span<uint8_t const>& offsets, // <- quant offsets, int4, column major layout
const gsl::span<uint8_t>& offsets_prepacked // <- quant offsets prepacked, double size buffer
gsl::span<uint8_t const> offsets, // <- quant offsets, int4, column major layout
gsl::span<uint8_t> offsets_prepacked // <- quant offsets prepacked, double size buffer
) {
auto meta_shape = get_quant_meta_shape(rows, columns);
auto meta_shape = get_quant_meta_shape(static_cast<int>(rows), static_cast<int>(columns));

ORT_ENFORCE((rows % 16) == 0 && (columns % 16) == 0,
"Does not support odd number of rows or columns!");
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -132,7 +132,7 @@ struct DummyType{
}

CUTLASS_HOST_DEVICE
std::monostate& operator[](int idx) {
std::monostate& operator[](int /*idx */) {
return dummy_;
}
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -437,7 +437,7 @@ class QuantBMetaMmaTensorOpTileIterator<WarpShapeB_, BlockingShape_,

CUTLASS_HOST_DEVICE
static void dequant(FragmentScale const &scales,
FragmentOffset const &offsets,
FragmentOffset const &fragment_offsets,
Array<uint8_t,kExpandedSize/2> const &weights,
Array<ElementScale, kExpandedSize>& dest){
static_assert(kNumBsPerCoreTileFragement == 2, "Only for 16b gemm.");
Expand All @@ -453,19 +453,18 @@ class QuantBMetaMmaTensorOpTileIterator<WarpShapeB_, BlockingShape_,

uint32_t* dest_pair = reinterpret_cast<uint32_t*>(dest.data());
const b64* scales_ptr = reinterpret_cast<const b64*>(scales.data());
const ElementOffset* offsets_ptr = nullptr;
if constexpr(kHasOffset) { offsets_ptr = offsets.data(); }
[[maybe_unused]] const ElementOffset* fragment_offsets_ptr = nullptr;
if constexpr(kHasOffset) { fragment_offsets_ptr = fragment_offsets.data(); }

CUTLASS_PRAGMA_UNROLL
for (int n_idx = 0; n_idx < kMmaIterations; n_idx++){
// dequantize: d = scale * (weight - offset)
// to use FMA, d = scale * weight + (scale * (-offset))

b64 offsets;
if constexpr(kHasOffset){
const uint32_t* p = reinterpret_cast<const uint32_t*>(offsets_ptr);

[[maybe_unused]] b64 offsets{0};
if constexpr(kHasOffset) {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
const uint32_t* p = reinterpret_cast<const uint32_t*>(fragment_offsets_ptr);
asm volatile(
"{\n\t"
" .reg .b32 rb0, rb1;\n" // b32 regs for fp16x2 mul operands
Expand All @@ -486,7 +485,7 @@ class QuantBMetaMmaTensorOpTileIterator<WarpShapeB_, BlockingShape_,
assert(0);
#endif

offsets_ptr += 4;
fragment_offsets_ptr += 4;
} else {
#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800))
asm volatile(
Expand Down Expand Up @@ -541,7 +540,7 @@ class QuantBMetaMmaTensorOpTileIterator<WarpShapeB_, BlockingShape_,
int idx = elem_idx + mma_tile_idx * kCoreTileFragementSize + n_idx * kCoreTileFragementSize * kTilesPerMma;
ElementScale s = scales[idx];
if constexpr(kHasOffset){
offset = s * static_cast<ElementScale>(-16 - int(offsets[idx]));
offset = s * static_cast<ElementScale>(-16 - static_cast<int>(fragment_offsets[idx]));
} else {
offset = s * static_cast<ElementScale>(-16-8);
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -394,14 +394,6 @@ struct ConfigOptions final {
PROVIDER_DISALLOW_ALL(ConfigOptions)
};

// Provider-bridge view of OrtRunOptions: forwards config-option lookups through
// the g_host indirection so provider shared libraries can query run options
// without linking the core implementation.
struct OrtRunOptions final {
// Returns the ConfigOptions associated with this run-options object,
// resolved via the host bridge (g_host).
const ConfigOptions& GetConfigOptions() const {
return g_host->RunOptions__GetConfigOptions(this);
}

// Non-constructible/non-copyable in the provider bridge; instances are only
// ever accessed through pointers handed over by the host.
PROVIDER_DISALLOW_ALL(OrtRunOptions)
};

struct ComputeCapability final {
static std::unique_ptr<ComputeCapability> Create(std::unique_ptr<IndexedSubGraph> t_sub_graph) { return g_host->ComputeCapability__construct(std::move(t_sub_graph)); }
static void operator delete(void* p) { g_host->ComputeCapability__operator_delete(reinterpret_cast<ComputeCapability*>(p)); }
Expand Down Expand Up @@ -1283,3 +1275,10 @@ template <>
inline gsl::span<const int64_t> Tensor::DataAsSpan() const { return g_host->Tensor__DataAsSpan_int64(this); }

} // namespace onnxruntime

// Provider-bridge view of OrtRunOptions, declared at global scope (outside the
// onnxruntime namespace — see the namespace close above it) so that it matches
// the real ::OrtRunOptions name used by `using RunOptions = ::OrtRunOptions;`.
struct OrtRunOptions final {
// Returns the ConfigOptions associated with this run-options object,
// resolved via the host bridge (onnxruntime::g_host).
const onnxruntime::ConfigOptions& GetConfigOptions() const {
return onnxruntime::g_host->RunOptions__GetConfigOptions(this);
}
// Non-constructible/non-copyable in the provider bridge; instances are only
// ever accessed through pointers handed over by the host.
PROVIDER_DISALLOW_ALL(OrtRunOptions)
};
Original file line number Diff line number Diff line change
Expand Up @@ -16,10 +16,11 @@

#include <random>

#include "core/util/matrix_layout.h"
#include "core/common/common.h"
#include "core/mickey/blk_q4/f16_prepack_sm80.h"
#include "core/util/matrix_layout.h"
#include "test/cuda_host/blkq4_fp16_quant_sm80.h"
#include <thrust/host_vector.h>

Check warning on line 23 in onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h

View workflow job for this annotation

GitHub Actions / Lint C++

[cpplint] reported by reviewdog 🐶 Found C system header after other header. Should be: blkq4_fp16_gemm_sm80.h, c system, c++ system, other. [build/include_order] [4] Raw Output: onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80.h:23: Found C system header after other header. Should be: blkq4_fp16_gemm_sm80.h, c system, c++ system, other. [build/include_order] [4]

namespace onnxruntime {
namespace cuda {
Expand Down Expand Up @@ -48,10 +49,10 @@ Status sm80_supported();
template <typename ElementT, int block_size, bool col_blocking, bool has_offsets>
inline void blkq4_weights_gen(
int rows, int columns,
std::vector<ElementT>& dequants,
std::vector<uint8_t>& q_weights,
std::vector<ElementT>& q_scales,
std::vector<uint8_t>& q_zp) {
thrust::host_vector<ElementT>& dequants,
thrust::host_vector<uint8_t>& q_weights,
thrust::host_vector<ElementT>& q_scales,
thrust::host_vector<uint8_t>& q_zp) {
using Base = onnxruntime::cuda::BlockwiseQuantization<
ElementT,
block_size,
Expand Down Expand Up @@ -120,9 +121,9 @@ inline void blkq4_weights_gen(

q_scales.resize(meta_shape.product());
for (size_t i = 0; i < q_scales.size(); i++) {
uint32_t v = dis(gen);
uint32_t m = (v % 63) + 1;
uint32_t e = (v >> 6) % 4;
uint32_t vl = dis(gen);
uint32_t m = (vl % 63) + 1;
uint32_t e = (vl >> 6) % 4;
q_scales[i] = ElementT(m / static_cast<float>(1 << (2 + e)));
}
MatrixRef<ElementT, ColumnMajorLayout, true> tensor_scale(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,12 +11,13 @@
* well with CUTLASS headers.
*/

#include "blkq4_fp16_gemm_sm80.h"

Check warning on line 14 in onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc

View workflow job for this annotation

GitHub Actions / Lint C++

[cpplint] reported by reviewdog 🐶 Include the directory when naming header files [build/include_subdir] [4] Raw Output: onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc:14: Include the directory when naming header files [build/include_subdir] [4]
#include <random>

#include "core/framework/float16.h"
#include "core/mlas/inc/mlas_q4.h"

#include "blkq4_fp16_gemm_sm80.h"
#include <thrust/host_vector.h>

Check warning on line 20 in onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc

View workflow job for this annotation

GitHub Actions / Lint C++

[cpplint] reported by reviewdog 🐶 Found C system header after other header. Should be: blkq4_fp16_gemm_sm80_test.h, c system, c++ system, other. [build/include_order] [4] Raw Output: onnxruntime/test/providers/cuda/test_cases/blkq4_fp16_gemm_sm80_test.cc:20: Found C system header after other header. Should be: blkq4_fp16_gemm_sm80_test.h, c system, c++ system, other. [build/include_order] [4]

#include "gtest/gtest.h"

Expand All @@ -43,10 +44,10 @@ void testPrepack(int rows, int columns) {
const auto meta_shape = Base::get_quant_meta_shape(rows, columns);
const auto zp_shape = make_Position((meta_shape[0] + 1) / 2, meta_shape[1]);

std::vector<ElementW> q_weights;
std::vector<ElementT> q_scales;
std::vector<ElementQOffset> q_zp;
std::vector<ElementT> dequants;
thrust::host_vector<ElementW> q_weights;
thrust::host_vector<ElementT> q_scales;
thrust::host_vector<ElementQOffset> q_zp;
thrust::host_vector<ElementT> dequants;
onnxruntime::cuda::test::blkq4_weights_gen<ElementT, block_size, col_blocking, has_offset>(
rows, columns, dequants, q_weights, q_scales, q_zp);

Expand Down
Loading

0 comments on commit 516f780

Please sign in to comment.