From 2c20960bb0b6098a927db656a5826994c165f8a2 Mon Sep 17 00:00:00 2001 From: Dmitri Smirnov Date: Tue, 26 Mar 2024 12:11:19 -0700 Subject: [PATCH] Fix code definciency --- .../q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h index e4e3b9814bd50..26239161cf8a3 100644 --- a/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h +++ b/onnxruntime/core/mickey/cutlass_ext/q4gemm/warp/quantb_meta_mma_tensor_op_tile_iterator.h @@ -794,13 +794,13 @@ class QuantBMetaMmaTensorOpTileIterator(scales.data()); - uint32_t* addon_ptr = reinterpret_cast(addon); - if constexpr (kHasOffset){ +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + const uint32_t* scales_ptr = reinterpret_cast(scales.data()); + uint32_t* addon_ptr = reinterpret_cast(addon); // possible buffer over read 2 bytes here. const uint32_t* p = reinterpret_cast(offsets.data()); -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)) + asm volatile( "{\n\t" " .reg .b32 rb0, rb1, rb2;\n"