
Add matmul int4 for CUDA #17526

Closed
wants to merge 28 commits
Changes from 1 commit
28 commits
2b96e30
int4 support on GPU
yufenglee Sep 6, 2023
7be5564
change quant tool
yufenglee Sep 11, 2023
2d8c8f9
use fp32 as accumulator
yufenglee Sep 12, 2023
19ecb96
refine the matmul_int4 kernel
yufenglee Sep 12, 2023
0a01dce
refine the matmul_int4 kernel
yufenglee Sep 12, 2023
74c6b80
refine benchmark tool
yufenglee Sep 18, 2023
0c3f6c5
refine the dequant int4
yufenglee Sep 18, 2023
155d4b2
optimize dequant
yufenglee Sep 25, 2023
3ff52ed
refine quant tool
yufenglee Sep 26, 2023
453207f
add pybind for blockwise quant
yufenglee Sep 27, 2023
1c7f9d5
fix build breaks
yufenglee Sep 27, 2023
7eba97e
refine quant tool
yufenglee Sep 27, 2023
dda154f
fix fp16
yufenglee Sep 27, 2023
2050392
add unit test for QuantBlockwise pybind
yufenglee Sep 28, 2023
c015d4d
refine the quant tool
yufenglee Sep 28, 2023
2034ac9
add option to exclude logit layer
yufenglee Sep 28, 2023
8d972dd
handle subgraph properly
yufenglee Oct 2, 2023
78d0f02
fix build break
yufenglee Oct 6, 2023
068f4db
change matmul 4bits name
yufenglee Sep 29, 2023
fef4216
change zp to 4bits
yufenglee Oct 5, 2023
3375903
revert change in matmul_weight4_quantizer.py
yufenglee Oct 9, 2023
4d8dbc0
format code
yufenglee Oct 9, 2023
9585dca
fix build/test breaks
yufenglee Oct 9, 2023
68191b7
fix build break
yufenglee Oct 9, 2023
bccf3d7
fix test failures in training pipeline
yufenglee Oct 9, 2023
a3abfcd
fix build break in training CIs
yufenglee Oct 10, 2023
4e29283
fix training CIs
yufenglee Oct 10, 2023
d4e4145
fix training CIs
yufenglee Oct 11, 2023
optimize dequant
yufenglee committed Oct 10, 2023
commit 155d4b2611e33634867cbec70dc461858618c7df
28 changes: 17 additions & 11 deletions onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise.cu
@@ -5,6 +5,7 @@
 #include <cub/cub.cuh>
 #include <cublas_v2.h>
 #include <cuda_fp16.h>
+#include <cmath>
 #include <math_constants.h>
 #include "core/providers/cuda/cu_inc/common.cuh"
 #include "core/providers/cuda/cuda_common.h"
@@ -21,12 +22,13 @@ __device__ __forceinline__ void DequantizeEightElements(uint32_t values_quant, h
   half2 scale_half2 = {scale, scale};
   half zp_adjust = -scale * __short2half_rn(zp);
   half2 zp_adjust2 = {zp_adjust, zp_adjust};
-  half2* output_half2 = reinterpret_cast<half2*>(output);
 
-  output_half2[0] = __hfma2(__halves2half2(__uint2half_rn(values_quant & 0xF), __uint2half_rn((values_quant >> 4) & 0xF)), scale_half2, zp_adjust2);
-  output_half2[1] = __hfma2(__halves2half2(__uint2half_rn((values_quant >> 8) & 0xF), __uint2half_rn((values_quant >> 12) & 0xF)), scale_half2, zp_adjust2);
-  output_half2[2] = __hfma2(__halves2half2(__uint2half_rn((values_quant >> 16) & 0xF), __uint2half_rn((values_quant >> 20) & 0xF)), scale_half2, zp_adjust2);
-  output_half2[3] = __hfma2(__halves2half2(__uint2half_rn((values_quant >> 24) & 0xF), __uint2half_rn((values_quant >> 28) & 0xF)), scale_half2, zp_adjust2);
+  alignas(16) half2 results[4];
+  results[0] = __hfma2(__halves2half2(__uint2half_rn(values_quant & 0xF), __uint2half_rn((values_quant >> 4) & 0xF)), scale_half2, zp_adjust2);
+  results[1] = __hfma2(__halves2half2(__uint2half_rn((values_quant >> 8) & 0xF), __uint2half_rn((values_quant >> 12) & 0xF)), scale_half2, zp_adjust2);
+  results[2] = __hfma2(__halves2half2(__uint2half_rn((values_quant >> 16) & 0xF), __uint2half_rn((values_quant >> 20) & 0xF)), scale_half2, zp_adjust2);
+  results[3] = __hfma2(__halves2half2(__uint2half_rn((values_quant >> 24) & 0xF), __uint2half_rn((values_quant >> 28) & 0xF)), scale_half2, zp_adjust2);
+  *(reinterpret_cast<float4*>(output)) = *(reinterpret_cast<float4*>(results));
 }
 
 __device__ __forceinline__ void DequantizeEightElements(uint32_t values_quant, float scale, uint8_t zp, float* output) {
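The change in the half path above replaces four separate 32-bit `half2` stores through `output_half2` with one 128-bit store: the eight dequantized values are staged in a 16-byte-aligned local array and then written through a single `float4` reinterpret. A minimal sketch of that store-widening pattern (`StoreEightHalves` is a hypothetical helper name, not code from this PR; it assumes `out` is 16-byte aligned):

```cuda
#include <cuda_fp16.h>

// Sketch: coalesce four 32-bit half2 stores into one 128-bit store.
__device__ __forceinline__ void StoreEightHalves(const half2 (&vals)[4], half* out) {
  alignas(16) half2 tmp[4];  // staging buffer with the alignment float4 requires
  tmp[0] = vals[0];
  tmp[1] = vals[1];
  tmp[2] = vals[2];
  tmp[3] = vals[3];
  // One 128-bit transaction instead of four 32-bit ones.
  *reinterpret_cast<float4*>(out) = *reinterpret_cast<const float4*>(tmp);
}
```

The `alignas(16)` staging array is what allows the single vectorized store; without the alignment guarantee, the `float4` reinterpret would be undefined behavior.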
@@ -48,12 +50,13 @@ __global__ void Dequantize4BitsKernel(
     const T* scale_data,
     const uint8_t* zero_points,
     int block_size,
-    int blocks_per_tb) {
-  int block_id = blockIdx.x * blocks_per_tb + (threadIdx.x * 8) / block_size;
-  int element_offset = block_id * block_size + (threadIdx.x * 8) % block_size;
+    int blocks_per_tb,
+    int shift) {
+  int block_id = blockIdx.x * blocks_per_tb + ((threadIdx.x * 8) >> shift);
+  int element_offset = block_id * block_size + ((threadIdx.x * 8) & ((1 << shift) - 1));
   uint32_t quant_value = *(reinterpret_cast<const uint32_t*>(quant_data + element_offset / 2));
   T scale = *(scale_data + block_id);
-  T zero_point = static_cast<T>(zero_points ? float(zero_points[block_id]) : 8.f);
+  T zero_point = static_cast<T>(zero_points ? zero_points[block_id] : (uint8_t)(8));
 
   output = output + element_offset;
   DequantizeEightElements(quant_value, scale, zero_point, output);
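Two things change in the kernel body. First, since `block_size` is a power of two, the divide and modulo used to locate a thread's quantization block are replaced by a shift and a mask, with `shift = log2(block_size)` computed once on the host. Second, the default zero point is now the integral `(uint8_t)(8)` rather than the float literal `8.f`, so `static_cast<T>` converts directly from an integer instead of round-tripping through float. A small host-side check of the shift/mask equivalence (illustrative only, under the power-of-two assumption):

```cpp
#include <cassert>

// For block_size == 1 << shift and idx >= 0, shift and mask reproduce
// integer divide and modulo exactly.
void CheckIndexMath(int idx, int shift) {
  int block_size = 1 << shift;
  assert(idx / block_size == (idx >> shift));
  assert(idx % block_size == (idx & (block_size - 1)));
}
```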
@@ -70,17 +73,20 @@ Status Dequantize4Bits(
     int block_size,
     cudaStream_t stream) {
   ORT_ENFORCE(k % block_size == 0, "k must be a multiplier of block_size");
-  int blocks_per_tb = GridDim::maxThreadsPerBlock * 8 / block_size;
+  constexpr int element_per_thread = 8;
+  int blocks_per_tb = GridDim::maxThreadsPerBlock * element_per_thread / block_size;
   int k_blocks = k / block_size;
   int blocks_per_grid = static_cast<int>(CeilDiv(n * k_blocks, blocks_per_tb));
+  int shift = static_cast<int>(log2f(block_size));
 
   Dequantize4BitsKernel<<<blocks_per_grid, GridDim::maxThreadsPerBlock, 0, stream>>>(
       output,
       quant_data,
       scales_data,
       zero_points,
       block_size,
-      blocks_per_tb);
+      blocks_per_tb,
+      shift);
 
   return Status::OK();
 }
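This host-side hunk is why `<cmath>` was added at the top of the file: `shift` is derived once per launch with `log2f`. The result is exact only when `block_size` is a power of two; the existing `ORT_ENFORCE` checks that `k` is divisible by `block_size` but not power-of-two-ness. A float-free alternative with an explicit guard (a sketch of a defensive variant, not something this PR adds):

```cpp
#include <cassert>

// Integer-only shift derivation with an explicit power-of-two check.
int ShiftForBlockSize(int block_size) {
  assert(block_size > 0 && (block_size & (block_size - 1)) == 0);  // power of two
  int shift = 0;
  while ((1 << shift) < block_size) ++shift;
  return shift;
}
```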
(second changed file: a Python benchmark script, filename not shown)
@@ -43,7 +43,7 @@ def profile_dequantize_int4_func(n, k, dtype, func):
     f = getattr(ke, func)
     my_op = f(output_d, quant_d, scales_d, n, k)
     duration_ms = my_op.Profile()
-    total_bytes = 2 * (n * k) * (dtype_to_bytes(dtype))
+    total_bytes = (n * k) / 2 + (n * k + n * k / 32) * dtype_to_bytes(dtype)
 
     ke.report(DequantizeInt4Metric(func, dtype, duration_ms, total_bytes, n, k))
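The old estimate counted 2 * n * k elements at the output dtype's width; the new one models the actual traffic of the 4-bit path: n * k / 2 bytes of packed int4 weights read, n * k dequantized elements written, and n * k / 32 scales read, which bakes in a block size of 32. A host-side sketch of the same accounting (`TotalBytes` and `dtype_bytes` are illustrative names, assuming block_size == 32):

```cpp
#include <cstddef>

// Bytes moved by blockwise int4 dequantization, assuming block_size == 32.
size_t TotalBytes(size_t n, size_t k, size_t dtype_bytes) {
  size_t weights_read = n * k / 2;                   // packed int4: two values per byte
  size_t scales_read  = (n * k / 32) * dtype_bytes;  // one scale per 32-element block
  size_t output_write = n * k * dtype_bytes;         // dequantized output
  return weights_read + scales_read + output_write;
}
```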