From 2ad56898da586603aae076e7f0ec8b486ae428de Mon Sep 17 00:00:00 2001
From: Jambay Kinley <jambaykinley@microsoft.com>
Date: Mon, 23 Oct 2023 22:27:48 +0000
Subject: [PATCH] Fix parameter-list indentation in bnb4 CUDA templates

---
 .../quantization/dequantize_blockwise_bnb4.cu | 14 +++++------
 .../cuda/quantization/matmul_bnb4.cu          | 24 +++++++++----------
 2 files changed, 19 insertions(+), 19 deletions(-)
diff --git a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cu b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cu
index e1236d4119c6d..12c956fee7488 100644
--- a/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cu
+++ b/onnxruntime/contrib_ops/cuda/quantization/dequantize_blockwise_bnb4.cu
@@ -111,13 +111,13 @@ template Status DequantizeBnb4<float>(
     cudaStream_t stream);
 
 template Status DequantizeBnb4<half>(
-  const half* quant_map, 
-  half *output, 
-  const uint8_t *quant_data, 
-  const half *absmax, 
-  int block_size, 
-  int numel, 
-  cudaStream_t stream);
+    const half* quant_map, 
+    half *output, 
+    const uint8_t *quant_data, 
+    const half *absmax, 
+    int block_size, 
+    int numel, 
+    cudaStream_t stream);
 
 }  // namespace cuda
 }  // namespace contrib
diff --git a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cu b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cu
index ce5288948c5d4..a5eeed82e2bb4 100644
--- a/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cu
+++ b/onnxruntime/contrib_ops/cuda/quantization/matmul_bnb4.cu
@@ -13,18 +13,18 @@ namespace cuda {
 #define num_values_4bit 32
 template <typename T, int THREADS, int BITS>
 __global__ void kgemm_4bit_inference_naive(
-  int M,
-  int N,
-  int K,
-  const T* __restrict__ A,
-  const uint8_t *B,
-  const T *absmax,
-  const T *datatype,
-  T * out,
-  int lda,
-  int ldb,
-  int ldc,
-  int block_size) {
+    int M,
+    int N,
+    int K,
+    const T* __restrict__ A,
+    const uint8_t *B,
+    const T *absmax,
+    const T *datatype,
+    T * out,
+    int lda,
+    int ldb,
+    int ldc,
+    int block_size) {
   // per threadblock:
   // load step-by-step in chunks of [32,warps]: 1x32 * [32,warps] -> [1,warps]
   // 4 warps -> 4 loads per iter