Commit: lint
wangyems committed Mar 22, 2024
1 parent abec9b5 commit 379dffa
Showing 24 changed files with 3,353 additions and 3,891 deletions.
onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/compute_occupancy.h

@@ -24,40 +24,41 @@ using namespace onnxruntime;
 
 namespace ort_fastertransformer {
 
-template <typename GemmKernel, bool enable_cutlass_3x = false> inline int compute_occupancy_for_kernel() {
-    int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
-
-    if (smem_size > (48 << 10)) {
-        cudaFuncAttributes attr;
-        int device = 0;
-        int max_smem_per_block = 0;
-        CUDA_CALL_THROW(cudaGetDevice(&device));
-        CUDA_CALL_THROW(cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
-        if constexpr (enable_cutlass_3x) {
-            CUDA_CALL_THROW(cudaFuncGetAttributes(&attr, cutlass::device_kernel<GemmKernel>));
-        } else {
-            CUDA_CALL_THROW(cudaFuncGetAttributes(&attr, cutlass::Kernel<GemmKernel>));
-        }
-        if (smem_size + attr.sharedSizeBytes >= static_cast<size_t>(max_smem_per_block)) {
-            // This should mean that
-            // cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)
-            // wouldn't work. In that case, we return an occupancy of 0. This will cause the heuristic to ignore this
-            // configuration.
-            return 0;
-        }
-    }
-
-    int max_active_blocks = -1;
-    if constexpr (enable_cutlass_3x) {
-        CUDA_CALL_THROW(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
-            &max_active_blocks, cutlass::device_kernel<GemmKernel>,
-            128 * (GemmKernel::NumLoadWarpGroups + GemmKernel::NumMmaWarpGroups), smem_size));
-    } else {
-        CUDA_CALL_THROW(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, cutlass::Kernel<GemmKernel>,
-                                                                      GemmKernel::kThreadCount, smem_size));
-    }
-
-    return max_active_blocks;
-}
-
-} // namespace ort_fastertransformer
+template <typename GemmKernel, bool enable_cutlass_3x = false>
+inline int compute_occupancy_for_kernel() {
+  int smem_size = int(sizeof(typename GemmKernel::SharedStorage));
+
+  if (smem_size > (48 << 10)) {
+    cudaFuncAttributes attr;
+    int device = 0;
+    int max_smem_per_block = 0;
+    CUDA_CALL_THROW(cudaGetDevice(&device));
+    CUDA_CALL_THROW(cudaDeviceGetAttribute(&max_smem_per_block, cudaDevAttrMaxSharedMemoryPerBlockOptin, device));
+    if constexpr (enable_cutlass_3x) {
+      CUDA_CALL_THROW(cudaFuncGetAttributes(&attr, cutlass::device_kernel<GemmKernel>));
+    } else {
+      CUDA_CALL_THROW(cudaFuncGetAttributes(&attr, cutlass::Kernel<GemmKernel>));
+    }
+    if (smem_size + attr.sharedSizeBytes >= static_cast<size_t>(max_smem_per_block)) {
+      // This should mean that
+      // cudaFuncSetAttribute(cutlass::Kernel<GemmKernel>, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)
+      // wouldn't work. In that case, we return an occupancy of 0. This will cause the heuristic to ignore this
+      // configuration.
+      return 0;
+    }
+  }
+
+  int max_active_blocks = -1;
+  if constexpr (enable_cutlass_3x) {
+    CUDA_CALL_THROW(cudaOccupancyMaxActiveBlocksPerMultiprocessor(
+        &max_active_blocks, cutlass::device_kernel<GemmKernel>,
+        128 * (GemmKernel::NumLoadWarpGroups + GemmKernel::NumMmaWarpGroups), smem_size));
+  } else {
+    CUDA_CALL_THROW(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, cutlass::Kernel<GemmKernel>,
+                                                                  GemmKernel::kThreadCount, smem_size));
+  }
+
+  return max_active_blocks;
+}
+
+}  // namespace ort_fastertransformer

[cpplint] reported by reviewdog, new line 29: Using deprecated casting style. Use static_cast<int>(...) instead [readability/casting] [4]
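A note on the early return 0 and the 48 KB threshold in this function: kernels whose dynamic shared memory demand exceeds the default 48 KB per-block limit must opt in via cudaFuncSetAttribute, and when even the opt-in ceiling (cudaDevAttrMaxSharedMemoryPerBlockOptin, minus the kernel's static attr.sharedSizeBytes) is too small, the configuration is unusable, so the heuristic reports zero occupancy. Below is a minimal standalone sketch of the same pattern; the kernel name my_kernel, the 256-thread block, and the 64 KB figure are illustrative only, not from this commit.

#include <cstdio>
#include <cuda_runtime.h>

// Illustrative kernel; not part of the commit.
__global__ void my_kernel(float* out) {
  extern __shared__ float smem[];  // dynamic shared memory
  smem[threadIdx.x] = static_cast<float>(threadIdx.x);
  __syncthreads();
  out[threadIdx.x] = smem[threadIdx.x];
}

int main() {
  constexpr int kThreadCount = 256;
  int smem_size = 64 << 10;  // 64 KB, above the default 48 KB per-block limit

  // Configurations above 48 KB must opt in first. This is the call the comment
  // in compute_occupancy_for_kernel() says "wouldn't work" when static plus
  // dynamic shared memory exceeds cudaDevAttrMaxSharedMemoryPerBlockOptin.
  if (cudaFuncSetAttribute(my_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                           smem_size) != cudaSuccess) {
    std::printf("opt-in failed; the heuristic would treat occupancy as 0\n");
    return 0;
  }

  // Same occupancy query the helper issues for the cutlass 2.x path.
  int max_active_blocks = -1;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&max_active_blocks, my_kernel,
                                                kThreadCount, smem_size);
  std::printf("max active blocks per SM: %d\n", max_active_blocks);
  return 0;
}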
onnxruntime/contrib_ops/cuda/moe/cutlass_extensions/epilogue/thread/fused_activations.h

@@ -46,60 +46,50 @@
 
 /////////////////////////////////////////////////////////////////////////////////////////////////
 
-namespace cutlass
-{
-namespace epilogue
-{
-namespace thread
-{
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-
-__forceinline__ __device__ float copysignf_pos(float a, float b)
-{
-    float r;
-    r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000));
-    return r;
-}
-
-__forceinline__ __device__ float tanh_opt(float x)
-{
-#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDA_ARCH__ < 750)
-    float const exp_val = -1.f * fabs(2 * x);
-    return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x);
-#else
-    return fast_tanh(x);
-#endif
-}
-
-/////////////////////////////////////////////////////////////////////////////////////////////////
-template <>
-struct GELU_taylor<float>
-{
-    static bool const kIsHeavy = true;
-
-    CUTLASS_DEVICE
-    float operator()(float const& z) const
-    {
-        float k0 = float(0.7978845608028654);
-        float k1 = float(0.044715);
-
-        return float(cutlass::constants::half<float>() * z
-            * (cutlass::constants::one<float>() + tanh_opt(k0 * z * (cutlass::constants::one<float>() + k1 * z * z))));
-    }
-
-    using Params = LinearCombinationGenericParams<float>;
-
-    CUTLASS_DEVICE
-    float operator()(float const& scalar, Params const& params_) const
-    {
-        return this->operator()(scalar);
-    }
-};
-
-} // namespace thread
-} // namespace epilogue
-} // namespace cutlass
+namespace cutlass {
+namespace epilogue {
+namespace thread {
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+
+__forceinline__ __device__ float copysignf_pos(float a, float b) {
+  float r;
+  r = __int_as_float(__float_as_int(a) | (__float_as_int(b) & 0x80000000));
+  return r;
+}
+
+__forceinline__ __device__ float tanh_opt(float x) {
+#if (__CUDACC_VER_MAJOR__ < 11) || (__CUDA_ARCH__ < 750)
+  float const exp_val = -1.f * fabs(2 * x);
+  return copysignf_pos((1.0f - __expf(exp_val)) / (__expf(exp_val) + 1.0f), x);
+#else
+  return fast_tanh(x);
+#endif
+}
+
+/////////////////////////////////////////////////////////////////////////////////////////////////
+template <>
+struct GELU_taylor<float> {
+  static bool const kIsHeavy = true;
+
+  CUTLASS_DEVICE
+  float operator()(float const& z) const {
+    float k0 = float(0.7978845608028654);
+    float k1 = float(0.044715);
+    return float(cutlass::constants::half<float>() * z * (cutlass::constants::one<float>() + tanh_opt(k0 * z * (cutlass::constants::one<float>() + k1 * z * z))));
+  }
+
+  using Params = LinearCombinationGenericParams<float>;
+
+  CUTLASS_DEVICE
+  float operator()(float const& scalar, Params const& params_) const {
+    return this->operator()(scalar);
+  }
+};
+
+}  // namespace thread
+}  // namespace epilogue
+}  // namespace cutlass
 
 /////////////////////////////////////////////////////////////////////////////////////////////////

[cpplint] reported by reviewdog on the new file:
  line 77: Using deprecated casting style. Use static_cast<float>(...) instead [readability/casting] [4]
  line 78: Using deprecated casting style. Use static_cast<float>(...) instead [readability/casting] [4]
  line 80: Lines should be <= 120 characters long [whitespace/line_length] [2]
  line 80: Using deprecated casting style. Use static_cast<float>(...) instead [readability/casting] [4]
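For reference, the two constants flagged by cpplint above encode the standard tanh approximation of GELU, and the pre-CUDA-11 / pre-sm_75 branch of tanh_opt evaluates tanh through __expf using the exponential identity below (copysignf_pos, which assumes a nonnegative first argument, ORs the sign bit of x back into the result). In LaTeX:

\mathrm{GELU}(z) \approx \frac{z}{2}\left(1 + \tanh\!\bigl(k_0\, z\,(1 + k_1 z^2)\bigr)\right),
\qquad k_0 = \sqrt{2/\pi} \approx 0.7978845608028654, \quad k_1 = 0.044715

\tanh(x) = \operatorname{sign}(x)\,\frac{1 - e^{-2\lvert x\rvert}}{1 + e^{-2\lvert x\rvert}}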
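As a quick numerical sanity check of those constants, here is a small host-only sketch (illustrative, not part of the commit; the helper names gelu_exact and gelu_tanh are hypothetical) comparing the tanh form used by GELU_taylor<float> against the exact erf-based GELU. Over [-6, 6] the two agree to within roughly 1e-3.

#include <cmath>
#include <cstdio>

// Exact GELU: 0.5 * z * (1 + erf(z / sqrt(2))).
static float gelu_exact(float z) {
  return 0.5f * z * (1.0f + std::erf(z / std::sqrt(2.0f)));
}

// Tanh approximation with the same constants as GELU_taylor<float>.
static float gelu_tanh(float z) {
  const float k0 = 0.7978845608028654f;  // sqrt(2 / pi)
  const float k1 = 0.044715f;
  return 0.5f * z * (1.0f + std::tanh(k0 * z * (1.0f + k1 * z * z)));
}

int main() {
  float max_err = 0.0f;
  for (float z = -6.0f; z <= 6.0f; z += 0.01f) {
    max_err = std::fmax(max_err, std::fabs(gelu_tanh(z) - gelu_exact(z)));
  }
  std::printf("max |gelu_tanh - gelu_exact| = %g\n", max_err);
  return 0;
}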