Skip to content

Commit

Permalink
Updates for 3.2 release (#1065)
Browse files Browse the repository at this point in the history
  • Loading branch information
ANIKET-SHIVAM authored and ttl10101 committed Feb 7, 2024
1 parent b0be135 commit 4022448
Show file tree
Hide file tree
Showing 20 changed files with 904 additions and 257 deletions.
6 changes: 5 additions & 1 deletion PUBLICATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

## 2023

- ["Graphene: An IR for Optimized Tensor Computations on GPUs"](https://dl.acm.org/doi/pdf/10.1145/3582016.3582018). Hagedorn, Bastian, Bin Fan, Hanfeng Chen, Cris Cecka, Michael Garland, and Vinod Grover. _Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems_, March 2023.
- ["FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning"](https://arxiv.org/abs/2307.08691). Tri Dao. _Technical Report_, July 2023.

- ["ByteTransformer: A High-Performance Transformer Boosted for Variable-Length Inputs"](https://arxiv.org/abs/2210.03052). Yujia Zhai, Chengquan Jiang, Leyuan Wang, Xiaoying Jia, Shang Zhang, Zizhong Chen, Xin Liu, Yibo Zhu. _Proceedings of the 37th IEEE International Parallel & Distributed Processing Symposium (Best Paper)_, May 2023.

- ["A Framework for Fine-Grained Synchronization of Dependent GPU Kernels"](https://arxiv.org/abs/2305.13450). Abhinav Jangda, Saeed Maleki, Maryam Mehri Dehnavi, Madan Musuvathi, Olli Saarikivi. _Computing Research Repository_, May 2023.

- ["Graphene: An IR for Optimized Tensor Computations on GPUs"](https://dl.acm.org/doi/pdf/10.1145/3582016.3582018). Hagedorn, Bastian, Bin Fan, Hanfeng Chen, Cris Cecka, Michael Garland, Vinod Grover. _Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems_, March 2023.

- ["Stream-K: Work-centric Parallel Decomposition for Dense Matrix-Matrix Multiplication on the GPU"](https://arxiv.org/abs/2301.03598). Muhammad Osama, Duane Merrill, Cris Cecka, Michael Garland, John D. Owens. _arXiv_, January 2023.

## 2022
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ struct Options {
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {

out << "52_fp8_hopper_warp_specialized_gemm\n\n"
out << "54_fp8_hopper_warp_specialized_gemm\n\n"
<< " Hopper FP8 GEMM using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
Expand All @@ -93,7 +93,7 @@ struct Options {

out
<< "\n\nExamples:\n\n"
<< "$ " << "52_fp8_hopper_warp_specialized_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
<< "$ " << "54_fp8_hopper_warp_specialized_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";

return out;
}
Expand Down
61 changes: 61 additions & 0 deletions include/cutlass/barrier.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ struct SyncthreadsSync {
}
};

/// Synchronization helper exposing the same `sync()` interface as SyncthreadsSync,
/// but scoped to the calling warp rather than the whole thread block.
struct SyncwarpSync {
  /// Blocks until all (non-exited) threads of the calling warp have arrived.
  CUTLASS_DEVICE
  static void sync() { __syncwarp(); }
};

template <
int ThreadCount,
int BarrierId
Expand Down Expand Up @@ -311,6 +318,60 @@ struct NamedBarrierManager {
}
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/** Structure for synchronizing via contiguous barriers (e.g., __syncwarp, __syncthreads)
 * via an API that mirrors that of NamedBarrierManager
 *
 * Each method forwards directly to cutlass::GenericBarrier<Synchronizer>. The leading
 * `uint32_t` parameter exists only to match NamedBarrierManager's signatures (where it
 * selects a named barrier) and is deliberately ignored here, hence it is left unnamed.
 *
 * @param Synchronizer Synchronization helper exposing a `sync()` method to perform synchronization
 * @param ThreadCount_ Number of threads participating in the barrier
 **/
template <
  class Synchronizer,
  uint32_t ThreadCount_
>
struct SyncManager {

  // Number of threads participating in the barrier
  static constexpr uint32_t ThreadCount = ThreadCount_;

  using BarrierSync = cutlass::GenericBarrier<Synchronizer>;

  // Underlying type used by all barriers for synchronization.
  using T = typename BarrierSync::T;

  /// Spin-waits until the flag at `flag_idx` reaches at least `count`
  /// (forwards to BarrierSync::wait_lt_helper).
  CUTLASS_DEVICE
  static
  void wait_lt(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int count) {
    BarrierSync::wait_lt_helper(lock_ptr, thread_idx, flag_idx, count);
  }

  /// Spin-waits until the flag at `flag_idx` equals `val`
  /// (forwards to BarrierSync::wait_eq).
  CUTLASS_DEVICE
  static void
  wait_eq(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
    BarrierSync::wait_eq(lock_ptr, thread_idx, flag_idx, val);
  }

  /// Spin-waits until the flag at `flag_idx` equals `val`, then resets it
  /// (forwards to BarrierSync::wait_eq_reset).
  CUTLASS_DEVICE
  static void
  wait_eq_reset(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
    BarrierSync::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val);
  }

  /// Arrives at the barrier and increments the flag at `flag_idx` by `val`
  /// (forwards to BarrierSync::arrive_inc).
  CUTLASS_DEVICE
  static void
  arrive_inc(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) {
    BarrierSync::arrive_inc(lock_ptr, thread_idx, flag_idx, val);
  }

  /// Arrives at the barrier and increments `count` consecutive flags starting at
  /// `first_flag_idx` by `val` (forwards to BarrierSync::arrive_range_inc).
  /// NOTE: the first parameter was previously named `idx` but is unused; it is now
  /// unnamed for consistency with the other methods and to avoid unused-parameter warnings.
  CUTLASS_DEVICE
  static void
  arrive_range_inc(uint32_t, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) {
    BarrierSync::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val);
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
11 changes: 8 additions & 3 deletions include/cutlass/epilogue/collective/builders/sm90_builder.inl
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ sm90_get_tma_dispatch_policy() {

constexpr int EpiTiles = size(shape_div(take<0,2>(TileShapeMNK{}), EpilogueTileMN{}));
constexpr int FragmentSize = size(EpilogueTileMN{}) / (detail::sm90_is_cooperative_v<Schedule> ? 256 : 128);
constexpr int ReuseSmemC = sizeof_bits_v<ElementC> == sizeof_bits_v<ElementD>;
constexpr int ReuseSmemC = (sizeof_bits_v<ElementC> == sizeof_bits_v<ElementD>) && (sizeof_bits_v<ElementD> > 8);
constexpr int StagesD = 2;
constexpr int StagesC = ReuseSmemC ? cute::max(EpiTiles, StagesD + 1) : EpiTiles;

Expand Down Expand Up @@ -98,7 +98,7 @@ sm90_get_epilogue_smem_swizzle_layout_atom() {
}

// Attempts to compute a reasonable epilogue tile based on block tile shape or allows the user to provide one.
template <class Element, class EpilogueTileType, class Schedule>
template <class ElementD, class EpilogueTileType, class Schedule>
constexpr auto
sm90_compute_tile_shape_or_override() {
if constexpr (cute::is_same_v<EpilogueTileType, EpilogueTileAuto>) {
Expand All @@ -107,7 +107,12 @@ sm90_compute_tile_shape_or_override() {
return Shape<_128,_32>{};
}
else if constexpr (detail::sm90_is_warp_specialized_v<Schedule>) {
return Shape<_64,_32>{};
if constexpr (sizeof_bits_v<ElementD> == 8) {
return Shape<_64,_64>{};
}
else {
return Shape<_64,_32>{};
}
}
else {
static_assert(cutlass::detail::dependent_false<Schedule>, "Unsupported schedule.");
Expand Down
13 changes: 5 additions & 8 deletions include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "cutlass/kernel_hardware_info.hpp"
#include "cute/layout.hpp"
#include "cute/tensor.hpp"
#include "cute/arch/cluster_sm90.hpp"

namespace cutlass::gemm::kernel::detail {

Expand Down Expand Up @@ -205,18 +206,14 @@ class PersistentTileSchedulerSm90 {

uint64_t cluster_id, cluster_major_offset = 0, cluster_minor_offset = 0;
divmod_cluster_shape_major(cluster_id, cluster_major_offset, blk_per_grid_dim);
// MSVC requires protecting use of CUDA-specific nonstandard syntax,
// like blockIdx and gridDim, with __CUDA_ARCH__.
#if defined(__CUDA_ARCH__)

auto [cta_m_in_cluster, cta_n_in_cluster, _] = cute::block_id_in_cluster();
if (raster_order == RasterOrder::AlongN) {
cluster_minor_offset = blockIdx.x;
cluster_minor_offset = cta_m_in_cluster;
}
else {
cluster_minor_offset = blockIdx.y;
cluster_minor_offset = cta_n_in_cluster;
}
#else
CUTLASS_ASSERT(false && "This line should never be reached");
#endif

uint64_t cluster_idx_minor, cluster_idx_major;

Expand Down
Loading

0 comments on commit 4022448

Please sign in to comment.