Skip to content

Commit

Permalink
Updates for 3.2 release (#1065)
Browse files Browse the repository at this point in the history
  • Loading branch information
ANIKET-SHIVAM authored and ttl10101 committed Feb 7, 2024
1 parent b0be135 commit 4022448
Show file tree
Hide file tree
Showing 20 changed files with 904 additions and 257 deletions.
6 changes: 5 additions & 1 deletion PUBLICATIONS.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,10 +2,14 @@

## 2023

- ["Graphene: An IR for Optimized Tensor Computations on GPUs"](https://dl.acm.org/doi/pdf/10.1145/3582016.3582018). Hagedorn, Bastian, Bin Fan, Hanfeng Chen, Cris Cecka, Michael Garland, and Vinod Grover. _Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems_, March 2023.
- ["FlashAttention-2: Faster Attention with Better Parallelism and Work Partitioning"](https://arxiv.org/abs/2307.08691). Tri Dao. _Technical Report_, July 2023.

- ["ByteTransformer: A High-Performance Transformer Boosted for Variable-Length Inputs"](https://arxiv.org/abs/2210.03052). Yujia Zhai, Chengquan Jiang, Leyuan Wang, Xiaoying Jia, Shang Zhang, Zizhong Chen, Xin Liu, Yibo Zhu. _Proceedings of the 37th IEEE International Parallel & Distributed Processing Symposium (Best Paper)_, May 2023.

- ["A Framework for Fine-Grained Synchronization of Dependent GPU Kernels"](https://arxiv.org/abs/2305.13450). Abhinav Jangda, Saeed Maleki, Maryam Mehri Dehnavi, Madan Musuvathi, Olli Saarikivi. _Computing Research Repository_, May 2023.

- ["Graphene: An IR for Optimized Tensor Computations on GPUs"](https://dl.acm.org/doi/pdf/10.1145/3582016.3582018). Hagedorn, Bastian, Bin Fan, Hanfeng Chen, Cris Cecka, Michael Garland, Vinod Grover. _Proceedings of the 28th ACM International Conference on Architectural Support for Programming Languages and Operating Systems_, March 2023.

- ["Stream-K: Work-centric Parallel Decomposition for Dense Matrix-Matrix Multiplication on the GPU"](https://arxiv.org/abs/2301.03598). Muhammad Osama, Duane Merrill, Cris Cecka, Michael Garland, John D. Owens. _arXiv_, January 2023.

## 2022
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ struct Options {
/// Prints the usage statement.
std::ostream & print_usage(std::ostream &out) const {

out << "52_fp8_hopper_warp_specialized_gemm\n\n"
out << "54_fp8_hopper_warp_specialized_gemm\n\n"
<< " Hopper FP8 GEMM using a Warp Specialized kernel.\n\n"
<< "Options:\n\n"
<< " --help If specified, displays this usage statement\n\n"
Expand All @@ -93,7 +93,7 @@ struct Options {

out
<< "\n\nExamples:\n\n"
<< "$ " << "52_fp8_hopper_warp_specialized_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";
<< "$ " << "54_fp8_hopper_warp_specialized_gemm" << " --m=1024 --n=512 --k=1024 --alpha=2 --beta=0.707 \n\n";

return out;
}
Expand Down
61 changes: 61 additions & 0 deletions include/cutlass/barrier.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,13 @@ struct SyncthreadsSync {
}
};

/// Synchronization helper exposing the same `sync()` interface as SyncthreadsSync,
/// but scoped to the calling warp rather than the whole thread block.
struct SyncwarpSync {
  /// Blocks until all (non-exited) threads of the calling warp have arrived.
  CUTLASS_DEVICE
  static void sync() { __syncwarp(); }
};

template <
int ThreadCount,
int BarrierId
Expand Down Expand Up @@ -311,6 +318,60 @@ struct NamedBarrierManager {
}
};

/////////////////////////////////////////////////////////////////////////////////////////////////

/** Structure for synchronizing via contiguous barriers (e.g., __syncwarp, __syncthreads)
 * via an API that mirrors that of NamedBarrierManager
 *
 * Each method forwards directly to cutlass::GenericBarrier<Synchronizer>. The leading
 * `uint32_t` parameter exists only to match NamedBarrierManager's signatures (where it
 * selects a named barrier) and is deliberately ignored here, hence it is left unnamed.
 *
 * @param Synchronizer Synchronization helper exposing a `sync()` method to perform synchronization
 * @param ThreadCount_ Number of threads participating in the barrier
 **/
template <
  class Synchronizer,
  uint32_t ThreadCount_
>
struct SyncManager {

  // Number of threads participating in the barrier
  static constexpr uint32_t ThreadCount = ThreadCount_;

  using BarrierSync = cutlass::GenericBarrier<Synchronizer>;

  // Underlying type used by all barriers for synchronization.
  using T = typename BarrierSync::T;

  /// Spin-waits until the flag at `flag_idx` reaches at least `count`
  /// (forwards to BarrierSync::wait_lt_helper).
  CUTLASS_DEVICE
  static
  void wait_lt(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int count) {
    BarrierSync::wait_lt_helper(lock_ptr, thread_idx, flag_idx, count);
  }

  /// Spin-waits until the flag at `flag_idx` equals `val`
  /// (forwards to BarrierSync::wait_eq).
  CUTLASS_DEVICE
  static void
  wait_eq(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
    BarrierSync::wait_eq(lock_ptr, thread_idx, flag_idx, val);
  }

  /// Spin-waits until the flag at `flag_idx` equals `val`, then resets it
  /// (forwards to BarrierSync::wait_eq_reset).
  CUTLASS_DEVICE
  static void
  wait_eq_reset(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, T val = 1) {
    BarrierSync::wait_eq_reset(lock_ptr, thread_idx, flag_idx, val);
  }

  /// Arrives at the barrier and increments the flag at `flag_idx` by `val`
  /// (forwards to BarrierSync::arrive_inc).
  CUTLASS_DEVICE
  static void
  arrive_inc(uint32_t, void *lock_ptr, int thread_idx, int flag_idx, int val = 1) {
    BarrierSync::arrive_inc(lock_ptr, thread_idx, flag_idx, val);
  }

  /// Arrives at the barrier and increments `count` consecutive flags starting at
  /// `first_flag_idx` by `val` (forwards to BarrierSync::arrive_range_inc).
  /// NOTE: the first parameter was previously named `idx` but is unused; it is now
  /// unnamed for consistency with the other methods and to avoid unused-parameter warnings.
  CUTLASS_DEVICE
  static void
  arrive_range_inc(uint32_t, void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1, int val = 1) {
    BarrierSync::arrive_range_inc(lock_ptr, thread_idx, first_flag_idx, count, val);
  }
};

/////////////////////////////////////////////////////////////////////////////////////////////////

} // namespace cutlass

/////////////////////////////////////////////////////////////////////////////////////////////////
11 changes: 8 additions & 3 deletions include/cutlass/epilogue/collective/builders/sm90_builder.inl
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ sm90_get_tma_dispatch_policy() {

constexpr int EpiTiles = size(shape_div(take<0,2>(TileShapeMNK{}), EpilogueTileMN{}));
constexpr int FragmentSize = size(EpilogueTileMN{}) / (detail::sm90_is_cooperative_v<Schedule> ? 256 : 128);
constexpr int ReuseSmemC = sizeof_bits_v<ElementC> == sizeof_bits_v<ElementD>;
constexpr int ReuseSmemC = (sizeof_bits_v<ElementC> == sizeof_bits_v<ElementD>) && (sizeof_bits_v<ElementD> > 8);
constexpr int StagesD = 2;
constexpr int StagesC = ReuseSmemC ? cute::max(EpiTiles, StagesD + 1) : EpiTiles;

Expand Down Expand Up @@ -98,7 +98,7 @@ sm90_get_epilogue_smem_swizzle_layout_atom() {
}

// Attempts to compute a reasonable epilogue tile based on block tile shape or allows the user to provide one.
template <class Element, class EpilogueTileType, class Schedule>
template <class ElementD, class EpilogueTileType, class Schedule>
constexpr auto
sm90_compute_tile_shape_or_override() {
if constexpr (cute::is_same_v<EpilogueTileType, EpilogueTileAuto>) {
Expand All @@ -107,7 +107,12 @@ sm90_compute_tile_shape_or_override() {
return Shape<_128,_32>{};
}
else if constexpr (detail::sm90_is_warp_specialized_v<Schedule>) {
return Shape<_64,_32>{};
if constexpr (sizeof_bits_v<ElementD> == 8) {
return Shape<_64,_64>{};
}
else {
return Shape<_64,_32>{};
}
}
else {
static_assert(cutlass::detail::dependent_false<Schedule>, "Unsupported schedule.");
Expand Down
13 changes: 5 additions & 8 deletions include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@
#include "cutlass/kernel_hardware_info.hpp"
#include "cute/layout.hpp"
#include "cute/tensor.hpp"
#include "cute/arch/cluster_sm90.hpp"

namespace cutlass::gemm::kernel::detail {

Expand Down Expand Up @@ -205,18 +206,14 @@ class PersistentTileSchedulerSm90 {

uint64_t cluster_id, cluster_major_offset = 0, cluster_minor_offset = 0;
divmod_cluster_shape_major(cluster_id, cluster_major_offset, blk_per_grid_dim);
// MSVC requires protecting use of CUDA-specific nonstandard syntax,
// like blockIdx and gridDim, with __CUDA_ARCH__.
#if defined(__CUDA_ARCH__)

auto [cta_m_in_cluster, cta_n_in_cluster, _] = cute::block_id_in_cluster();
if (raster_order == RasterOrder::AlongN) {
cluster_minor_offset = blockIdx.x;
cluster_minor_offset = cta_m_in_cluster;
}
else {
cluster_minor_offset = blockIdx.y;
cluster_minor_offset = cta_n_in_cluster;
}
#else
CUTLASS_ASSERT(false && "This line should never be reached");
#endif

uint64_t cluster_idx_minor, cluster_idx_major;

Expand Down
Loading

0 comments on commit 4022448

Please sign in to comment.