From 8002dc3f9225f1a5e5c3c72c613635e585515656 Mon Sep 17 00:00:00 2001 From: ANIKET SHIVAM <3268307+ANIKET-SHIVAM@users.noreply.github.com> Date: Sat, 29 Apr 2023 06:34:27 -0700 Subject: [PATCH] Updates for 3.1 (#932) --- CMakeLists.txt | 56 +- .../08_turing_tensorop_gemm/CMakeLists.txt | 1 + examples/12_gemm_bias_relu/CMakeLists.txt | 1 + examples/31_basic_syrk/basic_syrk.cu | 2 +- examples/32_basic_trmm/basic_trmm.cu | 2 +- .../ampere_gemm_universal_streamk.cu | 6 +- .../49_collective_builder.cu | 10 +- include/cute/arch/cluster_sm90.hpp | 13 + include/cutlass/arch/memory.h | 144 ++++- include/cutlass/barrier.h | 12 - include/cutlass/core_io.h | 2 - .../collective/builders/sm90_builder.inl | 48 +- .../sm90_epilogue_tma_warpspecialized.hpp | 22 +- .../linear_combination_tensor_broadcast.hpp | 1 + .../cutlass/epilogue/threadblock/epilogue.h | 2 +- .../sm90_mma_tma_gmma_rs_warpspecialized.hpp | 20 +- .../sm90_mma_tma_gmma_ss_warpspecialized.hpp | 16 - include/cutlass/gemm/device/base_grouped.h | 17 +- .../gemm/device/default_gemm_configuration.h | 6 +- .../device/gemm_universal_with_broadcast.h | 2 +- .../gemm/device/gemv_strided_batched.h | 167 ++++++ include/cutlass/gemm/dispatch_policy.hpp | 2 +- .../gemm/kernel/gemv_strided_batched.h | 368 +++++++++++++ include/cutlass/gemm/kernel/sm90_gemm_tma.hpp | 14 +- .../kernel/sm90_gemm_tma_warpspecialized.hpp | 14 +- ...0_gemm_tma_warpspecialized_cooperative.hpp | 14 +- ...sm90_gemm_tma_warpspecialized_pingpong.hpp | 14 +- .../gemm/kernel/sm90_tile_scheduler.hpp | 6 + .../threadblock/threadblock_swizzle_streamk.h | 9 +- include/cutlass/pipeline/sm90_pipeline.hpp | 9 +- include/cutlass/semaphore.h | 4 - media/docs/cute/01_layout.md | 44 +- media/docs/cute/02_layout_operations.md | 148 +++++- python/cutlass/backend/compiler.py | 59 +-- python/cutlass/cpp/include/tensor_ref_view.h | 6 +- python/setup.py | 68 ++- test/unit/gemm/device/CMakeLists.txt | 11 + test/unit/gemm/device/gemm_testbed_3x.hpp | 3 +- test/unit/gemm/device/gemv_strided_batched.cu | 490 ++++++++++++++++++ tools/library/scripts/gemm_operation.py | 20 +- tools/library/scripts/generator.py | 106 ++-- tools/library/scripts/library.py | 7 +- tools/library/scripts/manifest.py | 20 +- tools/library/src/library_internal.h | 4 + tools/profiler/CMakeLists.txt | 20 +- tools/profiler/src/cutlass_profiler.cu | 4 +- tools/profiler/src/device_allocation.cu | 65 ++- tools/profiler/src/gemm_operation_profiler.cu | 10 +- tools/profiler/src/operation_profiler.cu | 30 +- tools/profiler/src/options.cu | 2 + tools/profiler/src/options.h | 3 + 51 files changed, 1796 insertions(+), 328 deletions(-) create mode 100644 include/cutlass/gemm/device/gemv_strided_batched.h create mode 100644 include/cutlass/gemm/kernel/gemv_strided_batched.h create mode 100644 test/unit/gemm/device/gemv_strided_batched.cu diff --git a/CMakeLists.txt b/CMakeLists.txt index 1136d095..16e0f9ae 100755 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -59,6 +59,8 @@ endif() find_package(Doxygen QUIET) +################################################################################ + # # CUTLASS 3.x requires C++17 # @@ -80,6 +82,10 @@ endif() message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}") +set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.") +# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive + +################################################################################ set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library") if(CUTLASS_ENABLE_HEADERS_ONLY) @@ -112,6 
+118,8 @@ if (CUTLASS_ENABLE_TESTS) include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake) endif() +################################################################################ + set(CUTLASS_NVCC_ARCHS_SUPPORTED "") if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4 AND NOT CUDA_COMPILER MATCHES "[Cc]lang") list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70 72 75 80 86 87) @@ -197,15 +205,16 @@ set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.") set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.") set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.") +################################################################################ # # CUTLASS generator cmake configuration # + set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. Default '' means all operations are enabled.") -set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.") +set(CUTLASS_LIBRARY_KERNELS ${CUTLASS_LIBRARY_KERNELS_INIT} CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.") set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.") -# Test Levels L0, L1, L2 -set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.") +################################################################################ set(CUTLASS_TEST_ENABLE_CACHED_RESULTS ON CACHE BOOL "Enable caching and reuse of test results in unit tests") @@ -225,6 +234,8 @@ if (CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED) list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1) endif() +################################################################################ + # # CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations. # @@ -650,14 +661,16 @@ function(cutlass_add_executable_tests NAME TARGET) # DEPENDS: A list of targets or files on which this test is dependent. # DEPENDEES: A list of targets which should depend on this test. # TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments -# to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of +# to pass to the test executable. A unique test is generated for each set of # options given. If this option is not used, a single test with no arguments is generated. +# TEST_COMMAND_OPTIONS_PREFIX: If provided, is added as a prefix to each TEST_COMMAND_OPTIONS value for +# generating the full variable name to be referenced. # RESULT_CACHE_FILE: A file to be installed alongside the test executable with pre-computed # test results to speed up test runtime. 
# set(options DISABLE_EXECUTABLE_INSTALL_RULE) - set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE) + set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE TEST_COMMAND_OPTIONS_PREFIX) set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS) cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) @@ -701,7 +714,6 @@ function(cutlass_add_executable_tests NAME TARGET) endif() list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT) - set(CMD_IDX 0) if (CMD_COUNT GREATER 1) add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS}) @@ -710,12 +722,12 @@ function(cutlass_add_executable_tests NAME TARGET) endforeach() endif() - foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS}) + foreach(CMD_OPTIONS_VAR IN LISTS __TEST_COMMAND_OPTIONS) if (CMD_COUNT GREATER 1) - set(TEST_NAME ${NAME}_${CMD_IDX}) + string(TOLOWER "${NAME}_${CMD_OPTIONS_VAR}" TEST_NAME) else() - set(TEST_NAME ${NAME}) + string(TOLOWER "${NAME}" TEST_NAME) endif() # The following rigmarole is needed to deal with spaces and possible quotes in @@ -724,14 +736,14 @@ function(cutlass_add_executable_tests NAME TARGET) # preserves any quotes. Note, they have to be in this order for it to work for # all the use cases below. - set(CMD_OPTIONS ${${CMD_OPTIONS}}) - list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS) - separate_arguments(CMD_OPTIONS) - + set(TEST_COMMAND_OPTIONS ${${__TEST_COMMAND_OPTIONS_PREFIX}${CMD_OPTIONS_VAR}}) + list(JOIN TEST_COMMAND_OPTIONS " " TEST_COMMAND_OPTIONS) + separate_arguments(TEST_COMMAND_OPTIONS) + add_custom_target( ${TEST_NAME} COMMAND - ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} + ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${TEST_COMMAND_OPTIONS} DEPENDS ${TARGET} ) @@ -746,7 +758,7 @@ function(cutlass_add_executable_tests NAME TARGET) add_test( NAME c${TEST_NAME} - COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${CMD_OPTIONS} + COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $ ${TEST_COMMAND_OPTIONS} ) set_tests_properties(c${TEST_NAME} PROPERTIES DISABLED ${__DISABLE_TESTS}) @@ -756,18 +768,21 @@ function(cutlass_add_executable_tests NAME TARGET) # To run the tests from an install package with tests enabled, we need to generate test files # that don't rely on the current directory structure in build. 
+ set(TEST_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/${NAME}) + file(MAKE_DIRECTORY ${TEST_GEN_DIR}) + set(TEST_NAME c${TEST_NAME}) set(TEST_EXE $) set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR}) - configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY) + configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY) file(GENERATE - OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" - INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" + OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake" + INPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" ) install( - FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake" + FILES "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake" DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/ ) @@ -775,8 +790,6 @@ function(cutlass_add_executable_tests NAME TARGET) endif() - math(EXPR CMD_IDX "${CMD_IDX} + 1") - endforeach() endfunction() @@ -787,6 +800,7 @@ if (CUTLASS_ENABLE_TOOLS) add_dependencies(test_all test_profiler) endif() endif() + if (CUTLASS_ENABLE_EXAMPLES) add_subdirectory(examples) add_dependencies(test_all test_examples) diff --git a/examples/08_turing_tensorop_gemm/CMakeLists.txt b/examples/08_turing_tensorop_gemm/CMakeLists.txt index a240bcc9..e9d659e1 100644 --- a/examples/08_turing_tensorop_gemm/CMakeLists.txt +++ b/examples/08_turing_tensorop_gemm/CMakeLists.txt @@ -31,5 +31,6 @@ cutlass_example_add_executable( 08_turing_tensorop_gemm turing_tensorop_gemm.cu + DISABLE_TESTS ON ) diff --git a/examples/12_gemm_bias_relu/CMakeLists.txt b/examples/12_gemm_bias_relu/CMakeLists.txt index 5d4dac6c..abe61be1 100644 --- a/examples/12_gemm_bias_relu/CMakeLists.txt +++ b/examples/12_gemm_bias_relu/CMakeLists.txt @@ -31,5 +31,6 @@ cutlass_example_add_executable( 12_gemm_bias_relu gemm_bias_relu.cu + DISABLE_TESTS ON ) diff --git a/examples/31_basic_syrk/basic_syrk.cu b/examples/31_basic_syrk/basic_syrk.cu index 82f4a6a2..65ac0480 100644 --- a/examples/31_basic_syrk/basic_syrk.cu +++ b/examples/31_basic_syrk/basic_syrk.cu @@ -34,7 +34,7 @@ matrix multiply kernel to verify its correctness. The CUTLASS Syrk template is instantiated in the function CutlassSsyrkNN. This is kernel computes - the symmetric rank-k update (SYRK) using double-precision doubleing-point arithmetic and assumes + the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes all matrices have column-major layout. The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices. diff --git a/examples/32_basic_trmm/basic_trmm.cu b/examples/32_basic_trmm/basic_trmm.cu index 74f5cb9f..084b6563 100644 --- a/examples/32_basic_trmm/basic_trmm.cu +++ b/examples/32_basic_trmm/basic_trmm.cu @@ -34,7 +34,7 @@ matrix multiply kernel to verify its correctness. The CUTLASS Trmm template is instantiated in the function CutlassStrmmNN. This is kernel computes - the triangular matrix product (TRMM) using double-precision doubleing-point arithmetic and assumes + the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes all matrices have column-major layout. The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices. 
diff --git a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu index 12739a05..bb995f59 100644 --- a/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu +++ b/examples/47_ampere_gemm_universal_streamk/ampere_gemm_universal_streamk.cu @@ -495,7 +495,7 @@ int main(int argc, const char **argv) options.tensor_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from CUTLASS kernel options.tensor_ref_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from reference kernel - // Fill matrix A on host with uniform-random data [4, -4] + // Fill matrix A on host with uniform-random data [2, -2] cutlass::reference::host::TensorFillRandomUniform( options.tensor_a.host_view(), 1, @@ -503,7 +503,7 @@ int main(int argc, const char **argv) ElementA(-2), 0); - // Fill matrix B on host with uniform-random data [4, -4] + // Fill matrix B on host with uniform-random data [2, -2] cutlass::reference::host::TensorFillRandomUniform( options.tensor_b.host_view(), 1, @@ -511,7 +511,7 @@ int main(int argc, const char **argv) ElementB(-2), 0); - // Fill matrix C on host with uniform-random data [4, -4] + // Fill matrix C on host with uniform-random data [2, -2] cutlass::reference::host::TensorFillRandomUniform( options.tensor_c.host_view(), 1, diff --git a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu index 001e8329..6bbdfb6a 100644 --- a/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu +++ b/examples/49_hopper_gemm_with_collective_builder/49_collective_builder.cu @@ -84,9 +84,10 @@ therefore letting the builder pick the collective specialization. CUTLASS builders make an attempt to pick the best schedule when `Auto` is provided such that the - assembled collctives have the best performance, but this is not a guarantee. A user relying on `Auto` + assembled collectives have the best performance, but this is not a guarantee. A user relying on `Auto` may get a free performance upgrade with newer CUTLASS releases in case we can provide more optimized - implementations that the builder can transparently assemble for `Auto`. + implementations that the builder can transparently assemble for `Auto`. But a user should not rely on + `Auto` if they require a specific scheduling policy and/or stage count to be used. If a user decides to let the builders pick the collective specialization via `Auto` schedules, they must be used for both mainloop and epilogue alike to ensure compatibility between the @@ -99,11 +100,6 @@ in this manner remains the primary API for using CUTLASS 3 kernels. `CollectiveBuilder`s are simply meant to be a convenience interface. - Note also that, while the selections made by CollectiveBuilder attempt to maximize performance, this is not - a guarantee. Furthermore, the behavior of the CollectiveBuilder when `Auto` parameters are provided is subject - to change in future CUTLASS releases -- do not rely on `Auto` if you require a specific scheduling policy and/or - stage count to be used. - Details of this example ----------------------- This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified. 
diff --git a/include/cute/arch/cluster_sm90.hpp b/include/cute/arch/cluster_sm90.hpp index e9c858d5..7e909712 100644 --- a/include/cute/arch/cluster_sm90.hpp +++ b/include/cute/arch/cluster_sm90.hpp @@ -227,4 +227,17 @@ elect_one_leader_sync() #endif } +// Store value to remote shared memory in the cluster +CUTE_DEVICE +void +store_shared_remote(uint32_t value, uint32_t smem_addr, uint32_t mbarrier_addr, uint32_t dst_cta_rank) +{ +#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED) + uint32_t dsmem_addr = set_block_rank(smem_addr, dst_cta_rank); + uint32_t remote_barrier_addr = set_block_rank(mbarrier_addr, dst_cta_rank); + asm volatile("st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [%0], %1, [%2];" + : : "r"(dsmem_addr), "r"(value), "r"(remote_barrier_addr)); +#endif +} + } // end namespace cute diff --git a/include/cutlass/arch/memory.h b/include/cutlass/arch/memory.h index db6c4073..b44407e5 100644 --- a/include/cutlass/arch/memory.h +++ b/include/cutlass/arch/memory.h @@ -35,6 +35,7 @@ #pragma once #include "cutlass/cutlass.h" +#include "cutlass/arch/cache_operation.h" namespace cutlass { namespace arch { @@ -45,7 +46,9 @@ template < /// Fragment type to store loaded data typename AccessType, /// The bytes of loading - int LoadBytes + int LoadBytes, + /// Cache operation + CacheOperation::Kind cache_op = CacheOperation::Always > struct global_load; @@ -59,7 +62,7 @@ struct global_load; #if (((__CUDACC_VER_MAJOR__ == 11) && (__CUDACC_VER_MINOR__ >= 4)) || \ (__CUDACC_VER_MAJOR__ > 11)) && \ - defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) + defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 750) #define CUTLASS_ENABLE_L2_PREFETCH 1 #else #define CUTLASS_ENABLE_L2_PREFETCH 0 @@ -71,7 +74,8 @@ struct global_load; // keep the initializing code before ld.global template struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -107,7 +111,40 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 *data = reinterpret_cast(&D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %9, 0;\n" + " mov.b32 %0, %10;\n" + " mov.b32 %1, %11;\n" + " mov.b32 %2, %12;\n" + " mov.b32 %3, %13;\n" + " mov.b32 %4, %14;\n" + " mov.b32 %5, %15;\n" + " mov.b32 %6, %16;\n" + " mov.b32 %7, %17;\n" + " @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%8];\n" + " @p ld.global.lu.v4.u32 {%4, %5, %6, %7}, [%18];\n" + "}\n" + : "=r"(data[0].x), "=r"(data[0].y), "=r"(data[0].z), "=r"(data[0].w), + "=r"(data[1].x), "=r"(data[1].y), "=r"(data[1].z), "=r"(data[1].w) + : "l"(ptr), "r"((int)pred_guard), "r"(data[0].x), "r"(data[0].y), + "r"(data[0].z), "r"(data[0].w), "r"(data[1].x), "r"(data[1].y), + "r"(data[1].z), "r"(data[1].w), "l"(((uint8_t *)ptr) + 16)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -133,7 +170,31 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint4 &data = reinterpret_cast(D); + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %5, 0;\n" + " mov.b32 %0, %6;\n" + " mov.b32 %1, %7;\n" + " mov.b32 %2, %8;\n" + " mov.b32 %3, %9;\n" + " @p ld.global.lu.v4.u32 {%0, %1, %2, %3}, [%4];\n" + "}\n" + : "=r"(data.x), "=r"(data.y), "=r"(data.z), "=r"(data.w) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y), "r"(data.z), "r"(data.w)); + } +}; + +template +struct global_load { CUTLASS_DEVICE 
global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -158,7 +219,30 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint2 &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %3, 0;\n" + " mov.b32 %0, %4;\n" + " mov.b32 %1, %5;\n" + " @p ld.global.lu.v2.u32 {%0, %1}, [%2];\n" + "}\n" + : "=r"(data.x), "=r"(data.y) + : "l"(ptr), "r"((int)pred_guard), "r"(data.x), "r"(data.y)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -182,7 +266,29 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + unsigned &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b32 %0, %3;\n" + " @p ld.global.lu.u32 %0, [%1];\n" + "}\n" + : "=r"(data) + : "l"(ptr), "r"((int)pred_guard), "r"(data)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { @@ -206,7 +312,29 @@ struct global_load struct global_load { + CUTLASS_DEVICE + global_load(AccessType &D, void const *ptr, bool pred_guard) { + uint16_t &data = reinterpret_cast(D); + + asm volatile( + "{\n" + " .reg .pred p;\n" + " setp.ne.b32 p, %2, 0;\n" + " mov.b16 %0, %3;\n" + " @p ld.global.lu.u16 %0, [%1];\n" + "}\n" + : "=h"(data) + : "l"(ptr), "r"((int)pred_guard), "h"(data)); + } +}; + +template +struct global_load { CUTLASS_DEVICE global_load(AccessType &D, void const *ptr, bool pred_guard) { diff --git a/include/cutlass/barrier.h b/include/cutlass/barrier.h index 78777147..b74e1038 100644 --- a/include/cutlass/barrier.h +++ b/include/cutlass/barrier.h @@ -81,7 +81,6 @@ struct Barrier CUTLASS_DEVICE static void red_release(int *ptr, int val) { -#if !defined(CUTLASS_PYTHON_HOST_CC) #if (__CUDA_ARCH__ >= 700) /// SM70 and newer use memory consistency qualifiers @@ -94,7 +93,6 @@ struct Barrier __threadfence(); atomicAdd(ptr, val); #endif // (__CUDA_ARCH__ >= 700) -#endif } @@ -104,7 +102,6 @@ struct Barrier CUTLASS_DEVICE static void wait_lt(void *lock_ptr, int thread_idx, int flag_idx, int count) { -#if !defined(CUTLASS_PYTHON_HOST_CC) T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; if (thread_idx == 0) @@ -115,14 +112,12 @@ struct Barrier } __syncthreads(); -#endif } /// Uses thread[0] to wait for at least the specified count of signals on the given flag counter CUTLASS_DEVICE static void wait_eq(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { -#if !defined(CUTLASS_PYTHON_HOST_CC) T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; if (thread_idx == 0) @@ -132,13 +127,11 @@ struct Barrier while(ld_acquire(flag_ptr) != val) {} } __syncthreads(); -#endif } /// Uses thread[0] to wait for the specified count of signals on the given flag counter CUTLASS_DEVICE static void wait_eq_reset(void *lock_ptr, int thread_idx, int flag_idx, T val = 1) { -#if !defined(CUTLASS_PYTHON_HOST_CC) T *flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; if (thread_idx == 0) @@ -149,14 +142,12 @@ struct Barrier } __syncthreads(); -#endif } /// Increment the arrival count for a flag CUTLASS_DEVICE static void arrive_inc(void *lock_ptr, int thread_idx, int flag_idx) { -#if !defined(CUTLASS_PYTHON_HOST_CC) T* flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; __syncthreads(); @@ -165,7 +156,6 @@ struct Barrier { red_release(flag_ptr, 1); } -#endif } @@ -173,7 +163,6 @@ 
struct Barrier CUTLASS_DEVICE static void arrive_range_inc(void *lock_ptr, int thread_idx, int first_flag_idx, int count = 1) { -#if !defined(CUTLASS_PYTHON_HOST_CC) int flag_idx = first_flag_idx + thread_idx; T* flag_ptr = reinterpret_cast(lock_ptr) + flag_idx; @@ -184,7 +173,6 @@ struct Barrier if (thread_idx < count) { red_release(flag_ptr, 1); } -#endif } }; diff --git a/include/cutlass/core_io.h b/include/cutlass/core_io.h index c0438a17..c0a96850 100644 --- a/include/cutlass/core_io.h +++ b/include/cutlass/core_io.h @@ -59,9 +59,7 @@ inline std::ostream &operator<<(std::ostream &out, dim3 d) { /// Output operator for CUDA built-in error type inline std::ostream &operator<<(std::ostream &out, cudaError_t error) { -#if !defined(CUTLASS_PYTHON_HOST_CC) return out << cudaGetErrorString(error); -#endif } /////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/epilogue/collective/builders/sm90_builder.inl b/include/cutlass/epilogue/collective/builders/sm90_builder.inl index f0d51bb4..b0549d46 100644 --- a/include/cutlass/epilogue/collective/builders/sm90_builder.inl +++ b/include/cutlass/epilogue/collective/builders/sm90_builder.inl @@ -290,7 +290,7 @@ struct CollectiveBuilder< AlignmentD, Schedule, cute::enable_if_t || - cute::is_same_v >> { + cute::is_same_v >> { public: // Passing void C disables source load using ElementC = cute::conditional_t, @@ -302,16 +302,33 @@ public: using ThreadOp = thread::LinearCombination< ElementD, AlignmentD, ElementAccumulator, ElementCompute, - thread::ScaleType::Default, FloatRoundStyle::round_to_nearest, ElementC>; + ScaleType, FloatRoundStyle::round_to_nearest, ElementC>; -private: - using Impl = detail::TmaBuilderImpl< - TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute, - ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD, - Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<1,2,true>>; + using GmemStrideTypeC = gemm::TagToStrideC_t; + using GmemStrideTypeD = gemm::TagToStrideC_t; -public: - using CollectiveOp = typename Impl::CollectiveOp; + using EpilogueTile_MN = decltype(detail::sm90_compute_tile_shape_or_override< + ElementD, EpilogueTileType, Schedule>()); + + static constexpr int StagesC = 1; + static constexpr int StagesD = 2; + static constexpr bool DisableReuseSmemC = true; + using CollectiveOp = cutlass::epilogue::collective::CollectiveEpilogue< + cutlass::epilogue::Sm90TmaWarpSpecialized, + TileShape_MNK, + EpilogueTile_MN, + ElementC_, // need to pass void to expose via GemmUniversal + GmemStrideTypeC, + ElementD, + GmemStrideTypeD, + ThreadOp, + SM90_TMA_LOAD, + decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), + decltype(detail::sm90_get_smem_load_op_for_source()), + SM90_TMA_STORE, + decltype(detail::sm90_get_epilogue_smem_swizzle_layout_atom()), + decltype(detail::sm90_get_smem_store_op_for_accumulator()) + >; }; // Auto builder @@ -409,7 +426,7 @@ struct CollectiveBuilder< AlignmentD, Schedule, cute::enable_if_t || - cute::is_base_of_v >> { + cute::is_base_of_v >> { public: using ThreadOp = thread::LinearCombinationGeneric< @@ -419,10 +436,13 @@ public: Schedule::Round>; private: + static constexpr int StagesC = 1; + static constexpr int StagesD = 2; + static constexpr bool DisableReuseSmemC = true; using Impl = detail::TmaBuilderImpl< TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute, ElementC, GmemLayoutTagC, AlignmentC, ElementD, 
GmemLayoutTagD, AlignmentD, - Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized<1,2,true>>; + Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecialized>; public: using CollectiveOp = typename Impl::CollectiveOp; @@ -459,7 +479,7 @@ struct CollectiveBuilder< AlignmentD, Schedule, cute::enable_if_t || - cute::is_base_of_v >> { + cute::is_base_of_v >> { public: using ThreadOp = thread::LinearCombinationBiasElementwise< @@ -468,10 +488,12 @@ public: Schedule::StoreT, typename Schedule::ElementBias>; private: + static constexpr int StagesC = 1; + static constexpr int StagesD = 2; using Impl = detail::TmaBuilderImpl< TileShape_MNK, ClusterShape_MNK, EpilogueTileType, ElementAccumulator, ElementCompute, ElementC, GmemLayoutTagC, AlignmentC, ElementD, GmemLayoutTagD, AlignmentD, - Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecializedBiasElementwise<1,2>>; + Schedule, ThreadOp, cutlass::epilogue::Sm90TmaWarpSpecializedBiasElementwise>; public: using CollectiveOp = typename Impl::CollectiveOp; diff --git a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp index 5654597e..bbeb181e 100644 --- a/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp +++ b/include/cutlass/epilogue/collective/sm90_epilogue_tma_warpspecialized.hpp @@ -82,7 +82,6 @@ class CollectiveEpilogue< // // Type Aliases // - // derived types of output thread level operator using DispatchPolicy = Sm90TmaWarpSpecialized; using BlockTileShape = BlockTileShape_; using EpilogueTile = EpilogueTile_; @@ -108,7 +107,6 @@ class CollectiveEpilogue< constexpr static bool iskThreadEpilogueOpWithBias = detail::IsThreadEpilogueOpWithBias::value; using AlignmentType = typename uint_bit::value * kOutputAlignment>::type; - static_assert(sizeof(ElementC) == 2, "Only 16b source supported for now"); static_assert(sizeof(ElementD) == 2, "Only 16b output supported for now"); static_assert(!is_layout::value && is_tuple::value, "EpilogueTile must be a cute::Tile or cute::Shape"); static_assert(rank(BlockTileShape{}) == 3, "BlockTileShape must be rank-3: [BLK_M,BLK_N,BLK_K]"); @@ -117,17 +115,19 @@ class CollectiveEpilogue< static_assert(rank(StrideD{}) == 3, "StrideCD must be rank-3: [M, N, L]"); private: + using InternalElementC = std::conditional_t,ElementD,ElementC>; // prevents void ref breakages constexpr static int StagesC = StagesC_; constexpr static int StagesD = StagesD_; constexpr static bool is_source_supported = ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default || ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::NoBetaScaling; + static_assert((std::is_void_v && not is_source_supported) || (not std::is_void_v && is_source_supported)); // internal optimization to reuse C shared memory for storing D - using SmemLayoutAtomBitsC = decltype(downcast::value>(SmemLayoutAtomC{})); + using SmemLayoutAtomBitsC = decltype(downcast::value>(SmemLayoutAtomC{})); using SmemLayoutAtomBitsD = decltype(downcast::value>(SmemLayoutAtomD{})); constexpr static bool ReuseSmemC = not DispatchPolicy::DisableSmemReuseC && is_source_supported && - sizeof(ElementC) == sizeof(ElementD) && + sizeof(InternalElementC) == sizeof(ElementD) && StrideC{} == StrideD{} && cute::is_same_v; @@ -152,7 +152,7 @@ class CollectiveEpilogue< using LoadPipeline = cutlass::PipelineTransactionAsync; using LoadPipelineState = cutlass::PipelineState; constexpr static uint32_t TmaTransactionBytes = - 
size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof(ElementC)); + size(take<0,2>(SmemLayoutC{})) * static_cast(sizeof(InternalElementC)); // TMA pipeline for storing D using StorePipeline = cutlass::PipelineTmaStore; @@ -161,8 +161,8 @@ class CollectiveEpilogue< struct SharedStorage { struct TensorStorage : aligned_struct<128> { cute::conditional_t, - array_aligned> smem_C; + detail::EmptyStorage, + array_aligned> smem_C; alignas(128) cute::conditional_t, array_aligned> smem_D; @@ -187,7 +187,7 @@ class CollectiveEpilogue< struct Params { using TMA_C = decltype(make_tma_copy( CopyOpG2S{}, - make_tensor(static_cast(nullptr), + make_tensor(static_cast(nullptr), repeat_like(StrideC{}, int32_t(0)), StrideC{}), SmemLayoutC{}(_,_,0))); using TMA_D = decltype(make_tma_copy( @@ -217,7 +217,7 @@ class CollectiveEpilogue< auto M = get<0>(problem_shape_MNKL); auto N = get<1>(problem_shape_MNKL); auto L = get<3>(problem_shape_MNKL); - Tensor tensor_c = make_tensor(args.ptr_C, make_layout(make_shape(M,N,L), args.dC)); + Tensor tensor_c = make_tensor(static_cast(args.ptr_C), make_layout(make_shape(M,N,L), args.dC)); Tensor tensor_d = make_tensor(args.ptr_D, make_layout(make_shape(M,N,L), args.dD)); typename Params::TMA_C tma_load_c = make_tma_copy( CopyOpG2S{}, @@ -409,7 +409,7 @@ class CollectiveEpilogue< // Allocate register tensors auto tRS_rD_shape = take<0,3>(shape(thread_r2s.partition_S(bEsD))); // (R2S,R2S_M,R2S_N) - Tensor tRS_rC = make_tensor(tRS_rD_shape); // (R2S,R2S_M,R2S_N) + Tensor tRS_rC = make_tensor(tRS_rD_shape); // (R2S,R2S_M,R2S_N) Tensor tRS_rD = make_tensor(tRS_rD_shape); // (R2S,R2S_M,R2S_N) // Vectorized fragment view for thread epilogue op @@ -418,7 +418,7 @@ class CollectiveEpilogue< Tensor tRS_rD_frg = recast(tRS_rD); // Partition for smem to register copy (tSR_) - TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_r2s); + TiledCopy tiled_s2r = make_tiled_copy_S(Copy_Atom{}, tiled_r2s); ThrCopy thread_s2r = tiled_s2r.get_slice(thread_idx); Tensor tSR_sC = thread_s2r.partition_S(bEsC); // (S2R,S2R_M,S2R_N,EPI_M,EPI_N) Tensor tSR_rC = thread_s2r.retile_D(tRS_rC); // (S2R,S2R_M,S2R_N) diff --git a/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp b/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp index e78eb76c..c89f2889 100644 --- a/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp +++ b/include/cutlass/epilogue/thread/linear_combination_tensor_broadcast.hpp @@ -130,6 +130,7 @@ class LinearCombinationTensorBroadcast { using ActivationFunctor = ActivationFunctor_; static constexpr int kCount = 1; + static constexpr ScaleType::Kind kScale = Scale; using FragmentOutput = Array; using FragmentAccumulator = Array; diff --git a/include/cutlass/epilogue/threadblock/epilogue.h b/include/cutlass/epilogue/threadblock/epilogue.h index 4cba4a60..61d961df 100644 --- a/include/cutlass/epilogue/threadblock/epilogue.h +++ b/include/cutlass/epilogue/threadblock/epilogue.h @@ -323,7 +323,7 @@ class Epilogue : OutputTileIterator destination_iterator, ///< Tile iterator for destination OutputTileIterator source_iterator) ///< Threadblock tile coordinate in GEMM (in units of threadblock tiles) { - // Redcuce peer accumulator fragments into one fragment + // Reduce peer accumulator fragments into one fragment AccumulatorFragment accum_fragment; BaseStreamK::reduce(accum_fragment, peer_idx_begin, peer_idx_end, reduce_fragment_idx, element_workspace); diff --git 
a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp index a80a6dbd..faf2857a 100644 --- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_rs_warpspecialized.hpp @@ -190,7 +190,8 @@ struct CollectiveMma< "SmemLayoutB K must be 128bytes to be transposed."); static_assert(!transform::collective::detail::use_universal_transposition(), "Warp specialized ARF kernels have not supported universal B transposition yet."); - static_assert(!TransposeB || shape<0>(TileShape{}) == 64, "Optimized transpose RS kernel requires TileShape M = 64."); + static_assert(!TransposeB || !cute::is_same_v, + "Transpose RS kernel requires kernel schedule schmem is not KernelTmaWarpSpecializedCooperative."); struct SharedStorage { @@ -294,7 +295,7 @@ struct CollectiveMma< static constexpr int K_PIPE_MAX = DispatchPolicy::Stages; static constexpr int K_PIPE_MMAS = DispatchPolicy::PipelineAsyncMmaStages; - static_assert(K_PIPE_MMAS >= 1, "At least one MMA stage should be asynchronous for this mainloop."); + static_assert(K_PIPE_MMAS == 0, "no MMA stage should be asynchronous for this mainloop for now."); static constexpr uint32_t TmaTransactionBytes = (size<0>(SmemLayoutA{}) * size<1>(SmemLayoutA{}) * static_cast(sizeof(InternalElementA)))+ (size<0>(SmemLayoutB{}) * size<1>(SmemLayoutB{}) * static_cast(sizeof(InternalElementB))); @@ -368,21 +369,6 @@ struct CollectiveMma< } } - // Issue the prologue loads - int k_tile_prologue = min(k_tile_count, K_PIPE_MAX); - CUTLASS_PRAGMA_UNROLL - for (int count = 0; count < k_tile_prologue; ++count) { - pipeline.producer_acquire(smem_pipe_write); - using BarrierType = typename MainloopPipeline::ProducerBarrierType; - BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); - - int write_stage = smem_pipe_write.index(); - copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); - copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); - ++k_tile_iter; - ++smem_pipe_write; - } - k_tile_count -= k_tile_prologue; // Mainloop CUTLASS_PRAGMA_NO_UNROLL for ( ; k_tile_count > 0; --k_tile_count) { diff --git a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp index 01638c52..39c1a17a 100644 --- a/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp +++ b/include/cutlass/gemm/collective/sm90_mma_tma_gmma_ss_warpspecialized.hpp @@ -303,22 +303,6 @@ struct CollectiveMma< } } - // Issue the prologue loads - int k_tile_prologue = min(k_tile_count, K_PIPE_MAX); - CUTLASS_PRAGMA_UNROLL - for (int count = 0; count < k_tile_prologue; ++count) { - pipeline.producer_acquire(smem_pipe_write); - using BarrierType = typename MainloopPipeline::ProducerBarrierType; - BarrierType* tma_barrier = pipeline.producer_get_barrier(smem_pipe_write); - - int write_stage = smem_pipe_write.index(); - copy(tma_load_a.with(*tma_barrier, mcast_mask_a), tAgA(_,_,_,*k_tile_iter), tAsA(_,_,_,write_stage)); - copy(tma_load_b.with(*tma_barrier, mcast_mask_b), tBgB(_,_,_,*k_tile_iter), tBsB(_,_,_,write_stage)); - ++k_tile_iter; - ++smem_pipe_write; - } - k_tile_count -= k_tile_prologue; - // Mainloop CUTLASS_PRAGMA_NO_UNROLL for ( ; k_tile_count > 0; --k_tile_count) diff --git a/include/cutlass/gemm/device/base_grouped.h 
b/include/cutlass/gemm/device/base_grouped.h index 207266c7..f3094e90 100644 --- a/include/cutlass/gemm/device/base_grouped.h +++ b/include/cutlass/gemm/device/base_grouped.h @@ -301,18 +301,19 @@ class BaseGrouped { return 0; } - result = cudaGetDeviceProperties(&properties, device_idx); + int multiprocessor_count; + result = cudaDeviceGetAttribute(&multiprocessor_count, + cudaDevAttrMultiProcessorCount, device_idx); if (result != cudaSuccess) { - // Call cudaGetLastError() to clear the error bit - result = cudaGetLastError(); - CUTLASS_TRACE_HOST(" cudaGetDeviceProperties() returned error " - << cudaGetErrorString(result)); + CUTLASS_TRACE_HOST( + " cudaDeviceGetAttribute() returned error " + << cudaGetErrorString(result)); return 0; } - bool override_sm_count = (available_sm_count < 0 || available_sm_count > properties.multiProcessorCount); + bool override_sm_count = (available_sm_count < 0 || available_sm_count > multiprocessor_count); if (override_sm_count) { - available_sm_count = properties.multiProcessorCount; + available_sm_count = multiprocessor_count; } int max_active_blocks = maximum_active_blocks(); @@ -440,8 +441,6 @@ class BaseGrouped { cudaError_t result = cudaGetLastError(); if (result != cudaSuccess) { - // Call cudaGetLastError() to clear the error bit - result = cudaGetLastError(); CUTLASS_TRACE_HOST(" grid launch failed with error " << cudaGetErrorString(result)); return Status::kErrorInternal; } diff --git a/include/cutlass/gemm/device/default_gemm_configuration.h b/include/cutlass/gemm/device/default_gemm_configuration.h index 8d193c5b..f4ce5851 100644 --- a/include/cutlass/gemm/device/default_gemm_configuration.h +++ b/include/cutlass/gemm/device/default_gemm_configuration.h @@ -490,9 +490,9 @@ struct DefaultGemmConfiguration; - using WarpShape = GemmShape<64, 64, 64>; - using InstructionShape = GemmShape<16, 8, 16>; + using ThreadblockShape = GemmShape<128, 128, 16>; + using WarpShape = GemmShape<32, 64, 16>; + using InstructionShape = GemmShape<8, 8, 4>; static int const kStages = 3; using EpilogueOutputOp = epilogue::thread::LinearCombination< diff --git a/include/cutlass/gemm/device/gemm_universal_with_broadcast.h b/include/cutlass/gemm/device/gemm_universal_with_broadcast.h index 54b7d61c..e0940f14 100644 --- a/include/cutlass/gemm/device/gemm_universal_with_broadcast.h +++ b/include/cutlass/gemm/device/gemm_universal_with_broadcast.h @@ -31,7 +31,7 @@ /*! \file \brief Template for a GEMM kernel that can broadcast bias vector in the - epigloue. + epilogue. */ #pragma once diff --git a/include/cutlass/gemm/device/gemv_strided_batched.h b/include/cutlass/gemm/device/gemv_strided_batched.h new file mode 100644 index 00000000..005afda9 --- /dev/null +++ b/include/cutlass/gemm/device/gemv_strided_batched.h @@ -0,0 +1,167 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. 
Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/numeric_types.h" +#include "cutlass/arch/arch.h" +#include "cutlass/device_kernel.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/gemm/threadblock/threadblock_swizzle.h" +#include "cutlass/gemm/kernel/gemm_universal.h" + +#include "cutlass/gemm/kernel/default_gemm_universal.h" +#include "cutlass/gemm/device/default_gemm_configuration.h" +#include "cutlass/gemm/device/gemm_universal_base.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace device { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +class GemvStridedBatched { +public: + + using GemvKernel = GemvKernel_; + + using ElementA = typename GemvKernel::ElementA; + using LayoutA = typename GemvKernel::LayoutA; + using ElementB = typename GemvKernel::ElementB; + using ElementC = typename GemvKernel::ElementC; + + using ElementAccumulator = typename GemvKernel::ElementAccumulator; + + using EpilogueOutputOp = typename GemvKernel::EpilogueOutputOp; + + static ComplexTransform const kTransformA = GemvKernel::kTransformA; + static ComplexTransform const kTransformB = GemvKernel::kTransformB; + + static int const kThreadCount = GemvKernel::kThreadCount; + static int const mThreadCount = GemvKernel::mThreadCount; + + static int const kStages = GemvKernel::kStages; + + static int const kAlignmentA = GemvKernel::kAlignmentA; + static int const kAlignmentB = GemvKernel::kAlignmentB; + static int const kAlignmentC = GemvKernel::kAlignmentC; + + using Arguments = typename GemvKernel::Arguments; + using Params = typename GemvKernel::Params; + +private: + + Params params_; + +public: + + /// Constructs the Gemv. + GemvStridedBatched() {} + + /// Determines whether the Gemv can execute the given problem. + static Status can_implement(Arguments const& args) { + return GemvKernel::can_implement(args); + } + + /// Gets the workspace size + static size_t get_workspace_size(Arguments const& args) { return 0; } + + /// Initializes Gemv state from arguments. 
+ Status initialize(Arguments const &args, void *workspace = nullptr, cudaStream_t stream = nullptr) { + params_ = Params(args); + + if (args.problem_size.column() % GemvKernel::kElementsPerAccess) { + return Status::kErrorMisalignedOperand; + } + + return Status::kSuccess; + } + + /// Lightweight update given a subset of arguments + Status update(Arguments const &args, void *workspace = nullptr) { + return params_.update(args); + } + + /// Runs the kernel using initialized state. + Status run(cudaStream_t stream = nullptr) { + + dim3 grid(1, 1, params_.batch_count % 65536); + dim3 block(kThreadCount, mThreadCount, 1); + + int smem_size = 0; + + // Launch + cutlass::Kernel<<>>(params_); + + // + // Query for errors + // + + cudaError_t result = cudaGetLastError(); + + return result == cudaSuccess ? Status::kSuccess : Status::kErrorInternal; + } + + /// Runs the kernel using initialized state. + Status operator()(cudaStream_t stream = nullptr) { return run(stream); } + + /// Runs the kernel using initialized state. + Status operator()( + Arguments const &args, + void *workspace = nullptr, + cudaStream_t stream = nullptr) { + + Status status = initialize(args, workspace, stream); + + if (status == Status::kSuccess) { + status = run(stream); + } + + return status; + } +}; + +//////////////////////////////////////////////////////////////////////////////// + +} // namespace device +} // namespace gemm +} // namespace cutlass + +//////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/dispatch_policy.hpp b/include/cutlass/gemm/dispatch_policy.hpp index 8de19d46..aee918ee 100644 --- a/include/cutlass/gemm/dispatch_policy.hpp +++ b/include/cutlass/gemm/dispatch_policy.hpp @@ -150,7 +150,7 @@ template< int Stages_, class ClusterShape_ = Shape<_1,_1,_1>, class KernelSchedule = KernelTmaWarpSpecialized, - int PipelineAsyncMmaStages_ = 1 + int PipelineAsyncMmaStages_ = 0 > struct MainloopSm90TmaGmmaRmemAWarpSpecialized { constexpr static int Stages = Stages_; diff --git a/include/cutlass/gemm/kernel/gemv_strided_batched.h b/include/cutlass/gemm/kernel/gemv_strided_batched.h new file mode 100644 index 00000000..ddd42c26 --- /dev/null +++ b/include/cutlass/gemm/kernel/gemv_strided_batched.h @@ -0,0 +1,368 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! \file + \brief +*/ + +#pragma once + +#include "cutlass/cutlass.h" +#include "cutlass/fast_math.h" +#include "cutlass/matrix_coord.h" +#include "cutlass/complex.h" +#include "cutlass/tensor_ref.h" + +#include "cutlass/arch/memory.h" +#include "cutlass/arch/cache_operation.h" + +#include "cutlass/gemm/gemm.h" +#include "cutlass/layout/matrix.h" + +#include "cutlass/numeric_conversion.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass { +namespace gemm { +namespace kernel { + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template < + typename ElementA_, /// matrix + typename LayoutA_, + typename ElementB_, /// vector + typename ElementC_, + typename ElementAccumulator_, + int kElementsPerAccess_, + typename EpilogueOutputOp_ +> +struct GemvStridedBatched { +public: + + using ElementA = ElementA_; + using LayoutA = layout::RowMajor; + using TensorRefA = TensorRef; + + static_assert(std::is_same::value, + "Only supported for row-major A matrix"); + + using ElementB = ElementB_; + using ElementC = ElementC_; + + using ElementAccumulator = ElementAccumulator_; + using EpilogueOutputOp = EpilogueOutputOp_; + + static ComplexTransform const kTransformA = ComplexTransform::kNone; + static ComplexTransform const kTransformB = ComplexTransform::kNone; + + static FloatRoundStyle const Round = cutlass::FloatRoundStyle::round_to_nearest; + + // number of return elements in a global access + static int const kElementsPerAccess = kElementsPerAccess_; + + using FragmentA = Array; + using FragmentB = Array; + using FragmentCompute = Array; + + // thread block shape (kThreadCount, mThreadCount) + static int const kThreadCount = std::min(static_cast(128 / (kElementsPerAccess * sizeof(ElementA))), 16); + static int const mThreadCount = 128 / kThreadCount; + + // rolling tile shape + static int const kTileA = kThreadCount * kElementsPerAccess; + static int const mTileA = mThreadCount * 8; + + // + // Structures + // + + /// Argument structure + struct Arguments + { + MatrixCoord problem_size; + int32_t batch_count; + typename EpilogueOutputOp::Params output_op; + + TensorRefA ref_A; + + ElementB const *ptr_B; + ElementC const *ptr_C; + ElementC *ptr_D; + + int64_t batch_stride_A; + int64_t batch_stride_B; + int64_t batch_stride_C; + int64_t batch_stride_D; + + // + // Methods + // + + Arguments() : batch_count(0) {} + + Arguments( + MatrixCoord problem_size, + int32_t batch_count, + + typename EpilogueOutputOp::Params output_op, + TensorRefA ref_A, + void const *ptr_B, + void const *ptr_C, + void *ptr_D, + + int64_t batch_stride_A, + int64_t batch_stride_B, + int64_t batch_stride_C, + int64_t batch_stride_D) : problem_size(problem_size), + batch_count(batch_count), + output_op(output_op), + ref_A(ref_A), + ptr_B(static_cast(ptr_B)), + 
ptr_C(static_cast(ptr_C)), + ptr_D(static_cast(ptr_D)), + + batch_stride_A(batch_stride_A), + batch_stride_B(batch_stride_B), + batch_stride_C(batch_stride_C), + batch_stride_D(batch_stride_D) + { + } + + Arguments( + MatrixCoord problem_size, + typename EpilogueOutputOp::Params output_op, + TensorRefA ref_A, + void const *ptr_B, + void const *ptr_C, + void *ptr_D) : Arguments(problem_size, + 1, + 1, + output_op, + ref_A, + ptr_B, + ptr_C, + ptr_D, + 1, + 1, + 1, + 1) + { + } + + Status update(Arguments const &args) + { + problem_size = args.problem_size; + batch_count = args.batch_count; + output_op = args.output_op; + ref_A = ref_A; + ptr_B = args.ptr_B; + ptr_C = args.ptr_C; + ptr_D = args.ptr_D; + batch_stride_A = args.batch_stride_A; + batch_stride_B = args.batch_stride_B; + batch_stride_C = args.batch_stride_C; + batch_stride_D = args.batch_stride_D; + + return Status::kSuccess; + } + }; + + using Params = Arguments; + + /// Shared memory storage structure + union SharedStorage + { + }; + +public: + // + // Methods + // + + CUTLASS_DEVICE + GemvStridedBatched() {} + + /// Determines whether kernel satisfies alignment + static Status can_implement(cutlass::MatrixCoord const &problem_size) + { + if (problem_size.column() % kElementsPerAccess != 0) + return Status::kErrorMisalignedOperand; + return Status::kSuccess; + } + + static Status can_implement(Arguments const &args) + { + return can_implement(args.problem_size); + } + + /// Executes one GEMV + CUTLASS_DEVICE + void operator()(Params const ¶ms, SharedStorage &shared_storage) + { + // Loop over batch indices + for (int batch_idx = blockIdx.z; batch_idx < params.batch_count; batch_idx += gridDim.z) + { + int k_col_id = threadIdx.x; + int m_row_id = threadIdx.y; + + // problem_size (row = m, column = k) + // matrix A (batch, m, k) + // vector B (batch, 1, k) + // vector C (batch, m, 1) + // vector D (batch, m, 1) + + // move in the batch dimension + ElementA const *ptr_A = params.ref_A.data() + batch_idx * params.batch_stride_A; + ElementB const *ptr_B = params.ptr_B + batch_idx * params.batch_stride_B; + + ElementC const *ptr_C = params.ptr_C + batch_idx * params.batch_stride_C; + ElementC *ptr_D = params.ptr_D + batch_idx * params.batch_stride_D; + + // move in the k dimension + ptr_A += k_col_id * kElementsPerAccess; + ptr_B += k_col_id * kElementsPerAccess; + + // move in the m dimension + ptr_A += m_row_id * params.problem_size.column(); + ptr_C += m_row_id; + ptr_D += m_row_id; + + NumericArrayConverter srcA_converter; + NumericArrayConverter srcB_converter; + + for (; m_row_id < params.problem_size.row(); m_row_id += mTileA) + { + ElementAccumulator accum[mTileA / mThreadCount] = {0.f}; + + FragmentB fragB; + FragmentA fragA[mTileA / mThreadCount]; + + int mElemCountPerTile = min(mTileA / mThreadCount, (params.problem_size.row() - m_row_id - 1) / mThreadCount + 1); + + int kUnroll = 0; + + for (; kUnroll < params.problem_size.column() / kTileA * kTileA; kUnroll += kTileA) + { + for (int m = 0; m < mElemCountPerTile; m++) + { + // fetch from matrix A + arch::global_load(fragA[m], (ptr_A + kUnroll + m * mThreadCount * params.problem_size.column()), true); + } + + // fetch from vector B + arch::global_load(fragB, (ptr_B + kUnroll), true); + + for (int m = 0; m < mElemCountPerTile; m++) + { + FragmentCompute fragB_Compute = srcB_converter(fragB); + FragmentCompute fragA_Compute = srcA_converter(fragA[m]); + + // Math + CUTLASS_PRAGMA_UNROLL + for (int e = 0; e < kElementsPerAccess; e++) + { + accum[m] += fragA_Compute.at(e) * 
fragB_Compute.at(e); + } + } + } + + // calculate the rest of K elements + // each thread fetch 1 element each time + for (int k = kUnroll + k_col_id; k < params.problem_size.column(); k += kThreadCount) + { + ElementB b = *(ptr_B - k_col_id * kElementsPerAccess + k); + for (int m = 0; m < mElemCountPerTile; m++) + { + ElementA a = *(ptr_A - k_col_id * kElementsPerAccess + k + m * mThreadCount * params.problem_size.column()); + accum[m] += ElementAccumulator(a) * ElementAccumulator(b); + } + } + + EpilogueOutputOp output_op(params.output_op); + typename EpilogueOutputOp::FragmentOutput source_fragment[mTileA / mThreadCount]; + + // prefetch from source matrix C + if (output_op.is_source_needed()) + { + for (int m = 0; m < mElemCountPerTile; m++) + { + source_fragment[m][0] = *(ptr_C + m * mThreadCount); + } + } + + typename EpilogueOutputOp::FragmentAccumulator accum_fragment; + typename EpilogueOutputOp::FragmentOutput output_fragment; + + for (int m = 0; m < mElemCountPerTile; m++) + { + for (int mask = (kThreadCount >> 1); mask > 0; mask >>= 1) + { + accum[m] += __shfl_xor_sync(0xFFFFFFFF, accum[m], mask, 32); + } + + if (k_col_id == 0) + { + accum_fragment[0] = accum[m]; + + if (output_op.is_source_needed()) + { + output_fragment = output_op(accum_fragment, source_fragment[m]); + } + else + { + output_fragment = output_op(accum_fragment); + } + + *(ptr_D + m * mThreadCount) = output_fragment[0]; + } + } + + ptr_A += mTileA * params.problem_size.column(); + ptr_C += mTileA; + ptr_D += mTileA; + } + } + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace kernel +} // namespace gemm +} // namespace cutlass + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp index 768c780c..69f4bb7b 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma.hpp @@ -166,8 +166,8 @@ class GemmUniversal< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n"); return implementable; } - static constexpr int tma_alignment_bits = 128; - static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; + constexpr int tma_alignment_bits = 128; + constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; auto M = get<0>(args.problem_shape); auto N = get<1>(args.problem_shape); auto K = get<2>(args.problem_shape); @@ -182,7 +182,17 @@ class GemmUniversal< N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0)); if (!implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return implementable; + } + + constexpr bool is_beta_supported = + CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default; + implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n"); + return implementable; } + return implementable; } diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp index d6619feb..38db8e0f 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp 
+++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized.hpp @@ -173,8 +173,8 @@ class GemmUniversal< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n"); return implementable; } - static constexpr int tma_alignment_bits = 128; - static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; + constexpr int tma_alignment_bits = 128; + constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; auto M = get<0>(args.problem_shape); auto N = get<1>(args.problem_shape); auto K = get<2>(args.problem_shape); @@ -189,7 +189,17 @@ class GemmUniversal< N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0)); if (!implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return implementable; + } + + constexpr bool is_beta_supported = + CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default; + implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n"); + return implementable; } + return implementable; } diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp index 3b8e61e0..fe22b0df 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_cooperative.hpp @@ -196,8 +196,8 @@ class GemmUniversal< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n"); return implementable; } - static constexpr int tma_alignment_bits = 128; - static constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; + constexpr int tma_alignment_bits = 128; + constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; auto M = get<0>(args.problem_shape); auto N = get<1>(args.problem_shape); auto K = get<2>(args.problem_shape); @@ -212,7 +212,17 @@ class GemmUniversal< N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0)); if (!implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return implementable; + } + + constexpr bool is_beta_supported = + CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default; + implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n"); + return implementable; } + return implementable; } diff --git a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp index af619b96..82cdf918 100644 --- a/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp +++ b/include/cutlass/gemm/kernel/sm90_gemm_tma_warpspecialized_pingpong.hpp @@ -204,8 +204,8 @@ class GemmUniversal< CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Arguments or Problem Size don't meet the requirements.\n"); return implementable; } - static constexpr int tma_alignment_bits = 128; - static constexpr int min_tma_aligned_elements = 
tma_alignment_bits / cutlass::sizeof_bits::value; + constexpr int tma_alignment_bits = 128; + constexpr int min_tma_aligned_elements = tma_alignment_bits / cutlass::sizeof_bits::value; auto M = get<0>(args.problem_shape); auto N = get<1>(args.problem_shape); auto K = get<2>(args.problem_shape); @@ -220,7 +220,17 @@ class GemmUniversal< N % min_tma_aligned_elements == 0 : M % min_tma_aligned_elements == 0)); if (!implementable) { CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Problem Size doesn't meet the minimum alignment requirements for TMA.\n"); + return implementable; + } + + constexpr bool is_beta_supported = + CollectiveEpilogue::ThreadEpilogueOp::kScale == cutlass::epilogue::thread::ScaleType::Default; + implementable = is_beta_supported || (args.epilogue.thread.beta == 0 && args.epilogue.thread.beta_ptr == nullptr); + if (!implementable) { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Scaling params don't meet ThreadEpilogueOp requirements.\n"); + return implementable; } + return implementable; } diff --git a/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp b/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp index c1c47020..910a3f1d 100644 --- a/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp +++ b/include/cutlass/gemm/kernel/sm90_tile_scheduler.hpp @@ -163,6 +163,12 @@ class PersistentTileSchedulerSm90 { int const min_num_gpc = sm_count < max_sm_per_gpc ? 1 : sm_count / max_sm_per_gpc; int const max_blk_occupancy_per_gpc = max_sm_per_gpc - (max_sm_per_gpc % size(cluster_shape)); int blk_per_device = min_num_gpc * max_blk_occupancy_per_gpc; + + // The calculation below allows for larger grid size launch for different GPUs. + int const num_gpc_residual = sm_count < max_sm_per_gpc ? 0 : sm_count % max_sm_per_gpc; + int const max_blk_occupancy_per_residual_gpc = num_gpc_residual - (num_gpc_residual % size(cluster_shape)); + blk_per_device += max_blk_occupancy_per_residual_gpc; + blk_per_device = sm_count < blk_per_device ? 
sm_count : blk_per_device; launch_grid.x = std::min( diff --git a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h index a1d522bc..f0468f89 100644 --- a/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h +++ b/include/cutlass/gemm/threadblock/threadblock_swizzle_streamk.h @@ -630,9 +630,6 @@ struct ThreadblockSwizzleStreamK { } -// Guards needed for PyCUTLASS library generation -#if !defined(CUTLASS_PYTHON_HOST_CC) - // // Device-side interface // @@ -692,7 +689,7 @@ struct ThreadblockSwizzleStreamK { return GemmCoord(m, n, get_batch_idx()); } - /// Obtains the calling threadblock's tiled coordinates for the given tile index (row-major rastorization) + /// Obtains the calling threadblock's tiled coordinates for the given tile index (row-major rasterization) CUTLASS_DEVICE GemmCoord get_tile_offset_row_major(int tile_idx) const { @@ -740,7 +737,7 @@ struct ThreadblockSwizzleStreamK { div_mod_sk_iters_per_region(region_idx, iter_in_region, iter); int big_block_iters = (sk_big_blocks_per_region * sk_iters_per_normal_block()) + sk_big_blocks_per_region; // number of iterations in the region's big blocks - int normal_block_iters = iter_in_region - big_block_iters; // number of iterations in the region's normal bocks + int normal_block_iters = iter_in_region - big_block_iters; // number of iterations in the region's normal blocks int big_block_idx_in_region = div_mod_sk_iters_per_big_block.div(iter_in_region); int normal_block_idx_in_region = sk_big_blocks_per_region + div_mod_sk_iters_per_normal_block.div(normal_block_iters); @@ -794,8 +791,6 @@ struct ThreadblockSwizzleStreamK { return get_sk_block_idx(iter); } -#endif // !defined(CUTLASS_PYTHON_HOST_CC) - }; ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/include/cutlass/pipeline/sm90_pipeline.hpp b/include/cutlass/pipeline/sm90_pipeline.hpp index d90a7f14..807a1399 100644 --- a/include/cutlass/pipeline/sm90_pipeline.hpp +++ b/include/cutlass/pipeline/sm90_pipeline.hpp @@ -450,7 +450,7 @@ private : CUTLASS_DEVICE void consumer_wait(uint32_t stage, uint32_t phase, ConsumerToken barrier_token) { if (barrier_token == BarrierStatus::WaitAgain) { - consumer_wait(stage, phase); + full_barrier_ptr_[stage].wait(phase); } } @@ -654,7 +654,7 @@ public : consumer_release(state.index()); } -protected: +private: FullBarrier *full_barrier_ptr_ = nullptr; EmptyBarrier *empty_barrier_ptr_ = nullptr; Params params_; @@ -976,6 +976,11 @@ private : ++stage_; } + CUTLASS_DEVICE + void advance() { + ++stage_; + } + private: CUTLASS_DEVICE diff --git a/include/cutlass/semaphore.h b/include/cutlass/semaphore.h index 27343f9e..afb11bd2 100644 --- a/include/cutlass/semaphore.h +++ b/include/cutlass/semaphore.h @@ -89,19 +89,16 @@ class Semaphore { /// Waits until the semaphore is equal to the given value CUTLASS_DEVICE void wait(int status = 0) { -#if !defined(CUTLASS_PYTHON_HOST_CC) while( __syncthreads_and(state != status) ) { fetch(); } __syncthreads(); -#endif } /// Updates the lock with the given result CUTLASS_DEVICE void release(int status = 0) { -#if !defined(CUTLASS_PYTHON_HOST_CC) __syncthreads(); if (wait_thread) { @@ -111,7 +108,6 @@ class Semaphore { asm volatile ("st.global.cg.b32 [%0], %1;\n" : : "l"(lock), "r"(status)); #endif } -#endif } }; diff --git a/media/docs/cute/01_layout.md b/media/docs/cute/01_layout.md index 7f7c6f20..c1a25ac1 100644 --- a/media/docs/cute/01_layout.md +++ 
b/media/docs/cute/01_layout.md @@ -3,8 +3,8 @@ ## Layout This document describes `Layout`, CuTe's core abstraction. -A `Layout` maps from (a) logical coordinate space(s) -to a physical index space. +A `Layout` maps from a logical coordinate space +to an index space. `Layout`s present a common interface to multidimensional array access that abstracts away the details of how the array's elements are organized in memory. @@ -19,7 +19,11 @@ This can help users do things like partition layouts of data over layouts of thr ## Layouts and Tensors -Any of the `Layout`s discussed in this section can be composed with data -- a pointer or an array -- to create a `Tensor`. The responsibility of the `Layout` is to define valid coordinate space(s) and, therefore, the logical shape of the data and map those into an index space. The index space is precisely the offset that would be used to index into the array of data. +Any of the `Layout`s discussed in this section can be composed with data -- e.g., a pointer or an array -- to create a `Tensor`. +The `Layout`'s logical coordinate space represents the logical "shape" of the data, +e.g., the modes of the `Tensor` and their extents. +The `Layout` maps a logical coordinate into an index, +which is an offset to be used to index into the array of data. For details on `Tensor`, please refer to the [`Tensor` section of the tutorial](./03_tensor.md). @@ -31,31 +35,31 @@ Both `Shape` and `Stride` are `IntTuple` types. ### IntTuple -An `IntTuple` is an integer or a tuple of `IntTuple`s. +An `IntTuple` is defined recursively as either a single integer, or a tuple of `IntTuple`s. This means that `IntTuple`s can be arbitrarily nested. Operations defined on `IntTuple`s include the following. -* `get(IntTuple)`: The `I`th element of the `IntTuple`. Note that `get<0>` is defined for integer `IntTuples`. +* `get(IntTuple)`: The `I`th element of the `IntTuple`. For an `IntTuple` consisting of a single integer, `get<0>` is just that integer. -* `rank(IntTuple)`: The number of elements in an `IntTuple`. An int has rank 1, a tuple has rank `tuple_size`. +* `rank(IntTuple)`: The number of elements in an `IntTuple`. A single integer has rank 1, and a tuple has rank `tuple_size`. -* `depth(IntTuple)`: The number of hierarchical `IntTuple`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc. +* `depth(IntTuple)`: The number of hierarchical `IntTuple`s. A single integer has depth 0, a tuple of integers has depth 1, a tuple that contains a tuple of integers has depth 2, etc. -* `size(IntTuple)`: The product of all elements of the IntTuple. +* `size(IntTuple)`: The product of all elements of the `IntTuple`. -We write `IntTuple`s with parenthesis to denote the hierarchy. E.g. `6`, `(2)`, `(4,3)`, `(3,(6,2),8)` are all `IntTuple`s. +We write `IntTuple`s with parenthesis to denote the hierarchy. For example, `6`, `(2)`, `(4,3)`, `(3,(6,2),8)` are all `IntTuple`s. ## Layout -A `Layout` is then a pair of `IntTuple`s. The first defines the abstract *shape* of the layout and the second defines the *strides*, which map from coordinates within the shape to the index space. +A `Layout` is then a pair of `IntTuple`s. The first element defines the abstract *shape* of the `Layout`, and the second element defines the *strides*, which map from coordinates within the shape to the index space. 
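+
+For example, the following short sketch (the shape, stride, and coordinate values are chosen only for illustration) builds a small column-major `Layout` with the `make_layout`, `make_shape`, and `make_stride` functions introduced below, and then evaluates it at one coordinate:
+
+```c++
+using namespace cute;
+
+// Shape (4,3) with Stride (1,4): a 4 x 3 column-major layout
+auto layout = make_layout(make_shape(4, 3), make_stride(1, 4));
+
+print(layout);        // prints (4,3):(1,4)
+print(layout(1, 2));  // prints 9, the index 1*1 + 2*4
+```
+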
-As a pair of `IntTuple`s, we can define many similar operations on `Layout`s including +Since a `Layout` is just a pair of `IntTuple`s, we can define operations on `Layout`s analogous to those defined on `IntTuple`. * `get(Layout)`: The `I`th sub-layout of the `Layout`. * `rank(Layout)`: The number of modes in a `Layout`. -* `depth(Layout)`: The number of hierarchical `Layout`s. An int has depth 0, a tuple has depth 1, a tuple that contains a tuple has depth 2, etc. +* `depth(Layout)`: The number of hierarchical `Layout`s. A single integer has depth 0, a tuple of integers has depth 1, a tuple that contains a tuple of integers has depth 2, etc. * `shape(Layout)`: The shape of the `Layout`. @@ -86,7 +90,7 @@ These hierarchical access functions include the following. ### Vector examples -Then, we can define a vector as any `Shape` and `Stride` pair with `rank == 1`. +We define a vector as any `Shape` and `Stride` pair with `rank == 1`. For example, the `Layout` ``` @@ -95,9 +99,9 @@ Stride: (1) ``` defines a contiguous 8-element vector. -Similarly, with a stride of `(2)`, +For a vector with the same Shape but a Stride of `(2)`, the interpretation is that the eight elements -are stored at positions 0, 2, 4, $\dots$. +are stored at positions 0, 2, 4, $\dots$, 14. By the above definition, we *also* interpret @@ -168,9 +172,17 @@ auto layout_2x4 = make_layout(make_shape (2, make_shape (2,2)), make_stride(4, make_stride(2,1))); ``` +The `make_layout` function returns a `Layout`. +It deduces the returned `Layout`'s template arguments from the function's arguments. +Similarly, the `make_shape` and `make_stride` functions +return a `Shape` resp. `Stride`. +CuTe often uses these `make_*` functions, +because constructor template argument deduction (CTAD) +does not work for `cute::tuple` as it works for `std::tuple`. + ## Using a `Layout` -The fundamental use of a `Layout` is to map between logical coordinate space(s) and index space. For example, to print an arbitrary rank-2 layout, we can write the function +The fundamental use of a `Layout` is to map between logical coordinate space(s) and an index space. For example, to print an arbitrary rank-2 layout, we can write the function ```c++ template diff --git a/media/docs/cute/02_layout_operations.md b/media/docs/cute/02_layout_operations.md index d0c7f2ab..7860cb7d 100644 --- a/media/docs/cute/02_layout_operations.md +++ b/media/docs/cute/02_layout_operations.md @@ -73,6 +73,7 @@ In C++, we identify a Tuple with the `cute::tuple` behaves like `std::tuple`, but it works on device or host, and it imposes restrictions on its template arguments for performance and simplicity. + #### IntTuple CuTe then defines an IntTuple as either an integer, or a Tuple of IntTuple. @@ -136,7 +137,7 @@ This code produces the following text output. ``` `print(layout(1, 1))` prints the mapping of -the logical 2-D coordinate (1,1) to 1-D index, which is 4. +the logical 2-D coordinate (1,1) to the 1-D index, which is 4. You can see that from the table, which shows the left logical index as the "row," and the right logical index as the "column." @@ -302,13 +303,13 @@ Both humans and CuTe compute composition using the following rules. 2. Concatenation: A layout can be expressed as the concatenation of its sublayouts. We denote concatenation with parentheses: $B = (B_0,B_1,...)$. The CuTe function `make_layout`, when given zero or more `Layout`s, concatenates them. -3. Composition is (left-)distributive with concatenation: $A \circ B = A \circ (B0, B1, ...) 
= (A \circ B0, A \circ B1, ...)$. +3. Composition is (left-)distributive with concatenation: $A \circ B = A \circ (B_0, B_1, ...) = (A \circ B_0, A \circ B_1, ...)$. 4. "Base case": For layouts $A = a : b$ and $B = c : d$ with integral shape and stride, $A \circ B = R = c : (b * d)$. 5. By-mode composition: Let $\langle B, C \rangle$ (angle brackets, not parentheses) - denote a tuple of two layouts B and C, not their concatenation. Let A = (A0, A1). - Then, $A \circ \langle B, C \rangle = (A0, A1) \circ \langle B, C \rangle = (A0 \circ B, A1 \circ C)$. + denote a tuple of two layouts B and C, not their concatenation. Let $A = (A_0, A_1)$. + Then, $A \circ \langle B, C \rangle = (A_0, A_1) \circ \langle B, C \rangle = (A_0 \circ B, A_1 \circ C)$. This allows the application of composition independently to sublayouts of $A$. #### Examples: Reshape a vector into a matrix @@ -359,6 +360,55 @@ The resulting layout has shape $(4,5)$, just as before. What are the strides? 5. Result: (4:10, 5:2), which by concatenation is (4,5) : (10,2). +#### Example: Reshape a matrix into another matrix + +The composition $((20,2):(16,4) \circ (4,5):(1,4))$ +expresses reshaping the matrix with layout (20,2):(16:4), +into a 4 x 5 matrix in a column-major way. + +1. By deconcatenation, $(4,5) : (1,4)$ is $(4:1, 5:4)$. + +2. Composition is distributive, so $(20,2):(16,4) \circ (4:1, 5:4)$ is $((20,2):(16,4) \circ 4:1, (20,2):(16,4) \circ 5:4)$. + +3. $(20,2):(16,4) \circ 4:1$ has shape $4$ and stride $16$. (4:1 expresses picking the first 4 consecutive elements of (20,2):(16,4). These elements run down the 0th column (leftmost mode) of the layout, whose stride is 16.) + +4. $(20,2):(16,4) \circ 5:4$ has shape $5$ and stride $64 = 4 \cdot 16$. + +5. Result: $(4:16, 5:64)$, which by concatenation is $(4,5) : (16,64)$. + +We get exactly this result with CuTe +if we use compile-time shapes and strides. +The following C++ code prints `(_4,_5):(_16,_64).` + +```c++ +using namespace cute; +auto a = make_layout(make_shape(Int<20>{}, _2{}), make_stride(_16{}, _4{})); +auto b = make_layout(make_shape( _4{}, _5{}), make_stride( _1{}, _4{})); +auto c = composition(a, b); +printf("\n"); +print(c); +``` + +Results may _look_ different (but are the same mathematically) +if we use run-time integers. +The following C++ code prints `((4,1),(5,1)):((16,4),(64,4)).` + +```c++ +using namespace cute; +auto a = make_layout(make_shape(20, 2), make_stride(16, 4)); +auto b = make_layout(make_shape( 4, 5), make_stride( 1, 4)); +auto c = composition(a, b); +printf("\n"); +print(c); +``` + +((4,1),(5,1)):((16,4),(64,4)) is effectively the same layout +as (4,5) : (16,64), because the 1s in the shape don't affect the layout +(as a mathematical function from one integer to one integer). +CuTe chooses not to simplify layout computations +with run-time values in them as much as it could, +because simplifications involving run-time values have a run-time cost. + ### Product CuTe includes four different kinds of layout products. @@ -428,7 +478,7 @@ results in Shape ((2, 2), (3, 4)) and Stride ((1, 2), (16, 4)). | (1,1) | 3 | 19 | 35 | 7 | 23 | 39 | 11 | 27 | 43 | 15 | 31 | 47 | Note how the tile appears in the leftmost column and is reproduced -in each column in the same order as the matrix-of-tiles. That is, +in each column in the same order as the matrix-of-tiles. That is, the tile can be indexed through the first mode of the result and the matrix-of-tiles can be indexed through the second mode. 
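+
+This result can be reproduced directly with CuTe's `logical_product`. The sketch below assumes a tile layout (2,2):(1,2) and a matrix-of-tiles layout (3,4):(4,1); with those operands, `logical_product` produces exactly the Shape ((2, 2), (3, 4)) and Stride ((1, 2), (16, 4)) tabulated above, with the tile in the first mode and the matrix-of-tiles in the second mode.
+
+```c++
+using namespace cute;
+auto tile            = make_layout(make_shape(_2{}, _2{}), make_stride(_1{}, _2{}));
+auto matrix_of_tiles = make_layout(make_shape(_3{}, _4{}), make_stride(_4{}, _1{}));
+
+// prints ((_2,_2),(_3,_4)):((_1,_2),(_16,_4))
+print(logical_product(tile, matrix_of_tiles));
+```
+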
@@ -456,8 +506,8 @@ Shape ((3, 2), (4, 2)) and Stride ((16, 1), (4, 2)). | (2,1) | 33 | 37 | 41 | 45 | 35 | 39 | 43 | 47 | The tile is now interleaved or "raked" with the other 3x4 matrix-of-tiles -instead of appearing as blocks. Other references call this cyclic -distribution. +instead of appearing as blocks. Other references call this a "cyclic +distribution." This might look familiar if you have ever used ScaLAPACK. It expresses a 2-D block cyclic distribution of a 6 x 8 matrix @@ -542,7 +592,87 @@ CuTe includes 3 different kinds of layout division operations. We will summarize these in the sections that follow. -#### Logical divide : the intuitive tiling +#### Logical divide + +##### Example worked in detail + +This section will work the following logical divide example in detail. + +```c++ +Layout a = make_layout(24, 2); +Layout b = make_layout( 4, 2); +Layout c = logical_divide(a, b); +``` + +Logical divide produces a rank-2 `Layout`, +where mode 0 (the leftmost mode) corresponds to the divisor `b`, +and mode 1 (the rightmost mode) corresponds to the "remainder." +Intuitively, the remainder of 24 divided by 4 is 6, +so we know that mode 1 has 6 elements. +We just don't know its shape yet. + +CuTe defines `logical_divide(a, b)` as +`composition(a, make_layout(b, complement(b, size(a))))`. +Here, `size(a)` is 24. +What is `complement(b, 24)`? +Intuitively, it means "the remainder," +what's left over after applying `b` to 0, 1, 2, $\dots$, 23. + +The layout 4:2 means "take 4 elements at even-numbered indices." +The following table overlays the range of 4:2 +atop the complement's codomain 0, 1, $\dots$, 23. + +| Range of 4:2 | 0 | | 2 | | 4 | | 6 | | | | | | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Codomain | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | $\dots$ | 23 | + +Layouts are linear, so their range must include zero. +The complement of 4:2 with respect to 24 is thus a layout whose range + +* includes zero; + +* does not include any other elements of the range of 4:2 + (i.e., satisfies the disjoint property; see above); and + +* includes as much of 0, 1, $\dots$, 23 as possible + (so that it forms the "remainder" of 4:2 with respect to 24). + +Intuitively, the range of the complement must look like this: +0, 1, 8, 9, 16, 17. +The resulting layout is ordered. +It has size 6 and cosize 18, +so it satisfies the bounded property (see above). +This is the layout (2, 3) : (1, 8). +(Going from this intuitive sense of the complement +to knowing how to compute it directly +is out of scope for this part of the tutorial.) + +The following table shows 4:2 with its complement (2, 3) : (1, 8). + +| Range of 4:2 | 0 | | 2 | | 4 | | 6 | | | | | | | | | | | | | | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Codomain | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | $\dots$ | 23 | +| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | +| Range of complement | 0 | 1 | | | | | | | 8 | 9 | | | | | | | 16 | 17 | | | + +Now we know that `logical_divide`(24:2, 4:2) is +`composition`(24:2, `make_layout`(4:2, (2,3):(1,8))). +The composition of two layouts has the shape of the second (rightmost) layout, +so the resulting shape is (4, (2, 3)). 
+We see that the leftmost mode 4 corresponds to the divisor 4:2, +and the rightmost mode (2, 3) describes what's "left over" +from the original shape 24. + +What are the strides? +We can start from the leftmost mode. +4:2 takes every other element (the even-numbered elements) of 24:2. +That's a stride-2 thing, striding over a stride-2 thing. +The resulting stride is 4. +Similarly, the stride 2 of 24:2 +doubles the two strides of the rightmost mode. +The resulting layout is (4, (2, 3)) : (4, (2, 16)). + +##### Tiling example Suppose I have the 6 x 8 matrix from the Raked Product section and want to "collect" the `tile`, turning the Raked Product into @@ -607,7 +737,7 @@ Note that this is the same layout as the result in the Logical Product section. That is, the first mode is our original tile (and can be interpreted as a 2x2 matrix itself) and the second mode is its logical layout within the raked layout. -##### More Examples of Divide +#### More Examples of Divide For brevity, shapes can be used with `logical_divide` and `tiled_divide` to quickly split and tile modes of a tensor. For example, this C++ code diff --git a/python/cutlass/backend/compiler.py b/python/cutlass/backend/compiler.py index c5b1caea..3b9ae1a6 100644 --- a/python/cutlass/backend/compiler.py +++ b/python/cutlass/backend/compiler.py @@ -200,7 +200,7 @@ def load_operation(self, op_key, extra_funcs): self.compiled_cache_host.insert(key, compiled_host_fns) return True - def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostlib_compilation): + def emit_compile_(self, operation_list, compilation_options): """ Compile a list of kernels and store them into database """ @@ -306,41 +306,17 @@ def emit_compile_(self, operation_list, compilation_options, requires_nvcc_hostl cubin_image = file.read() # Set up the host-side library code - if requires_nvcc_hostlib_compilation: - cmd_template = ( - "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}" - % source_buffer_host - ) - cmd = SubstituteTemplate( - cmd_template, - { - "cuda_install_path": CUDA_INSTALL_PATH, - "options": compilation_options.get_str(), - }, - ) - else: - options = compilation_options.get() - cmd = ( - "echo '%s'|g++ -x c++ -fpermissive -w -fPIC -DCUTLASS_PYTHON_HOST_CC=1" - % source_buffer_host - ) - filtered_opts = [ - "-default-device", - "-Xcicc", - "-Xllc", - "--expt-relaxed-constexpr", - "-Xcudafe --diag_suppress=esa_on_defaulted_function_ignored", - ] - for opt in options: - opt = opt.decode("utf-8") - if opt not in filtered_opts and "-arch=sm_" not in opt: - if "--include-path=" in opt: - cmd += " " + opt.replace( - "--include-path=", - "-I", - ) - else: - cmd += " " + opt + cmd_template = ( + "echo '%s'|${cuda_install_path}/bin/nvcc -x cu -Xcompiler=\"-fpermissive -w -fPIC\" ${options}" + % source_buffer_host + ) + cmd = SubstituteTemplate( + cmd_template, + { + "cuda_install_path": CUDA_INSTALL_PATH, + "options": compilation_options.get_str(), + }, + ) tempfile.tempdir = "./" temp = tempfile.NamedTemporaryFile( @@ -375,7 +351,6 @@ def add_module(self, operations, compile_options=None): # save the cubin operation_key = [] operation_list = [] - requires_nvcc_hostlib_compilation = False for operation in operations: # step 1: get kernel string as key key = operation.rt_module.emit() + operation.procedural_name() + self.backend @@ -398,17 +373,9 @@ def add_module(self, operations, compile_options=None): operation_list.append(operation.rt_module) operation_key.append(key) - # Creating the 
Params structures for certain 3.0 kernels currently requires CUDA. For these cases, use NVCC to generate - # the PyCUTLASS host-side library. Otherwise, g++ will be used. - if isinstance(operation, GemmOperationUniversal) and operation.api == ApiVersion.v3x: - if self.backend == "nvrtc": - raise RuntimeError("CUTLASS 3 kernels currently require NVCC for compilation.") - - requires_nvcc_hostlib_compilation = True - if len(operation_list) > 0: cubin_image, host_lib, host_file = self.emit_compile_( - operation_list, compile_options, requires_nvcc_hostlib_compilation) + operation_list, compile_options) err, module = cuda.cuModuleLoadData(cubin_image) if err != cuda.CUresult.CUDA_SUCCESS: diff --git a/python/cutlass/cpp/include/tensor_ref_view.h b/python/cutlass/cpp/include/tensor_ref_view.h index f99d6339..48872e61 100644 --- a/python/cutlass/cpp/include/tensor_ref_view.h +++ b/python/cutlass/cpp/include/tensor_ref_view.h @@ -43,10 +43,10 @@ template void bind_tensor_ref_view(py::module &m, std::string name) { py::class_>(m, ("TensorRef" + name).c_str()) - .def("__init__", [](cutlass::TensorRef& tensor_ref, int64_t address, const L& layout_ ) { + .def(py::init([](int64_t address, const L& layout_ ) { T* ptr = reinterpret_cast< T*>(address); - new (&tensor_ref) cutlass::TensorRef(ptr, layout_); - }) + return new cutlass::TensorRef(ptr, layout_); + })) .def("data", [](cutlass::TensorRef& tensor_ref) { T* ptr = tensor_ref.data(); return int64_t(ptr); diff --git a/python/setup.py b/python/setup.py index 4c97819a..a87c82f4 100644 --- a/python/setup.py +++ b/python/setup.py @@ -29,9 +29,12 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # ################################################################################################# - +import copy import os +from pybind11.setup_helpers import Pybind11Extension +import setuptools from setuptools import setup +from setuptools.command.build_ext import build_ext def _cutlass_path_from_dir() -> str: @@ -61,31 +64,57 @@ def _cuda_install_path_from_nvcc() -> str: else _cutlass_path_from_dir() ) + cuda_install_path = ( os.getenv('CUDA_INSTALL_PATH') if os.getenv('CUDA_INSTALL_PATH') is not None else _cuda_install_path_from_nvcc() ) -ext_modules = [] -try: - from pybind11.setup_helpers import Pybind11Extension, build_ext - include_dirs = [ - cutlass_path + '/include', - cuda_install_path + '/include', - cutlass_path + '/tools/util/include', - cutlass_path + '/test', - ] +class BuildExtension(build_ext): + """ + Wrapper around `build_ext` to use NVCC when compiling the CUTLASS Python-C++ bindings. 
+ """ + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + + def build_extensions(self): + original_compile = self.compiler._compile + + def custom_compile(obj, src, ext, cc_args, extra_postargs, pp_opts): + """ + Wrapper around build_ext.compiler._compile method + """ + postargs = copy.deepcopy(extra_postargs) + postargs = [f for f in postargs if f not in ['-g0', '-fvisibility=hidden']] + postargs.extend(["-Xcompiler='-fPIC'", "-Xcompiler='-g0'", "-Xcompiler='-O3'", '-x', 'cu']) + try: + original_compiler = self.compiler.compiler_so + self.compiler.set_executable('compiler_so', [f'{cuda_install_path}/bin/nvcc']) + original_compile(obj, src, ext, cc_args, postargs, pp_opts) + finally: + self.compiler.set_executable('compiler_so', original_compiler) + + self.compiler._compile = custom_compile + super().build_extensions() + + +include_dirs = [ + cutlass_path + '/include', + cuda_install_path + '/include', + cutlass_path + '/tools/util/include', + cutlass_path + '/test', +] + - ext_modules = [ - Pybind11Extension('cutlass_bindings', - ['cutlass/cpp/cutlass_bindings.cpp'], - include_dirs=include_dirs, - extra_compile_args=['-fpermissive', '-w', '-std=c++17', '-DCUTLASS_PYTHON_HOST_CC=1']) - ] -except ImportError: - pass +ext_modules = [ + Pybind11Extension('cutlass_bindings', + ['cutlass/cpp/cutlass_bindings.cpp'], + include_dirs=include_dirs, + extra_compile_args=['-Xcompiler="-fpermissive"', '-w', '-std=c++17'], + libraries=['cudart']) +] setup( @@ -103,4 +132,7 @@ def _cuda_install_path_from_nvcc() -> str: 'treelib' ], ext_modules=ext_modules, + cmdclass={ + 'build_ext': BuildExtension + } ) diff --git a/test/unit/gemm/device/CMakeLists.txt b/test/unit/gemm/device/CMakeLists.txt index 717dbd5b..ee13ee49 100644 --- a/test/unit/gemm/device/CMakeLists.txt +++ b/test/unit/gemm/device/CMakeLists.txt @@ -41,6 +41,7 @@ add_custom_target( cutlass_test_unit_gemm_device_tensorop_planar_complex cutlass_test_unit_gemm_device_sparse_tensorop_sm80 cutlass_test_unit_gemv_device + cutlass_test_unit_gemv_device_strided_batched cutlass_test_unit_gemm_device_tensorop_sm90 cutlass_test_unit_gemm_device_tensorop_cluster_multicast_sm90 ) @@ -60,6 +61,7 @@ add_custom_target( test_unit_gemm_device_tensorop_planar_complex test_unit_gemm_device_sparse_tensorop_sm80 test_unit_gemv_device + test_unit_gemv_device_strided_batched test_unit_gemm_device_tensorop_sm90 ) @@ -498,6 +500,15 @@ cutlass_test_unit_add_executable( gemv.cu ) +cutlass_test_unit_add_executable( + cutlass_test_unit_gemv_device_strided_batched + + BATCH_SOURCES ON + BATCH_SIZE 4 + + gemv_strided_batched.cu +) + if (NOT CUDA_COMPILER MATCHES "[Cc]lang") add_dependencies( diff --git a/test/unit/gemm/device/gemm_testbed_3x.hpp b/test/unit/gemm/device/gemm_testbed_3x.hpp index 456d7360..5f19032e 100644 --- a/test/unit/gemm/device/gemm_testbed_3x.hpp +++ b/test/unit/gemm/device/gemm_testbed_3x.hpp @@ -77,7 +77,8 @@ struct TestbedImpl { using StrideA = typename Gemm::GemmKernel::StrideA; using ElementB = typename Gemm::GemmKernel::ElementB; using StrideB = typename Gemm::GemmKernel::StrideB; - using ElementC = typename Gemm::GemmKernel::ElementC; + using ElementC = std::conditional_t, + typename Gemm::GemmKernel::ElementD,typename Gemm::GemmKernel::ElementC>; using StrideC = typename Gemm::GemmKernel::StrideC; using ElementD = typename Gemm::GemmKernel::ElementD; using StrideD = typename Gemm::GemmKernel::StrideD; diff --git a/test/unit/gemm/device/gemv_strided_batched.cu b/test/unit/gemm/device/gemv_strided_batched.cu new file mode 
100644 index 00000000..7f526ac0 --- /dev/null +++ b/test/unit/gemm/device/gemv_strided_batched.cu @@ -0,0 +1,490 @@ +/*************************************************************************************************** + * Copyright (c) 2017 - 2023 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +/*! 
\file + \brief Tests for device-wide strided batched GEMV interface +*/ + + +#include +#include +#include + +#include "cutlass/cutlass.h" +#include "cutlass/gemm/kernel/gemv_strided_batched.h" +#include "cutlass/gemm/device/gemv_strided_batched.h" + +#include "../../common/cutlass_unit_test.h" + +#include "cutlass/util/host_tensor.h" +#include "cutlass/util/tensor_view_io.h" +#include "cutlass/util/distribution.h" +#include "cutlass/util/reference/host/tensor_fill.h" +#include "cutlass/util/reference/host/tensor_copy.h" +#include "cutlass/util/reference/host/tensor_compare.h" +#include "cutlass/util/reference/host/tensor_norm.h" +#include "cutlass/util/reference/host/gemm.h" +#include "cutlass/util/reference/host/gemm_complex.h" + +#include "testbed_utils.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace test { +namespace gemm { + +template +class TestbedStridedBatchedGemv +{ +public: + + using ElementA = typename GemvStridedBatched::ElementA; + using LayoutA = typename GemvStridedBatched::LayoutA; + using ElementB = typename GemvStridedBatched::ElementB; + using ElementC = typename GemvStridedBatched::ElementC; + + using ElementAccumulator = typename GemvStridedBatched::ElementAccumulator; + using ElementCompute = typename GemvStridedBatched::EpilogueOutputOp::ElementCompute; + + using LayoutV = cutlass::layout::RowMajor; + +private: + + /// Initialization + cutlass::Distribution::Kind init_A; + cutlass::Distribution::Kind init_B; + cutlass::Distribution::Kind init_C; + uint64_t seed; + + cutlass::HostTensor tensor_A; + cutlass::HostTensor tensor_B; + cutlass::HostTensor tensor_C; + cutlass::HostTensor tensor_D; + cutlass::HostTensor reference_D; + +public: + + // + // Methods + // + + TestbedStridedBatchedGemv( + cutlass::Distribution::Kind init_A_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_B_ = cutlass::Distribution::Uniform, + cutlass::Distribution::Kind init_C_ = cutlass::Distribution::Uniform, + uint64_t seed_ = 2023): + init_A(init_A_), init_B(init_B_), init_C(init_C_), seed(seed_) {} + + /// Helper to initialize a tensor view + template + bool initialize_tensor( + cutlass::TensorView view, + cutlass::Distribution::Kind dist_kind, + uint64_t seed) { + + if (dist_kind == cutlass::Distribution::Uniform) { + + double scope_max, scope_min; + int bits_input = cutlass::sizeof_bits::value; + int bits_output = cutlass::sizeof_bits::value; + + if (bits_input == 1) { + scope_max = 2; + scope_min = 0; + } else if (bits_input <= 8) { + scope_max = 2; + scope_min = -2; + } else if (bits_output == 16) { + scope_max = 5; + scope_min = -5; + } else { + scope_max = 8; + scope_min = -8; + } + + cutlass::reference::host::TensorFillRandomUniform( + view, seed, scope_max, scope_min, 0); + } + else if (dist_kind == cutlass::Distribution::Identity) { + + cutlass::reference::host::TensorFillIdentity(view); + } + else if (dist_kind == cutlass::Distribution::Gaussian) { + + cutlass::reference::host::TensorFillRandomGaussian(view, seed, 0, 0.5); + } + else if (dist_kind == cutlass::Distribution::Sequential) { + + cutlass::reference::host::BlockFillSequential( + view.data(), view.capacity()); + } + else { + // TODO: Implement the rest + EXPECT_TRUE(false) << "Not implemented"; + return false; + } + + return true; + } + + /// Initializes data structures + void initialize( + cutlass::MatrixCoord problem_size, + int32_t batch_count + ) { + + // + // Allocate the GEMV workspace + // + + tensor_A.resize({batch_count * 
problem_size.row(), problem_size.column()}); + tensor_B.resize({batch_count * problem_size.column(), 1}); + tensor_C.resize({batch_count * problem_size.row(), 1}); + tensor_D.resize({batch_count * problem_size.row(), 1}); + reference_D.resize({batch_count * problem_size.row(), 1}, false); + + EXPECT_TRUE(initialize_tensor(tensor_A.host_view(), init_A, seed + 1)); + EXPECT_TRUE(initialize_tensor(tensor_B.host_view(), init_B, seed + 2)); + EXPECT_TRUE(initialize_tensor(tensor_C.host_view(), init_C, seed + 3)); + + // It is possible to randomly initialize to all zeros, so override this with non-zeros + // in the upper left corner of each operand. + tensor_A.host_view().at({0, 0}) = typename GemvStridedBatched::ElementA(1); + tensor_B.host_view().at({0, 0}) = typename GemvStridedBatched::ElementB(1); + tensor_C.host_view().at({0, 0}) = typename GemvStridedBatched::ElementC(1); + + cutlass::reference::host::TensorCopy(reference_D.host_view(), tensor_C.host_view()); + + tensor_A.sync_device(); + tensor_B.sync_device(); + tensor_C.sync_device(); + tensor_D.sync_device(); + } + + /// Compares computed reference with device reference and outputs to a file if incorrect + bool compare_reference( + cutlass::MatrixCoord problem_size, + ElementCompute alpha, + ElementCompute beta) { + + tensor_D.sync_host(); + + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_A.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_B.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_C.host_view()), 0); + + EXPECT_GT(cutlass::reference::host::TensorNorm(tensor_D.host_view()), 0); + EXPECT_GT(cutlass::reference::host::TensorNorm(reference_D.host_view()), 0); + + bool passed = cutlass::reference::host::TensorEquals(reference_D.host_view(), tensor_D.host_view()); + + EXPECT_TRUE(passed) << " mismatched reference"; + + if (!passed) { + + std::ofstream file("testbed_universal_errors.txt"); + + file + << "problem: " << problem_size + << ", alpha: " << alpha << ", beta: " << beta << "\n\n"; + + file + << "A =\n" << tensor_A.host_view() + << "\nB =\n" << tensor_B.host_view() + << "\nC =\n" << tensor_C.host_view() + << "\n\nReference =\n" << reference_D.host_view() + << "\nComputed =\n" << tensor_D.host_view(); + } + + return passed; + } + + /// Verifies the result + bool verify( + cutlass::MatrixCoord problem_size, + int32_t batch_count, + int64_t batch_stride_A, + int64_t batch_stride_B, + int64_t batch_stride_C, + int64_t batch_stride_D, + ElementCompute alpha, + ElementCompute beta) { + + // + // Verify + // + + cutlass::reference::host::GemmComplex< + typename GemvStridedBatched::ElementA, typename GemvStridedBatched::LayoutA, + typename GemvStridedBatched::ElementB, LayoutV, + typename GemvStridedBatched::ElementC, LayoutV, + ElementCompute, ElementAccumulator + >( + {problem_size.row(), 1, problem_size.column()}, + alpha, + tensor_A.host_ref(), + GemvStridedBatched::kTransformA, + tensor_B.host_ref(), + GemvStridedBatched::kTransformB, + beta, + tensor_C.host_ref(), + reference_D.host_ref(), + ElementAccumulator(0), + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + ); + + return compare_reference(problem_size, alpha, beta); + } + + /// Runs one problem size + bool run( + cutlass::MatrixCoord problem_size, + int32_t batch_count, + int64_t batch_stride_A, + int64_t batch_stride_B, + int64_t batch_stride_C, + int64_t batch_stride_D, + ElementCompute alpha, + ElementCompute beta) { + + this->initialize(problem_size, batch_count); + + // + // 
Initialize the GEMV operator + // + + typename GemvStridedBatched::Arguments arguments{ + problem_size, + batch_count, + {alpha, beta}, + tensor_A.device_ref(), + tensor_B.device_data(), + tensor_C.device_data(), + tensor_D.device_data(), + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D + }; + + GemvStridedBatched gemm_op; + + cutlass::Status status = gemm_op.can_implement(arguments); + + EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + + size_t workspace_size = GemvStridedBatched::get_workspace_size(arguments); + + cutlass::device_memory::allocation workspace(workspace_size); + + status = gemm_op.initialize(arguments, workspace.get()); + + EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + + // + // Run the GEMV + // + + status = gemm_op(); + + EXPECT_TRUE(status == cutlass::Status::kSuccess) << to_string(status); + + // + // Verify + // + + bool passed = this->verify( + problem_size, + batch_count, + batch_stride_A, + batch_stride_B, + batch_stride_C, + batch_stride_D, + alpha, + beta); + return passed; + } +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +bool TestAllGemv() { + + using ElementCompute = typename GemvStridedBatched::EpilogueOutputOp::ElementCompute; + + int Batch[] = { + 1, 520, 1314 + }; + + int M[] = { + 1, 5, 16 + }; + + int K[] = { + 8, 128, 256 + }; + + double Alpha[] = { + 1, 1.25 + }; + + double Beta[] = { + 0, 1, 1.25 + }; + + for (int b : Batch) { + for (int m : M) { + for (int k : K) { + for (double alpha : Alpha) { + for (double beta : Beta) { + + TestbedStridedBatchedGemv testbed; + + if (!testbed.run( + {m, k}, + b, + m * k, + k, + m, + m, + ElementCompute(alpha), + ElementCompute(beta))) { + return false; + } + } + } + } + } + } + + return true; +} + +} // namespace gemm +} // namespace test + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_StridedBatchedGemv_f16n_f16_f16_simt_f32, Simple) { + + using ElementInput = cutlass::half_t; + using ElementOutput = cutlass::half_t; + using LayoutA = cutlass::layout::RowMajor; + + using ElementAccumulator = float; + int const kElementsPerAccess = 8; + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator>; + + using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched< + cutlass::gemm::kernel::GemvStridedBatched< + ElementInput, // Element A + LayoutA, // Layout A + ElementInput, // Element B + ElementOutput, // Element C + ElementAccumulator, // Element accumulator + kElementsPerAccess, // Element access granularity + EpilogueOp // Output operator + >>; + + EXPECT_TRUE(test::gemm::TestAllGemv()); +} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_StridedBatchedGemv_f32n_f32_f32_simt_f32, Simple) { + + using ElementInput = float; + using ElementOutput = float; + using LayoutA = cutlass::layout::RowMajor; + + using ElementAccumulator = float; + int const kElementsPerAccess = 4; + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator>; + + using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched< + cutlass::gemm::kernel::GemvStridedBatched< + ElementInput, // Element A + LayoutA, // Layout A + ElementInput, // Element B + ElementOutput, // Element C + ElementAccumulator, 
// Element accumulator + kElementsPerAccess, // Element access granularity + EpilogueOp // Output operator + >>; + + EXPECT_TRUE(test::gemm::TestAllGemv());} + +///////////////////////////////////////////////////////////////////////////////////////////////// + +TEST(SM50_Device_StridedBatchedGemv_f64n_f64_f64_simt_f64, Simple) { + + using ElementInput = double; + using ElementOutput = double; + using LayoutA = cutlass::layout::RowMajor; + + using ElementAccumulator = double; + int const kElementsPerAccess = 2; + + using EpilogueOp = cutlass::epilogue::thread::LinearCombination< + ElementOutput, + 1, + ElementAccumulator, + ElementAccumulator>; + + using GemvStridedBatched = cutlass::gemm::device::GemvStridedBatched< + cutlass::gemm::kernel::GemvStridedBatched< + ElementInput, // Element A + LayoutA, // Layout A + ElementInput, // Element B + ElementOutput, // Element C + ElementAccumulator, // Element accumulator + kElementsPerAccess, // Element access granularity + EpilogueOp // Output operator + >>; + + EXPECT_TRUE(test::gemm::TestAllGemv());} + +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/library/scripts/gemm_operation.py b/tools/library/scripts/gemm_operation.py index cb1075bb..0be042f9 100644 --- a/tools/library/scripts/gemm_operation.py +++ b/tools/library/scripts/gemm_operation.py @@ -182,9 +182,9 @@ def procedural_name(self): ar = self.arch, op = opcode_class_name, ex = self.extended_name_3x(), - tbm = self.tile_description.threadblock_shape[0], - tbn = self.tile_description.threadblock_shape[1], - tbk = self.tile_description.threadblock_shape[2], + tbm = self.tile_description.tile_shape[0], + tbn = self.tile_description.tile_shape[1], + tbk = self.tile_description.tile_shape[2], cm = self.tile_description.cluster_shape[0], cn = self.tile_description.cluster_shape[1], ck = self.tile_description.cluster_shape[2], @@ -640,7 +640,7 @@ def __init__(self, operation_suffix = ''): using ${operation_name}_epilogue = typename cutlass::epilogue::collective::CollectiveBuilder< ${arch}, ${opcode_class}, - cute::Shape, + cute::Shape, cute::Shape, cutlass::epilogue::collective::EpilogueTileAuto, ${element_accumulator}, ${element_epilogue}, @@ -655,7 +655,7 @@ def __init__(self, operation_suffix = ''): ${element_a}, ${layout_a}, ${align_a}, ${element_b}, ${layout_b}, ${align_b}, ${element_accumulator}, - cute::Shape, + cute::Shape, cute::Shape, cutlass::gemm::collective::StageCountAutoCarveout< sizeof(typename ${operation_name}_epilogue::SharedStorage)>, @@ -686,14 +686,14 @@ def instance_template(self): # def emit(self, operation): - threadblock_shape = operation.tile_description.threadblock_shape + tile_shape = operation.tile_description.tile_shape warp_count = operation.tile_description.warp_count # stage count set to zero indicates builder automatic stage selection if operation.tile_description.stages > 0: stage_count_string = f"cutlass::gemm::collective::StageCount<{str(operation.tile_description.stages)}>" else: stage_count_string = "cutlass::gemm::collective::StageCountAuto" - warp_shape = [threadblock_shape[idx] // warp_count[idx] for idx in range(3)] + warp_shape = [tile_shape[idx] // warp_count[idx] for idx in range(3)] instance_layout_A, instance_layout_B, instance_layout_C , instance_layout_D = \ (operation.A.layout, operation.B.layout, operation.C.layout, operation.D.layout) @@ -727,9 +727,9 @@ def emit(self, operation): 'element_accumulator': DataTypeTag[operation.accumulator_type()], 'opcode_class': 
OpcodeClassTag[operation.tile_description.math_instruction.opcode_class], 'arch': "cutlass::arch::Sm%d" % operation.arch, - 'threadblock_shape_m': str(operation.tile_description.threadblock_shape[0]), - 'threadblock_shape_n': str(operation.tile_description.threadblock_shape[1]), - 'threadblock_shape_k': str(operation.tile_description.threadblock_shape[2]), + 'tile_shape_m': str(operation.tile_description.tile_shape[0]), + 'tile_shape_n': str(operation.tile_description.tile_shape[1]), + 'tile_shape_k': str(operation.tile_description.tile_shape[2]), 'cluster_m': str(operation.tile_description.cluster_shape[0]), 'cluster_n': str(operation.tile_description.cluster_shape[1]), 'cluster_k': str(operation.tile_description.cluster_shape[2]), diff --git a/tools/library/scripts/generator.py b/tools/library/scripts/generator.py index ec15a1af..95595af4 100644 --- a/tools/library/scripts/generator.py +++ b/tools/library/scripts/generator.py @@ -91,22 +91,21 @@ def CreateGemmOperator(manifest, layouts, tile_descriptions, data_type, \ # Generates 3.0 API based GemmUniversal API kernels. Alignment constraints are folded in with layouts def CreateGemmUniversal3xOperator( - manifest, layouts, tile_descriptions, data_type, + manifest, layouts, tile_descriptions, data_types, schedules = [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto]], complex_transforms=None, epilogue_functor=EpilogueFunctor.LinearCombination, swizzling_functor=SwizzlingFunctor.Identity1): + if type(data_types) is dict: + data_types = [data_types] + + for s in schedules: + assert(len(s) == 2) + if complex_transforms is None: complex_transforms = [(ComplexTransform.none, ComplexTransform.none), ] - element_a = data_type["a_type"] - element_b = data_type["b_type"] - element_c = data_type["c_type"] - element_d = data_type["d_type"] - element_acc = data_type["acc_type"] - element_epilogue = data_type.get("epi_type", element_acc) - operations = [] # by default, only generate the largest tile and largest alignment @@ -115,23 +114,25 @@ def CreateGemmUniversal3xOperator( for layout in layouts: for tile_description in tile_descriptions: - for complex_transform in complex_transforms: - for kernel_schedule, epilogue_schedule in schedules: - A = TensorDescription( - element_a, layout[0][0], layout[0][1], complex_transform[0]) - B = TensorDescription( - element_b, layout[1][0], layout[1][1], complex_transform[1]) + for data_type in data_types: + for complex_transform in complex_transforms: + for kernel_schedule, epilogue_schedule in schedules: + A = TensorDescription( + data_type["a_type"], layout[0][0], layout[0][1], complex_transform[0]) + B = TensorDescription( + data_type["b_type"], layout[1][0], layout[1][1], complex_transform[1]) - C = TensorDescription(element_c, layout[2][0], layout[2][1]) - D = TensorDescription(element_d, layout[2][0], layout[2][1]) + C = TensorDescription(data_type["c_type"], layout[2][0], layout[2][1]) + D = TensorDescription(data_type["d_type"], layout[2][0], layout[2][1]) - operation = GemmOperation( - GemmKind.Universal3x, tile_description.minimum_compute_capability, - tile_description, A, B, C, element_epilogue, epilogue_functor, swizzling_functor, D, - kernel_schedule, epilogue_schedule) + element_compute = data_type.get("epi_type", data_type["acc_type"]) + operation = GemmOperation( + GemmKind.Universal3x, tile_description.minimum_compute_capability, + tile_description, A, B, C, element_compute, epilogue_functor, swizzling_functor, D, + kernel_schedule, epilogue_schedule) - 
manifest.append(operation) - operations.append(operation) + manifest.append(operation) + operations.append(operation) return operations @@ -4118,21 +4119,19 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version): layout[2][1] = 8 if CudaToolkitVersionSatisfies(cuda_version, 12, 1): - kernel_schedules = [ - KernelScheduleType.ScheduleAuto, - KernelScheduleType.TmaWarpSpecializedCooperative, - KernelScheduleType.TmaWarpSpecializedPingpong, - KernelScheduleType.TmaWarpSpecialized + schedules = [ + [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto], + [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.NoSmemWarpSpecialized], + [KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.NoSmemWarpSpecialized], + [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized] ] else: - kernel_schedules = [ - KernelScheduleType.ScheduleAuto, - KernelScheduleType.TmaWarpSpecialized + schedules = [ + [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto], + [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized] # TmaWarpSpecializedCooperative and TmaWarpSpecializedPingpong require CUDA version >= 12.1 for optimal performance. ] - schedules = [[s, EpilogueScheduleType.ScheduleAuto] for s in kernel_schedules] - CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, schedules) # persistent kernels with TMA epilogues @@ -4140,6 +4139,11 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version): CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized], [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]]) + # Emit instance without C allocation+load + data_type["c_type"] = DataType.void + CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type, + [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized], + [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]]) # for mixed precision kernels, also generate kernels that write output matrix in the A/B format # Avoid emitting two kernels if the accumulator type does not differ from the input type (e.g. 
F16 accumulation) @@ -4166,6 +4170,11 @@ def GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version): CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized], [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]]) + # Emit instance without C allocation+load + data_type_mixed["c_type"] = DataType.void + CreateGemmUniversal3xOperator(manifest, layouts, tile_descriptions, data_type_mixed, + [[KernelScheduleType.TmaWarpSpecializedPingpong, EpilogueScheduleType.TmaWarpSpecialized], + [KernelScheduleType.TmaWarpSpecializedCooperative, EpilogueScheduleType.TmaWarpSpecializedCooperative]]) # def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version): @@ -4212,19 +4221,32 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version): "acc_type" : math_inst.element_accumulator, "epi_type" : math_inst.element_accumulator } + + schedules = [ + [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.ScheduleAuto], + [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized] + ] + + # TMA kernels with TT layout use EpilogueTransposed (NoSmemWarpSpecialized with swapped strides), + # because they use NN kernels underneath and transposing its epilogue will get the correct output + schedules_transposed_epilogue = [ + [KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed], + [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.EpilogueTransposed] + ] + # TMA kernels with TN or NN layout layouts_tf32_tn_nn = [layouts_tf32[0], layouts_tf32[2]] - CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_tf32) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_tf32, schedules) # TMA kernels with NT layout, only support 64x128x32 tile for now. 
layouts_tf32_nt = [layouts_tf32[3]] tile_64x128x32_descriptions = [tile_descriptions[0], tile_descriptions[1], tile_descriptions[2]] - CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_tf32) + tile_128x128x32_descriptions = [tile_descriptions[3], tile_descriptions[4], tile_descriptions[5]] + CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_tf32, schedules) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_128x128x32_descriptions, data_type_tf32, [schedules[1]]) - # TMA kernels with TT layout use EpilogueTransposed, because swapping NN kernel and transposed its epilogue will get the kernel layouts_tf32_tt = [layouts_tf32[1]] - CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_tf32, - [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed]]) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_tf32, schedules_transposed_epilogue) # F32 kernel share same settings with tf32 I/O kernels excluding data type data_type_f32 = { @@ -4236,10 +4258,10 @@ def GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version): "epi_type" : DataType.f32 } - CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_f32) - CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_f32) - CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_f32, - [[KernelScheduleType.ScheduleAuto, EpilogueScheduleType.EpilogueTransposed]]) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_tn_nn, tile_descriptions, data_type_f32, schedules) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_f32, schedules) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_128x128x32_descriptions, data_type_f32, [schedules[1]]) + CreateGemmUniversal3xOperator(manifest, layouts_tf32_tt, tile_descriptions, data_type_f32, schedules_transposed_epilogue) # def GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version): @@ -4910,8 +4932,8 @@ def GenerateSM90_TensorOp_1684_symm_complex_gaussian(manifest, cuda_version): # def GenerateSM90(manifest, cuda_version): GenerateSM90_TensorOp_16b_WGMMA_gemm(manifest, cuda_version) - GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version) GenerateSM90_TensorOp_tf32_WGMMA_gemm(manifest, cuda_version) + GenerateSM90_TensorOp_int8_WGMMA_gemm(manifest, cuda_version) GenerateSM90_TensorOp_1684(manifest, cuda_version) GenerateSM90_TensorOp_1684_complex(manifest, cuda_version) GenerateSM90_TensorOp_1684_complex_gaussian(manifest, cuda_version) diff --git a/tools/library/scripts/library.py b/tools/library/scripts/library.py index b12de786..2f0fcfb4 100644 --- a/tools/library/scripts/library.py +++ b/tools/library/scripts/library.py @@ -40,6 +40,7 @@ class GeneratorTarget(enum.Enum): # class DataType(enum.Enum): + void = enum_auto() # primarily used to disable C tensor for epilogues b1 = enum_auto() u4 = enum_auto() u8 = enum_auto() @@ -89,6 +90,7 @@ class DataType(enum.Enum): # DataTypeNames = { + DataType.void: "void", DataType.b1: "b1", DataType.u4: "u4", DataType.u8: "u8", @@ -121,10 +123,11 @@ class DataType(enum.Enum): DataType.cs8: "cs8", DataType.cs16: "cs16", DataType.cs32: "cs32", - DataType.cs64: "cs64", + DataType.cs64: "cs64", } DataTypeTag = { + DataType.void: "void", DataType.b1: "cutlass::uint1b_t", DataType.u4: "cutlass::uint4b_t", 
DataType.u8: "uint8_t", @@ -161,6 +164,7 @@ class DataType(enum.Enum): } DataTypeSize = { + DataType.void: 0, DataType.b1: 1, DataType.u4: 4, DataType.u8: 8, @@ -765,6 +769,7 @@ class TileDescription: def __init__(self, threadblock_shape, stages, warp_count, math_instruction, min_compute, max_compute, cluster_shape = [1,1,1]): self.threadblock_shape = threadblock_shape + self.tile_shape = threadblock_shape self.stages = stages self.warp_count = warp_count self.math_instruction = math_instruction diff --git a/tools/library/scripts/manifest.py b/tools/library/scripts/manifest.py index 8b53be41..48fdccb5 100644 --- a/tools/library/scripts/manifest.py +++ b/tools/library/scripts/manifest.py @@ -240,7 +240,9 @@ def __init__(self, args = None): self.kernel_filter_list = [] else: self.kernel_filter_list = self.get_kernel_filters(args.kernel_filter_file) - + _LOGGER.info("Using {filter_count} kernel filters from {filter_file}".format( + filter_count = len(self.kernel_filter_list), + filter_file = args.kernel_filter_file)) self.operation_count = 0 self.operations_by_name = {} @@ -311,19 +313,29 @@ def filter(self, operation): # compare against the include list for name_substr in self.kernel_names: if self._filter_string_matches(name_substr, name): + _LOGGER.debug("Kernel {kernel} included due to filter string '{filt}'.".format( + kernel = operation.procedural_name(), + filt = name_substr)) enabled = True break # compare against the exclude list for name_substr in self.ignore_kernel_names: if self._filter_string_matches(name_substr, name): + _LOGGER.debug("Kernel {kernel} ignored due to filter string '{filt}'.".format( + kernel = operation.procedural_name(), + filt = name_substr)) enabled = False break if len(self.kernel_filter_list) > 0: - enabled = False if self.filter_out_kernels(operation.procedural_name(), self.kernel_filter_list): - enabled = True + _LOGGER.debug("Kernel {kernel} matched via kernel filter file.".format(kernel = operation.procedural_name())) + enabled = True + else: + _LOGGER.debug("Kernel {kernel} culled due to no match in kernel filter file.".format(kernel = operation.procedural_name())) + enabled = False + # todo: filter based on compute data type return enabled @@ -389,6 +401,8 @@ def emit(self, target = GeneratorTarget.Library): for operation_kind, configurations in self.operations.items(): with operation_emitters[target](generated_path, operation_kind, self.args) as operation_kind_emitter: for configuration_name, operations in configurations.items(): + _LOGGER.info("Emitting {config} with {num_ops} operations.".format( + config = configuration_name, num_ops = len(operations))) operation_kind_emitter.emit(configuration_name, operations) source_files += operation_kind_emitter.source_files diff --git a/tools/library/src/library_internal.h b/tools/library/src/library_internal.h index 5423edda..4e4e09d2 100644 --- a/tools/library/src/library_internal.h +++ b/tools/library/src/library_internal.h @@ -64,6 +64,10 @@ namespace library { template <typename T> struct NumericTypeMap; +template <> struct NumericTypeMap<void> { + static NumericTypeID const kId = NumericTypeID::kVoid; +}; + template <> struct NumericTypeMap<cutlass::uint1b_t> { static NumericTypeID const kId = NumericTypeID::kB1; }; diff --git a/tools/profiler/CMakeLists.txt b/tools/profiler/CMakeLists.txt index 1675d42c..368f8b9a 100644 --- a/tools/profiler/CMakeLists.txt +++ b/tools/profiler/CMakeLists.txt @@ -107,15 +107,17 @@ set(CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM --operation=Symm --provid cutlass_add_executable_tests( test_profiler
cutlass_profiler DEPENDEES test_all - TEST_COMMAND_OPTIONS - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_GEMM - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV2D - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_CONV3D - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SPGEMM - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_K - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_RANK_2K - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_TRMM - CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_SYMM + TEST_COMMAND_OPTIONS + GEMM + CONV2D + CONV3D + SPGEMM + RANK_K + RANK_2K + TRMM + SYMM + TEST_COMMAND_OPTIONS_PREFIX + CUTLASS_PROFILER_TEST_COMMAND_OPTIONS_ DISABLE_EXECUTABLE_INSTALL_RULE ) diff --git a/tools/profiler/src/cutlass_profiler.cu b/tools/profiler/src/cutlass_profiler.cu index 026ffdf0..a4f37780 100644 --- a/tools/profiler/src/cutlass_profiler.cu +++ b/tools/profiler/src/cutlass_profiler.cu @@ -124,7 +124,7 @@ int CutlassProfiler::operator()() { options_.execution_mode == ExecutionMode::kTrace) { // Profiles all operations - profile_(); + return profile_(); } else if (options_.execution_mode == ExecutionMode::kEnumerate) { // Enumerates all operations @@ -157,7 +157,7 @@ int CutlassProfiler::profile_() { if (result) { return result; - } + } } } diff --git a/tools/profiler/src/device_allocation.cu b/tools/profiler/src/device_allocation.cu index f464ccb7..600950e2 100644 --- a/tools/profiler/src/device_allocation.cu +++ b/tools/profiler/src/device_allocation.cu @@ -462,6 +462,13 @@ size_t DeviceAllocation::bytes() const { /// Copies from an equivalent-sized tensor in device memory void DeviceAllocation::copy_from_device(void const *ptr) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping copy of size 0 allocation\n"; +#endif + return; + } + cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyDeviceToDevice); if (result != cudaSuccess) { throw std::runtime_error("Failed device-to-device copy"); @@ -470,22 +477,43 @@ void DeviceAllocation::copy_from_device(void const *ptr) { /// Copies from an equivalent-sized tensor in device memory void DeviceAllocation::copy_from_host(void const *ptr) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping copy of size 0 allocation\n"; +#endif + return; + } + cudaError_t result = cudaMemcpy(data(), ptr, bytes(), cudaMemcpyHostToDevice); if (result != cudaSuccess) { - throw std::runtime_error("Failed device-to-device copy"); + throw std::runtime_error("Failed host-to-device copy"); } } /// Copies from an equivalent-sized tensor in device memory void DeviceAllocation::copy_to_host(void *ptr) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping copy of size 0 allocation\n"; +#endif + return; + } + cudaError_t result = cudaMemcpy(ptr, data(), bytes(), cudaMemcpyDeviceToHost); if (result != cudaSuccess) { - throw std::runtime_error("Failed device-to-device copy"); + throw std::runtime_error("Failed device-to-host copy"); } } void DeviceAllocation::initialize_random_device(int seed, Distribution dist) { - if (!good()) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping initialization of size 0 allocation\n"; +#endif + return; + } + + if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } @@ -690,7 +718,14 @@ void DeviceAllocation::initialize_random_device(int seed, Distribution dist) { } void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { - if (!good()) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping initialization of size 0 allocation\n"; +#endif + return; + } + + if (!data()) { throw 
std::runtime_error("Attempting to initialize invalid allocation."); } @@ -699,7 +734,7 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { switch (type_) { case library::NumericTypeID::kFE4M3: cutlass::reference::host::BlockFillRandom<cutlass::float_e4m3_t>( - reinterpret_cast<cutlass::float_e4m3_t *>(pointer_), + reinterpret_cast<cutlass::float_e4m3_t *>(host_data.data()), capacity_, seed, dist @@ -707,7 +742,7 @@ break; case library::NumericTypeID::kFE5M2: cutlass::reference::host::BlockFillRandom<cutlass::float_e5m2_t>( - reinterpret_cast<cutlass::float_e5m2_t *>(pointer_), + reinterpret_cast<cutlass::float_e5m2_t *>(host_data.data()), capacity_, seed, dist @@ -904,7 +939,14 @@ void DeviceAllocation::initialize_random_host(int seed, Distribution dist) { } void DeviceAllocation::initialize_random_sparsemeta_device(int seed, int MetaSizeInBits) { - if (!good()) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping initialization of size 0 allocation\n"; +#endif + return; + } + + if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } @@ -934,7 +976,14 @@ } void DeviceAllocation::initialize_random_sparsemeta_host(int seed, int MetaSizeInBits) { - if (!good()) { + if (!bytes()) { +#ifndef NDEBUG + std::cout << "Skipping initialization of size 0 allocation\n"; +#endif + return; + } + + if (!data()) { throw std::runtime_error("Attempting to initialize invalid allocation."); } diff --git a/tools/profiler/src/gemm_operation_profiler.cu b/tools/profiler/src/gemm_operation_profiler.cu index 595c9084..a622e048 100644 --- a/tools/profiler/src/gemm_operation_profiler.cu +++ b/tools/profiler/src/gemm_operation_profiler.cu @@ -68,6 +68,7 @@ GemmOperationProfiler::GemmOperationProfiler(Options const &options): {ArgumentTypeID::kTensor, {"A"}, "Tensor storing the A operand"}, {ArgumentTypeID::kTensor, {"B"}, "Tensor storing the B operand"}, {ArgumentTypeID::kTensor, {"C"}, "Tensor storing the C operand"}, + {ArgumentTypeID::kTensor, {"D"}, "Tensor storing the D output"}, {ArgumentTypeID::kScalar, {"alpha", "epilogue::alpha"}, "Epilogue scalar alpha"}, {ArgumentTypeID::kScalar, {"beta", "epilogue::beta"}, "Epilogue scalar beta"}, {ArgumentTypeID::kEnumerated, {"split_k_mode", "split-k-mode"}, "Variant of split K mode(serial, parallel)"}, @@ -206,6 +207,10 @@ Status GemmOperationProfiler::GemmProblem::parse( return Status::kErrorInvalidProblem; } + if (!tensor_description_satisfies(operation_desc.D, "D", problem_space, problem)) { + return Status::kErrorInvalidProblem; + } + if (!arg_as_scalar( this->alpha, operation_desc.element_epilogue, @@ -307,6 +312,9 @@ void GemmOperationProfiler::GemmProblem::initialize_result( set_argument(result, "C", problem_space, std::string(library::to_string(operation_desc.C.element)) + ":" + library::to_string(operation_desc.C.layout)); + set_argument(result, "D", problem_space, + std::string(library::to_string(operation_desc.D.element)) + ":" + library::to_string(operation_desc.D.layout)); + set_argument(result, "m", problem_space, m); set_argument(result, "n", problem_space, n); set_argument(result, "k", problem_space, k); @@ -537,8 +545,6 @@ Status GemmOperationProfiler::initialize_workspace( problem_.batch_count * gemm_workspace_.problem_count ); - gemm_workspace_.Reference->copy_from_device(gemm_workspace_.C->data()); - // NOTE: the leading non-batch strides are duplicated here for 3.0 API kernels gemm_workspace_.arguments.problem_size = {int(problem_.m), int(problem_.n),
int(problem_.k)}; gemm_workspace_.arguments.batch_count = problem_.batch_count; diff --git a/tools/profiler/src/operation_profiler.cu b/tools/profiler/src/operation_profiler.cu index b2e8f9b7..737821c1 100644 --- a/tools/profiler/src/operation_profiler.cu +++ b/tools/profiler/src/operation_profiler.cu @@ -270,17 +270,17 @@ int OperationProfiler::profile_all( ProblemSpace::Iterator problem_it = problem_space.begin(); ProblemSpace::Iterator problem_end = problem_space.end(); - bool continue_profiling = true, internal_error = false; + bool continue_profiling = true; + int retval = 0; // For each problem in problem space for (; continue_profiling && problem_it != problem_end; ++problem_it) { - ProblemSpace::Problem problem = problem_it.at(); - report.next_problem(); // For each operation in manifest - for (auto const & operation_ptr : manifest) { + int matched_operation_count = 0; + for (auto const& operation_ptr : manifest) { library::Operation const *operation = operation_ptr.get(); @@ -292,8 +292,8 @@ int OperationProfiler::profile_all( // Execute compatible cutlass operations if they satisfy the current device's compute capability if (operation->description().kind == kind_ && - operation->description().provider == library::Provider::kCUTLASS && - options.device.compute_capability() >= min_cc && + operation->description().provider == library::Provider::kCUTLASS && + options.device.compute_capability() >= min_cc && options.device.compute_capability() <= max_cc) { std::string operation_name(operation->description().name); @@ -320,7 +320,10 @@ int OperationProfiler::profile_all( if (!filtered_by_name || !satisfies(operation->description(), problem_space, problem)) { continue; } - + + // we have found a kernel match, so increment the counter for matched kernels + ++matched_operation_count; + // A. Initialize configuration Status status = this->initialize_configuration( options, @@ -374,7 +377,6 @@ int OperationProfiler::profile_all( // // B. Verify CUTLASS - if (continue_profiling && options.profiling.provider_enabled(library::Provider::kCUTLASS)) { continue_profiling = this->verify_cutlass( @@ -426,10 +428,18 @@ int OperationProfiler::profile_all( if (!continue_profiling) { break; } - } + } + + // If we did not find any kernels that match our filters and error_on_no_match was set, report an error + if (options.profiling.error_on_no_match && matched_operation_count <= 0) { + #if !defined(NDEBUG) + std::cout << "Error: No matching kernels found with kernel selection filters [--error-on-no-match]" << std::endl; + #endif + retval = 1; + } } - return internal_error ?
1 : 0; + return retval; } /////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/tools/profiler/src/options.cu b/tools/profiler/src/options.cu index ab2b4ed0..05f65302 100644 --- a/tools/profiler/src/options.cu +++ b/tools/profiler/src/options.cu @@ -706,10 +706,12 @@ Options::Options(cutlass::CommandLine const &cmdline): } else if (cmdline.check_cmd_line_flag("kernels")) { cmdline.get_cmd_line_arguments("kernels", operation_names); + profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match"); } if (cmdline.check_cmd_line_flag("ignore-kernels")) { cmdline.get_cmd_line_arguments("ignore-kernels", excluded_operation_names); + profiling.error_on_no_match = cmdline.check_cmd_line_flag("error-on-no-match"); } // Prevent launches on the device for anything other than CUTLASS operation diff --git a/tools/profiler/src/options.h b/tools/profiler/src/options.h index eba0172f..d679c70e 100644 --- a/tools/profiler/src/options.h +++ b/tools/profiler/src/options.h @@ -196,6 +196,9 @@ class Options { /// If true, profiling is actually conducted. bool enabled; + /// If true, profiling returns an error code if no kernels are found to match the filters. + bool error_on_no_match = false; + /// List of providers of each functionality to be profiled ProviderVector providers;
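
Note on the DataType.void plumbing above: registering void in DataTypeNames/DataTypeTag with a DataTypeSize of 0, mapping it to NumericTypeID::kVoid in library_internal.h, and re-emitting the mixed-precision SM90 kernels with c_type = DataType.void lets the epilogue skip allocating and loading the C operand entirely; the separate "D" tensor argument in the GEMM profiler and the size-0 guards in DeviceAllocation are the downstream consequences. The sketch below shows why the zero size matters; it assumes tools/library/scripts is on PYTHONPATH, and tensor_bytes is a hypothetical helper, not part of the scripts.

    from library import DataType, DataTypeSize

    def tensor_bytes(data_type, num_elements):
        # Size bookkeeping the profiler relies on: a void tensor contributes zero bytes,
        # so its copy and random-initialization paths can simply return early.
        return (DataTypeSize[data_type] * num_elements) // 8

    assert DataTypeSize[DataType.void] == 0
    assert tensor_bytes(DataType.void, 1 << 20) == 0       # nothing to allocate, copy, or fill
    assert tensor_bytes(DataType.f16, 1 << 20) == 2 << 20  # an ordinary 16-bit C tensor, for contrast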
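
Note on the tf32/f32 generator changes above: every CreateGemmUniversal3xOperator call now receives explicit [kernel schedule, epilogue schedule] pairs rather than relying on the default, the TT layouts keep the transposed epilogue via schedules_transposed_epilogue, and the newly added 128x128x32 NT tiles are emitted only with the explicit TMA warp-specialized pair. A condensed sketch of that pairing, reusing only names that already appear in the hunk and assuming it runs inside GenerateSM90_TensorOp_tf32_WGMMA_gemm where those locals are defined:

    # Each entry pairs a kernel schedule with a compatible epilogue schedule.
    schedules = [
        [KernelScheduleType.ScheduleAuto,       EpilogueScheduleType.ScheduleAuto],
        [KernelScheduleType.TmaWarpSpecialized, EpilogueScheduleType.NoSmemWarpSpecialized]
    ]

    # 64x128x32 NT tiles get both pairs; 128x128x32 NT tiles get only the explicit pair.
    CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_64x128x32_descriptions, data_type_tf32, schedules)
    CreateGemmUniversal3xOperator(manifest, layouts_tf32_nt, tile_128x128x32_descriptions, data_type_tf32, [schedules[1]])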
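
Note on the Manifest.filter logging above: the new _LOGGER calls also document the filter precedence. The include list (--kernels) can switch a kernel on, the ignore list (--ignore-kernels) overrides it, and a kernel filter file, when one is supplied, has the final say in both directions. The sketch below condenses that precedence into a standalone function; it treats filters as plain substrings and uses a stand-in matches_filter_file callable, so it is an illustration of the ordering rather than the real method.

    def is_enabled(name, kernel_names, ignore_kernel_names, kernel_filter_list, matches_filter_file):
        # Enabled by default unless an include list was given and nothing in it matches.
        enabled = not kernel_names or any(s in name for s in kernel_names)
        # The ignore list overrides the include list.
        if any(s in name for s in ignore_kernel_names):
            enabled = False
        # A kernel filter file, when present, decides last in both directions.
        if kernel_filter_list:
            enabled = matches_filter_file(name, kernel_filter_list)
        return enabled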
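
Note on the profiler changes above: CutlassProfiler::operator() now propagates the return code of profile_(), profile_all() counts how many operations matched the current filters, and the new error_on_no_match option (set when --error-on-no-match accompanies --kernels or --ignore-kernels) turns "no kernel matched" into a nonzero exit code instead of a silent no-op. A hypothetical CI wrapper relying on that behavior; the binary path and the filter string are placeholders to adapt to your build tree.

    import subprocess
    import sys

    cmd = [
        "./tools/profiler/cutlass_profiler",  # placeholder path to the built profiler
        "--operation=Gemm",
        "--kernels=cutlass3x_sm90",           # placeholder filter; a typo here now fails loudly
        "--error-on-no-match",
    ]
    completed = subprocess.run(cmd)
    sys.exit(completed.returncode)            # nonzero when profiling fails or no kernel matched the filters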