Skip to content

Commit

Permalink
Updates for 3.1 (#932)
Browse files Browse the repository at this point in the history
  • Loading branch information
ANIKET-SHIVAM authored and ttl10101 committed Feb 7, 2024
1 parent c641ebc commit 8002dc3
Show file tree
Hide file tree
Showing 51 changed files with 1,796 additions and 328 deletions.
56 changes: 35 additions & 21 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -59,6 +59,8 @@ endif()

find_package(Doxygen QUIET)

################################################################################

#
# CUTLASS 3.x requires C++17
#
Expand All @@ -80,6 +82,10 @@ endif()

message(STATUS "Default Install Location: ${CMAKE_INSTALL_PREFIX}")

set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
# 0 - Sanity, 1 - Release-Quality, 2 - Exhaustive

################################################################################
set(CUTLASS_ENABLE_HEADERS_ONLY OFF CACHE BOOL "Enable only the header library")

if(CUTLASS_ENABLE_HEADERS_ONLY)
Expand Down Expand Up @@ -112,6 +118,8 @@ if (CUTLASS_ENABLE_TESTS)
include(${CMAKE_CURRENT_SOURCE_DIR}/cmake/googletest.cmake)
endif()

################################################################################

set(CUTLASS_NVCC_ARCHS_SUPPORTED "")
if (CUDA_VERSION VERSION_GREATER_EQUAL 11.4 AND NOT CUDA_COMPILER MATCHES "[Cc]lang")
list(APPEND CUTLASS_NVCC_ARCHS_SUPPORTED 70 72 75 80 86 87)
Expand Down Expand Up @@ -197,15 +205,16 @@ set(CUTLASS_NVCC_EMBED_PTX ON CACHE BOOL "Embed compiled PTX into executables.")
set(CUTLASS_NVCC_KEEP OFF CACHE BOOL "Keep intermediate files generated by NVCC.")
set(CUTLASS_ENABLE_F16C OFF CACHE BOOL "Enable F16C x86 extensions in host code.")

################################################################################
#
# CUTLASS generator cmake configuration
#

set(CUTLASS_LIBRARY_OPERATIONS "all" CACHE STRING "Comma delimited list of operation name filters. Default '' means all operations are enabled.")
set(CUTLASS_LIBRARY_KERNELS "" CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_KERNELS ${CUTLASS_LIBRARY_KERNELS_INIT} CACHE STRING "Comma delimited list of kernel name filters. If unspecified, only the largest tile size is enabled. If 'all' is specified, all kernels are enabled.")
set(CUTLASS_LIBRARY_IGNORE_KERNELS "" CACHE STRING "Comma delimited list of kernel names to exclude from build.")

# Test Levels L0, L1, L2
set(CUTLASS_TEST_LEVEL "0" CACHE STRING "Level of tests to compile.")
################################################################################

set(CUTLASS_TEST_ENABLE_CACHED_RESULTS ON CACHE BOOL "Enable caching and reuse of test results in unit tests")

Expand All @@ -225,6 +234,8 @@ if (CUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED)
list(APPEND CUTLASS_CUDA_NVCC_FLAGS -DCUTLASS_CONV_UNIT_TEST_RIGOROUS_SIZE_ENABLED=1)
endif()

################################################################################

#
# CUDA 10.1 introduces "mma" in PTX performing collective matrix multiply operations.
#
Expand Down Expand Up @@ -650,14 +661,16 @@ function(cutlass_add_executable_tests NAME TARGET)
# DEPENDS: A list of targets or files on which this test is dependent.
# DEPENDEES: A list of targets which should depend on this test.
# TEST_COMMAND_OPTIONS: A list of variables (i.e. by reference params) which contain command line arguments
# to pass to the test executable. A unique test with suffix _0, _1, ... is generated for each set of
# to pass to the test executable. A unique test is generated for each set of
# options given. If this option is not used, a single test with no arguments is generated.
# TEST_COMMAND_OPTIONS_PREFIX: If provided, is added as a prefix to each TEST_COMMAND_OPTIONS value for
# generating the full variable name to be referenced.
# RESULT_CACHE_FILE: A file to be installed alongside the test executable with pre-computed
# test results to speed up test runtime.
#

set(options DISABLE_EXECUTABLE_INSTALL_RULE)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE)
set(oneValueArgs DISABLE_TESTS RESULT_CACHE_FILE TEST_COMMAND_OPTIONS_PREFIX)
set(multiValueArgs DEPENDS DEPENDEES TEST_COMMAND_OPTIONS)
cmake_parse_arguments(_ "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})

Expand Down Expand Up @@ -701,7 +714,6 @@ function(cutlass_add_executable_tests NAME TARGET)
endif()

list(LENGTH __TEST_COMMAND_OPTIONS CMD_COUNT)
set(CMD_IDX 0)

if (CMD_COUNT GREATER 1)
add_custom_target(${NAME} DEPENDS ${TARGET} ${__DEPENDS})
Expand All @@ -710,12 +722,12 @@ function(cutlass_add_executable_tests NAME TARGET)
endforeach()
endif()

foreach(CMD_OPTIONS ${__TEST_COMMAND_OPTIONS})
foreach(CMD_OPTIONS_VAR IN LISTS __TEST_COMMAND_OPTIONS)

if (CMD_COUNT GREATER 1)
set(TEST_NAME ${NAME}_${CMD_IDX})
string(TOLOWER "${NAME}_${CMD_OPTIONS_VAR}" TEST_NAME)
else()
set(TEST_NAME ${NAME})
string(TOLOWER "${NAME}" TEST_NAME)
endif()

# The following rigmarole is needed to deal with spaces and possible quotes in
Expand All @@ -724,14 +736,14 @@ function(cutlass_add_executable_tests NAME TARGET)
# preserves any quotes. Note, they have to be in this order for it to work for
# all the use cases below.

set(CMD_OPTIONS ${${CMD_OPTIONS}})
list(JOIN CMD_OPTIONS " " TEST_COMMAND_OPTIONS)
separate_arguments(CMD_OPTIONS)
set(TEST_COMMAND_OPTIONS ${${__TEST_COMMAND_OPTIONS_PREFIX}${CMD_OPTIONS_VAR}})
list(JOIN TEST_COMMAND_OPTIONS " " TEST_COMMAND_OPTIONS)
separate_arguments(TEST_COMMAND_OPTIONS)

add_custom_target(
${TEST_NAME}
COMMAND
${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${TEST_COMMAND_OPTIONS}
DEPENDS
${TARGET}
)
Expand All @@ -746,7 +758,7 @@ function(cutlass_add_executable_tests NAME TARGET)

add_test(
NAME c${TEST_NAME}
COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${CMD_OPTIONS}
COMMAND ${CUTLASS_TEST_EXECUTION_ENVIRONMENT} $<TARGET_FILE:${TARGET}> ${TEST_COMMAND_OPTIONS}
)

set_tests_properties(c${TEST_NAME} PROPERTIES DISABLED ${__DISABLE_TESTS})
Expand All @@ -756,27 +768,28 @@ function(cutlass_add_executable_tests NAME TARGET)
# To run the tests from an install package with tests enabled, we need to generate test files
# that don't rely on the current directory structure in build.

set(TEST_GEN_DIR ${CMAKE_CURRENT_BINARY_DIR}/${NAME})
file(MAKE_DIRECTORY ${TEST_GEN_DIR})

set(TEST_NAME c${TEST_NAME})
set(TEST_EXE $<TARGET_FILE_NAME:${TARGET}>)
set(TEST_EXE_WORKING_DIRECTORY ./${CMAKE_INSTALL_BINDIR})
configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY)
configure_file("${CUTLASS_CTEST_TEMPLATE_FILE}" "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake" @ONLY)

file(GENERATE
OUTPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake"
INPUT "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.config.cmake"
OUTPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake"
INPUT "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.config.cmake"
)

install(
FILES "${CMAKE_PROJECT_DIR}${CMAKE_CURRENT_BINARY_DIR}/CTestTestfile.${TEST_NAME}.cmake"
FILES "${TEST_GEN_DIR}/CTestTestfile.${TEST_NAME}.cmake"
DESTINATION ${CUTLASS_TEST_INSTALL_PREFIX}/ctest/
)

set(CUTLASS_CTEST_GENERATED_FILES ${CUTLASS_CTEST_GENERATED_FILES};ctest/CTestTestfile.${TEST_NAME}.cmake CACHE INTERNAL "")

endif()

math(EXPR CMD_IDX "${CMD_IDX} + 1")

endforeach()

endfunction()
Expand All @@ -787,6 +800,7 @@ if (CUTLASS_ENABLE_TOOLS)
add_dependencies(test_all test_profiler)
endif()
endif()

if (CUTLASS_ENABLE_EXAMPLES)
add_subdirectory(examples)
add_dependencies(test_all test_examples)
Expand Down
1 change: 1 addition & 0 deletions examples/08_turing_tensorop_gemm/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,6 @@
cutlass_example_add_executable(
08_turing_tensorop_gemm
turing_tensorop_gemm.cu
DISABLE_TESTS ON
)

1 change: 1 addition & 0 deletions examples/12_gemm_bias_relu/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,5 +31,6 @@
cutlass_example_add_executable(
12_gemm_bias_relu
gemm_bias_relu.cu
DISABLE_TESTS ON
)

2 changes: 1 addition & 1 deletion examples/31_basic_syrk/basic_syrk.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
matrix multiply kernel to verify its correctness.
The CUTLASS Syrk template is instantiated in the function CutlassSsyrkNN. This is kernel computes
the symmetric rank-k update (SYRK) using double-precision doubleing-point arithmetic and assumes
the symmetric rank-k update (SYRK) using double-precision floating-point arithmetic and assumes
all matrices have column-major layout.
The threadblock tile size is chosen as 16x32x16 which offers good performance for large matrices.
Expand Down
2 changes: 1 addition & 1 deletion examples/32_basic_trmm/basic_trmm.cu
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@
matrix multiply kernel to verify its correctness.
The CUTLASS Trmm template is instantiated in the function CutlassStrmmNN. This is kernel computes
the triangular matrix product (TRMM) using double-precision doubleing-point arithmetic and assumes
the triangular matrix product (TRMM) using double-precision floating-point arithmetic and assumes
all matrices have column-major layout.
The threadblock tile size is chosen as 64x64x16 which offers good performance for large matrices.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -495,23 +495,23 @@ int main(int argc, const char **argv)
options.tensor_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from CUTLASS kernel
options.tensor_ref_d.resize(options.problem_size.mn()); // <- Create matrix D with dimensions M x N used to store output from reference kernel

// Fill matrix A on host with uniform-random data [4, -4]
// Fill matrix A on host with uniform-random data [2, -2]
cutlass::reference::host::TensorFillRandomUniform(
options.tensor_a.host_view(),
1,
ElementA(2),
ElementA(-2),
0);

// Fill matrix B on host with uniform-random data [4, -4]
// Fill matrix B on host with uniform-random data [2, -2]
cutlass::reference::host::TensorFillRandomUniform(
options.tensor_b.host_view(),
1,
ElementB(2),
ElementB(-2),
0);

// Fill matrix C on host with uniform-random data [4, -4]
// Fill matrix C on host with uniform-random data [2, -2]
cutlass::reference::host::TensorFillRandomUniform(
options.tensor_c.host_view(),
1,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -84,9 +84,10 @@
therefore letting the builder pick the collective specialization.
CUTLASS builders make an attempt to pick the best schedule when `Auto` is provided such that the
assembled collctives have the best performance, but this is not a guarantee. A user relying on `Auto`
assembled collectives have the best performance, but this is not a guarantee. A user relying on `Auto`
may get a free performance upgrade with newer CUTLASS releases in case we can provide more optimized
implementations that the builder can transparently assemble for `Auto`.
implementations that the builder can transparently assemble for `Auto`. But a user should not rely on
`Auto` if they require a specific scheduling policy and/or stage count to be used.
If a user decides to let the builders pick the collective specialization via `Auto` schedules,
they must be used for both mainloop and epilogue alike to ensure compatibility between the
Expand All @@ -99,11 +100,6 @@
in this manner remains the primary API for using CUTLASS 3 kernels. `CollectiveBuilder`s are
simply meant to be a convenience interface.
Note also that, while the selections made by CollectiveBuilder attempt to maximize performance, this is not
a guarantee. Furthermore, the behavior of the CollectiveBuilder when `Auto` parameters are provided is subject
to change in future CUTLASS releases -- do not rely on `Auto` if you require a specific scheduling policy and/or
stage count to be used.
Details of this example
-----------------------
This example walks through the use of the CollectiveBuilder with various schedules and stage counts specified.
Expand Down
13 changes: 13 additions & 0 deletions include/cute/arch/cluster_sm90.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -227,4 +227,17 @@ elect_one_leader_sync()
#endif
}

// Store value to remote shared memory in the cluster
CUTE_DEVICE
void
store_shared_remote(uint32_t value, uint32_t smem_addr, uint32_t mbarrier_addr, uint32_t dst_cta_rank)
{
#if defined(CUTE_ARCH_CLUSTER_SM90_ENABLED)
uint32_t dsmem_addr = set_block_rank(smem_addr, dst_cta_rank);
uint32_t remote_barrier_addr = set_block_rank(mbarrier_addr, dst_cta_rank);
asm volatile("st.async.shared::cluster.mbarrier::complete_tx::bytes.u32 [%0], %1, [%2];"
: : "r"(dsmem_addr), "r"(value), "r"(remote_barrier_addr));
#endif
}

} // end namespace cute
Loading

0 comments on commit 8002dc3

Please sign in to comment.