diff --git a/cgmanifests/generated/cgmanifest.json b/cgmanifests/generated/cgmanifest.json index 14246144d59b0..3feb6f95ec025 100644 --- a/cgmanifests/generated/cgmanifest.json +++ b/cgmanifests/generated/cgmanifest.json @@ -296,7 +296,7 @@ "component": { "type": "git", "git": { - "commitHash": "58817bf967d3b7b314233f6331f0430cf40651ae", + "commitHash": "04f93aadb8fca26c141f70ab19710c460b46f6c0", "repositoryUrl": "https://github.com/ROCmSoftwarePlatform/composable_kernel.git" }, "comments": "composable_kernel" diff --git a/cmake/deps.txt b/cmake/deps.txt index 91fabbbf9386d..b12fe750923f3 100644 --- a/cmake/deps.txt +++ b/cmake/deps.txt @@ -44,4 +44,4 @@ tensorboard;https://github.com/tensorflow/tensorboard/archive/373eb09e4c5d2b3cc2 cutlass;https://github.com/NVIDIA/cutlass/archive/refs/tags/v3.0.0.zip;0f95b3c1fc1bd1175c4a90b2c9e39074d1bccefd utf8_range;https://github.com/protocolbuffers/utf8_range/archive/72c943dea2b9240cd09efde15191e144bc7c7d38.zip;9925739c9debc0efa2adcb194d371a35b6a03156 extensions;https://github.com/microsoft/onnxruntime-extensions/archive/94142d8391c9791ec71c38336436319a2d4ac7a0.zip;4365ac5140338b4cb75a39944a4be276e3829b3c -composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/58817bf967d3b7b314233f6331f0430cf40651ae.zip;070c01f2ffdde6f515f3075b8fff3ea2ad819827 +composable_kernel;https://github.com/ROCmSoftwarePlatform/composable_kernel/archive/04f93aadb8fca26c141f70ab19710c460b46f6c0.zip;cd7083c2205b1a4fb6c1f9b7b0e597b8609f3f38 diff --git a/cmake/external/composable_kernel.cmake b/cmake/external/composable_kernel.cmake index a38ccbc9e9335..d464fdaa8eb07 100644 --- a/cmake/external/composable_kernel.cmake +++ b/cmake/external/composable_kernel.cmake @@ -12,7 +12,7 @@ if(NOT composable_kernel_POPULATED) FetchContent_Populate(composable_kernel) set(BUILD_DEV OFF CACHE BOOL "Disable -Weverything, otherwise, error: 'constexpr' specifier is incompatible with C++98 [-Werror,-Wc++98-compat]" FORCE) # Exclude i8 device gemm instances due to excessive long compilation time and not being used - set(DTYPES fp32 fp16 bf16) + set(DTYPES fp32 fp16 bf16 fp8) set(INSTANCES_ONLY ON) add_subdirectory(${composable_kernel_SOURCE_DIR} ${composable_kernel_BINARY_DIR} EXCLUDE_FROM_ALL) @@ -21,5 +21,5 @@ if(NOT composable_kernel_POPULATED) ${composable_kernel_SOURCE_DIR}/include ${composable_kernel_BINARY_DIR}/include ${composable_kernel_SOURCE_DIR}/library/include) - target_compile_definitions(onnxruntime_composable_kernel_includes INTERFACE __fp32__ __fp16__ __bf16__) + target_compile_definitions(onnxruntime_composable_kernel_includes INTERFACE __fp32__ __fp16__ __bf16__ __fp8__) endif() diff --git a/cmake/patches/composable_kernel/Fix_Clang_Build.patch b/cmake/patches/composable_kernel/Fix_Clang_Build.patch index 142963d511780..ea7a2d91ee5ab 100644 --- a/cmake/patches/composable_kernel/Fix_Clang_Build.patch +++ b/cmake/patches/composable_kernel/Fix_Clang_Build.patch @@ -1,8 +1,8 @@ diff --git a/CMakeLists.txt b/CMakeLists.txt -index 32defea75..8aa431e9c 100644 +index b09da41a8..fca2bdf69 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt -@@ -15,7 +15,7 @@ endif() +@@ -19,7 +19,7 @@ endif() set(version 1.1.0) # Check support for CUDA/HIP in Cmake @@ -11,7 +11,7 @@ index 32defea75..8aa431e9c 100644 list(APPEND CMAKE_MODULE_PATH "${PROJECT_SOURCE_DIR}/cmake") -@@ -167,27 +167,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) +@@ -173,27 +173,6 @@ set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) message("CMAKE_CXX_COMPILER_ID: ${CMAKE_CXX_COMPILER_ID}") @@ -39,7 +39,7 @@ index 32defea75..8aa431e9c 100644 ## HIP find_package(HIP REQUIRED) # Override HIP version in config.h, if necessary. -@@ -209,8 +188,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) +@@ -215,8 +194,6 @@ if( DEFINED CK_OVERRIDE_HIP_VERSION_PATCH ) message(STATUS "CK_HIP_VERSION_PATCH overriden with ${CK_OVERRIDE_HIP_VERSION_PATCH}") endif() message(STATUS "Build with HIP ${HIP_VERSION}") @@ -48,7 +48,7 @@ index 32defea75..8aa431e9c 100644 ## tidy include(EnableCompilerWarnings) -@@ -483,11 +460,3 @@ rocm_install(FILES +@@ -489,11 +466,3 @@ rocm_install(FILES set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE") set(CPACK_RPM_PACKAGE_LICENSE "MIT") @@ -60,21 +60,44 @@ index 32defea75..8aa431e9c 100644 - LDCONFIG - HEADER_ONLY -) +diff --git a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp +index 4cba0875d..07b0adb42 100644 +--- a/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp ++++ b/library/include/ck/library/tensor_operation_instance/gpu/gemm_splitk.hpp +@@ -58,7 +58,7 @@ void add_device_gemm_xdl_splitk_f32_f32_f32_mk_nk_mn_instances( + DeviceGemmSplitK>>& + instances); + #endif +-#if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8)) ++#if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)) + void add_device_gemm_xdl_splitk_f8_f16_f16_km_kn_mn_instances( + std::vector>>& +@@ -182,7 +182,7 @@ struct DeviceOperationInstanceFactory< + } + } + #endif +-#if(defined(CK_ENABLE_FP16) || defined(CK_ENABLE_FP8)) ++#if(defined(CK_ENABLE_FP16) && defined(CK_ENABLE_FP8)) + else if constexpr(is_same_v && is_same_v && + is_same_v) + { diff --git a/library/src/tensor_operation_instance/gpu/CMakeLists.txt b/library/src/tensor_operation_instance/gpu/CMakeLists.txt -index 1d54a141b..4edd7dbfb 100644 +index a0478c9f0..1e7782cd4 100644 --- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt +++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt -@@ -1,7 +1,13 @@ - function(add_instance_library INSTANCE_NAME) - message("adding instance ${INSTANCE_NAME}") -+ set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) - add_library(${INSTANCE_NAME} OBJECT ${ARGN}) -+ # Always disable debug symbol and C debug assert due to -+ # - Linker error: ... relocation truncated to fit ..., caused by object files to be linked are too huge. -+ # - https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/622 -+ target_compile_options(${INSTANCE_NAME} PRIVATE -g0 -DNDEBUG) - target_compile_features(${INSTANCE_NAME} PUBLIC) -+ target_compile_definitions(${INSTANCE_NAME} PRIVATE "__HIP_PLATFORM_AMD__=1" "__HIP_PLATFORM_HCC__=1") - set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) - clang_tidy_check(${INSTANCE_NAME}) - endfunction(add_instance_library INSTANCE_NAME) +@@ -44,8 +44,14 @@ function(add_instance_library INSTANCE_NAME) + endforeach() + #only continue if there are some source files left on the list + if(ARGN) ++ set_source_files_properties(${ARGN} PROPERTIES LANGUAGE HIP) + add_library(${INSTANCE_NAME} OBJECT ${ARGN}) ++ # Always disable debug symbol and C debug assert due to ++ # - Linker error: ... relocation truncated to fit ..., caused by object files to be linked are too huge. ++ # - https://github.com/ROCmSoftwarePlatform/composable_kernel/issues/622 ++ target_compile_options(${INSTANCE_NAME} PRIVATE -g0 -DNDEBUG) + target_compile_features(${INSTANCE_NAME} PUBLIC) ++ target_compile_definitions(${INSTANCE_NAME} PRIVATE "__HIP_PLATFORM_AMD__=1" "__HIP_PLATFORM_HCC__=1") + set_target_properties(${INSTANCE_NAME} PROPERTIES POSITION_INDEPENDENT_CODE ON) + clang_tidy_check(${INSTANCE_NAME}) + set(result 0)