Skip to content

Commit

Permalink
Enable gfx12 support (#577)
Browse files Browse the repository at this point in the history
* Enable gfx12 support (#9)

* Initial support for gfx12

* Remove garbage file
  • Loading branch information
stanleytsang-amd authored Jul 3, 2024
1 parent db30c5b commit b245064
Show file tree
Hide file tree
Showing 5 changed files with 89 additions and 77 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ if(NOT USE_HIP_CPU)

# When GPU_TARGETS is the literal "all", replace it with the project's default
# architecture list. rocm_check_target_ids writes its result into
# DEFAULT_AMDGPU_TARGETS (presumably the subset of TARGETS the toolchain
# accepts -- confirm against the ROCm CMake documentation); that value is then
# forced into the GPU_TARGETS cache entry.
if(GPU_TARGETS STREQUAL "all")
rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102"
TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102;gfx1200;gfx1201"
)
set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE)
endif()
Expand Down
6 changes: 3 additions & 3 deletions rocprim/include/rocprim/iterator/texture_cache_iterator.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -221,9 +221,9 @@ class texture_cache_iterator
#else
texture_type words[multiple];

#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__)
#pragma message "Texture cache iterator is not supported on gfx94x as the texture fetch functions in HIP are not available."
ROCPRIM_PRINT_ERROR_ONCE("WARNING: Usage of texture_cache_iterator on gfx94x device is not supported and will not produce valid results.")
#if defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__) || defined(__gfx1200__) || defined(__gfx1201__)
#pragma message "Texture cache iterator is not supported on gfx94x or gfx120x as the texture fetch functions in HIP are not available."
ROCPRIM_PRINT_ERROR_ONCE("WARNING: Usage of texture_cache_iterator on gfx94x or gfx120x devices is not supported and will not produce valid results.")
#else
ROCPRIM_UNROLL
for(unsigned int i = 0; i < multiple; i++)
Expand Down
90 changes: 48 additions & 42 deletions rocprim/include/rocprim/thread/thread_load.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -59,59 +59,65 @@ ROCPRIM_DEVICE __forceinline__ T AsmThreadLoad(void * ptr)

#if ROCPRIM_THREAD_LOAD_USE_CACHE_MODIFIERS == 1

// Important for syncing. Check section 9.2.2 or 7.3 in the following document
// http://developer.amd.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf
//
// Generates the explicit specialization AsmThreadLoad<cache_modifier, type>.
// The generated function issues `asm_operator` on `ptr` with `llvm_cache_modifier`
// appended, then "s_waitcnt <wait_cmd>(0)", and returns the asm output.
//   cache_modifier      - load modifier value the specialization is keyed on
//   llvm_cache_modifier - string literal appended to the load instruction
//   type                - return type of the specialization
//   interim_type        - type of the local the asm writes into; it is implicitly
//                         converted to `type` by the return statement
//   asm_operator        - load mnemonic (stringified with #)
//   output_modifier     - constraint letter for the asm output operand (stringified after "=")
//   wait_cmd            - string literal placed between "s_waitcnt " and "(0)"; may be empty
#define ROCPRIM_ASM_THREAD_LOAD(cache_modifier, \
llvm_cache_modifier, \
type, \
interim_type, \
asm_operator, \
output_modifier, \
wait_cmd) \
template<> \
ROCPRIM_DEVICE __forceinline__ type AsmThreadLoad<cache_modifier, type>(void* ptr) \
{ \
interim_type retval; \
asm volatile(#asm_operator " %0, %1 " llvm_cache_modifier "\n\t" \
"s_waitcnt " wait_cmd "(%2)" \
: "=" #output_modifier(retval) \
: "v"(ptr), "I"(0x00)); \
return retval; \
}
// Important for syncing. Check section 9.2.2 or 7.3 in the following document
// http://developer.amd.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf
//
// Generates the explicit specialization AsmThreadLoad<cache_modifier, type>.
// The generated function issues `asm_operator` on `ptr` with `llvm_cache_modifier`
// appended, then "<wait_inst> <wait_cmd>(0)", and returns the loaded value.
//   cache_modifier      - load modifier value the specialization is keyed on
//   llvm_cache_modifier - string literal appended to the load instruction
//   type                - return type of the specialization
//   interim_type        - type of the asm output operand; must be at least as large as `type`
//   asm_operator        - load mnemonic (stringified with #)
//   output_modifier     - constraint letter for the asm output operand (stringified after "=")
//   wait_inst           - string literal naming the wait instruction
//   wait_cmd            - string literal naming the counter to wait on; may be empty
//
// Adjacent string literals are concatenated verbatim, so an explicit " " is emitted between
// wait_inst and wait_cmd; without it "s_waitcnt" "vmcnt" becomes the mnemonic "s_waitcntvmcnt".
//
// The loaded bits are copied into the result instead of being value-converted, so float/double
// (loaded through uint32_t/uint64_t) keep their bit pattern. For the 8-bit types the low-order
// byte of the interim value is taken (NOTE(review): relies on a little-endian target - confirm).
#define ROCPRIM_ASM_THREAD_LOAD(cache_modifier,                                            \
                                llvm_cache_modifier,                                       \
                                type,                                                      \
                                interim_type,                                              \
                                asm_operator,                                              \
                                output_modifier,                                           \
                                wait_inst,                                                 \
                                wait_cmd)                                                  \
    template<>                                                                             \
    ROCPRIM_DEVICE __forceinline__ type AsmThreadLoad<cache_modifier, type>(void* ptr)     \
    {                                                                                      \
        static_assert(sizeof(type) <= sizeof(interim_type),                                \
                      "interim_type must be at least as large as type");                   \
        interim_type raw_bits;                                                             \
        asm volatile(#asm_operator " %0, %1 " llvm_cache_modifier "\n\t"                   \
                     wait_inst " " wait_cmd "(%2)"                                         \
                     : "=" #output_modifier(raw_bits)                                      \
                     : "v"(ptr), "I"(0x00));                                               \
        type result;                                                                       \
        __builtin_memcpy(&result, &raw_bits, sizeof(type));                                \
        return result;                                                                     \
    }

// TODO Add specialization for custom larger data types
// Expands ROCPRIM_ASM_THREAD_LOAD once per supported value type (int8_t, int16_t, uint8_t,
// uint16_t, uint32_t, float, uint64_t, double) for one cache modifier. The 8-bit types use a
// 16-bit interim type; float and double use uint32_t and uint64_t as interim types.
// Two definitions follow: the first forwards only wait_cmd, the second also forwards wait_inst.
#define ROCPRIM_ASM_THREAD_LOAD_GROUP(cache_modifier, llvm_cache_modifier, wait_cmd) \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_load_sbyte, v, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_load_sshort, v, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_load_ubyte, v, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_load_ushort, v, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_load_dword, v, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_load_dword, v, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_load_dwordx2, v, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_load_dwordx2, v, wait_cmd);
#define ROCPRIM_ASM_THREAD_LOAD_GROUP(cache_modifier, llvm_cache_modifier, wait_inst, wait_cmd) \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_load_sbyte, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_load_sshort, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_load_ubyte, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_load_ushort, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_load_dword, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_load_dword, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_load_dwordx2, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_LOAD(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_load_dwordx2, v, wait_inst, wait_cmd);

// [HIP-CPU] MSVC: erroneous inline assembly specification (Triggers error C2059: syntax error: 'volatile')
// The asm specializations are only generated when not building for the HIP-CPU runtime.
#ifndef __HIP_CPU_RT__
// Pick the cache-modifier string and wait arguments for each load modifier by target architecture.
#if defined(__gfx940__) || defined(__gfx941__)
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "sc0", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "sc1", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "sc0 sc1", "vmcnt");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "sc0 sc1", "vmcnt");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "sc0", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "sc1", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "sc0 sc1", "s_waitcnt", "vmcnt");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "sc0 sc1", "s_waitcnt", "vmcnt");
#elif defined(__gfx942__)
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "sc0", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "sc0 nt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "sc0", "vmcnt");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "sc0", "vmcnt");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "sc0", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "sc0 nt", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "sc0", "s_waitcnt", "vmcnt");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "sc0", "s_waitcnt", "vmcnt");
// gfx120x passes th:/scope: modifier strings and s_wait_loadcnt_dscnt as the wait instruction.
#elif defined(__gfx1200__) || defined(__gfx1201__)
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "scope:SCOPE_DEV", "s_wait_loadcnt_dscnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_loadcnt_dscnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_loadcnt_dscnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_loadcnt_dscnt", "");
// Every other architecture uses the glc/slc modifier strings.
#else
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "glc", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "glc slc", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "glc", "vmcnt");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "glc", "vmcnt");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ca, "glc", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cg, "glc slc", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cv, "glc", "s_waitcnt", "vmcnt");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_volatile, "glc", "s_waitcnt", "vmcnt");
#endif

// TODO find correct modifiers to match these
// load_ldg and load_cs are generated with an empty cache-modifier string on every architecture.
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ldg, "", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cs, "", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_ldg, "", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_LOAD_GROUP(load_cs, "", "s_waitcnt", "");
#endif // __HIP_CPU_RT__

#endif
Expand Down
62 changes: 34 additions & 28 deletions rocprim/include/rocprim/thread/thread_store.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -62,54 +62,60 @@ ROCPRIM_DEVICE __forceinline__ void AsmThreadStore(void * ptr, T val)

// Important for syncing. Check section 9.2.2 or 7.3 in the following document
// http://developer.amd.com/wordpress/media/2013/12/AMD_GCN3_Instruction_Set_Architecture_rev1.1.pdf
// Generates the explicit specialization AsmThreadStore<cache_modifier, type>.
// The generated function issues `asm_operator` on `ptr` with `llvm_cache_modifier`
// appended, then "<wait_inst> <wait_cmd>(0)".
//   cache_modifier      - store modifier value the specialization is keyed on
//   llvm_cache_modifier - string literal appended to the store instruction
//   type                - type of the value passed by the caller
//   interim_type        - type of the asm input operand; must be at least as large as `type`
//   asm_operator        - store mnemonic (stringified with #)
//   output_modifier     - constraint letter for the value operand (stringified)
//   wait_inst           - string literal naming the wait instruction
//   wait_cmd            - string literal naming the counter to wait on; may be empty
//
// Adjacent string literals are concatenated verbatim, so an explicit " " is emitted between
// wait_inst and wait_cmd; without it "s_waitcnt" "vmcnt" becomes the mnemonic "s_waitcntvmcnt".
//
// The value's bits are copied into the interim operand instead of being value-converted, so
// float/double (stored through uint32_t/uint64_t) keep their bit pattern. For the 8-bit types
// the byte lands in the low-order byte of the zero-initialized interim value
// (NOTE(review): relies on a little-endian target - confirm).
#define ROCPRIM_ASM_THREAD_STORE(cache_modifier,                                                       \
                                 llvm_cache_modifier,                                                  \
                                 type,                                                                 \
                                 interim_type,                                                         \
                                 asm_operator,                                                         \
                                 output_modifier,                                                      \
                                 wait_inst,                                                            \
                                 wait_cmd)                                                             \
    template<>                                                                                         \
    ROCPRIM_DEVICE __forceinline__ void AsmThreadStore<cache_modifier, type>(void * ptr, type val)     \
    {                                                                                                  \
        static_assert(sizeof(type) <= sizeof(interim_type),                                            \
                      "interim_type must be at least as large as type");                               \
        interim_type raw_bits = 0;                                                                     \
        __builtin_memcpy(&raw_bits, &val, sizeof(type));                                               \
        asm volatile(#asm_operator " %0, %1 " llvm_cache_modifier "\n\t"                               \
                     wait_inst " " wait_cmd "(%2)"                                                     \
                     :                                                                                 \
                     : "v"(ptr), #output_modifier(raw_bits), "I"(0x00));                               \
    }

// TODO fix flat_store_ubyte and flat_store_sbyte issues
// TODO Add specialization for custom larger data types
// Expands ROCPRIM_ASM_THREAD_STORE once per supported value type (int8_t, int16_t, uint8_t,
// uint16_t, uint32_t, float, uint64_t, double) for one cache modifier. Signed and unsigned
// variants share flat_store_byte / flat_store_short; the 8-bit types use a 16-bit interim type,
// and float and double use uint32_t and uint64_t as interim types.
// Two definitions follow: the first forwards only wait_cmd, the second also forwards wait_inst.
#define ROCPRIM_ASM_THREAD_STORE_GROUP(cache_modifier, llvm_cache_modifier, wait_cmd) \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_store_byte, v, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_store_short, v, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_store_byte, v, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_store_short, v, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_store_dword, v, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_store_dword, v, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_store_dwordx2, v, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_store_dwordx2, v, wait_cmd);
#define ROCPRIM_ASM_THREAD_STORE_GROUP(cache_modifier, llvm_cache_modifier, wait_inst, wait_cmd) \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int8_t, int16_t, flat_store_byte, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, int16_t, int16_t, flat_store_short, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint8_t, uint16_t, flat_store_byte, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint16_t, uint16_t, flat_store_short, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint32_t, uint32_t, flat_store_dword, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, float, uint32_t, flat_store_dword, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, uint64_t, uint64_t, flat_store_dwordx2, v, wait_inst, wait_cmd); \
ROCPRIM_ASM_THREAD_STORE(cache_modifier, llvm_cache_modifier, double, uint64_t, flat_store_dwordx2, v, wait_inst, wait_cmd);

// [HIP-CPU] MSVC: erroneous inline assembly specification (Triggers error C2059: syntax error: 'volatile')
// The asm specializations are only generated when not building for the HIP-CPU runtime.

#ifndef __HIP_CPU_RT__
// Pick the cache-modifier string and wait arguments for each store modifier by target architecture.
#if defined(__gfx940__) || defined(__gfx941__)
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "sc0 sc1", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "sc0 sc1", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "sc0 sc1", "vmcnt");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "sc0 sc1", "vmcnt");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "sc0 sc1", "s_waitcnt", ""); // TODO: gfx942 validation
ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "sc0 sc1", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "sc0 sc1", "s_waitcnt", "vmcnt");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "sc0 sc1", "s_waitcnt", "vmcnt");
#elif defined(__gfx942__)
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "sc0", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "sc0 nt", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "sc0", "vmcnt");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "sc0", "vmcnt");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "sc0", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "sc0 nt", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "sc0", "s_waitcnt", "vmcnt");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "sc0", "s_waitcnt", "vmcnt");
// gfx120x passes th:/scope: modifier strings and s_wait_storecnt_dscnt as the wait instruction.
#elif defined(__gfx1200__) || defined(__gfx1201__)
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "scope:SCOPE_DEV", "s_wait_storecnt_dscnt", ""); // TODO: gfx120x validation (previously read "gfx942"; presumably a copy-paste slip - confirm)
ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "th:TH_DEFAULT scope:SCOPE_DEV", "s_wait_storecnt_dscnt", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "scope:SCOPE_DEV", "s_wait_storecnt_dscnt", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "scope:SCOPE_DEV", "s_wait_storecnt_dscnt", "");
// Every other architecture uses the glc/slc modifier strings.
#else
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "glc", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "glc slc", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "glc", "vmcnt");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "glc", "vmcnt");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wb, "glc", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_cg, "glc slc", "s_waitcnt", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_wt, "glc", "s_waitcnt", "vmcnt");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_volatile, "glc", "s_waitcnt", "vmcnt");
#endif
// TODO find correct modifiers to match these
// store_cs is generated with an empty cache-modifier string on every architecture.
ROCPRIM_ASM_THREAD_STORE_GROUP(store_cs, "", "");
ROCPRIM_ASM_THREAD_STORE_GROUP(store_cs, "", "s_waitcnt", "");
#endif // __HIP_CPU_RT__

#endif
Expand Down
6 changes: 3 additions & 3 deletions test/rocprim/test_texture_cache_iterator.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,9 @@ TYPED_TEST(RocprimTextureCacheIteratorTests, Transform)
hipDeviceProp_t props;
HIP_CHECK(hipGetDeviceProperties(&props, device_id));
std::string deviceName = std::string(props.gcnArchName);
if (deviceName.rfind("gfx94", 0) == 0) {
// This is a gfx94x device, so skip this test
GTEST_SKIP() << "Test not run on gfx94x as texture cache API is not supported";
if (deviceName.rfind("gfx94", 0) == 0 || deviceName.rfind("gfx120") == 0) {
// This is a gfx94x or gfx120x device, so skip this test
GTEST_SKIP() << "Test not run on gfx94x or gfx120x as texture cache API is not supported";
}

HIP_CHECK(hipSetDevice(device_id));
Expand Down

0 comments on commit b245064

Please sign in to comment.