From 4bc515aa62368c6189d19e14b3ec18cb6dd4415e Mon Sep 17 00:00:00 2001 From: jiabaxie Date: Tue, 16 Jan 2024 22:33:40 -0500 Subject: [PATCH 01/27] SWDEV-441487 - Strix Halo Support and Strix support in staging Change-Id: I96f009b417869f86a71a7683f51089f4b0461893 --- rocclr/cmake/ROCclrPAL.cmake | 2 ++ rocclr/device/device.cpp | 4 ++-- rocclr/device/pal/paldevice.cpp | 2 ++ rocclr/device/pal/palsettings.cpp | 2 ++ 4 files changed, 8 insertions(+), 2 deletions(-) diff --git a/rocclr/cmake/ROCclrPAL.cmake b/rocclr/cmake/ROCclrPAL.cmake index 3cc497e0f..da0446365 100644 --- a/rocclr/cmake/ROCclrPAL.cmake +++ b/rocclr/cmake/ROCclrPAL.cmake @@ -43,6 +43,8 @@ set(PAL_BUILD_NAVI31 ON) set(PAL_BUILD_NAVI32 ON) set(PAL_BUILD_NAVI33 ON) set(PAL_BUILD_PHOENIX1 ON) +set(PAL_BUILD_STRIX1 ON) +set(PAL_BUILD_STRIX_HALO ON) find_package(AMD_PAL) find_package(AMD_HSA_LOADER) diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp index ac9100310..b9640e890 100644 --- a/rocclr/device/device.cpp +++ b/rocclr/device/device.cpp @@ -219,8 +219,8 @@ std::pair Isa::supportedIsas() { {"gfx1101", "gfx1101", true, true, 11, 0, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1102", "gfx1102", true, true, 11, 0, 2, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, {"gfx1103", "gfx1103", true, true, 11, 0, 3, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, - {"gfx1150", "gfx1150", true, true, 11, 5, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, - {"gfx1151", "gfx1151", true, true, 11, 5, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, + {"gfx1150", "gfx1150", true, true, 11, 5, 0, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, //Strix + {"gfx1151", "gfx1151", true, true, 11, 5, 1, NONE, NONE, 2, 32, 1, 256, 64 * Ki, 32}, //Strix Halo }; return std::make_pair(std::begin(supportedIsas_), std::end(supportedIsas_)); } diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index 6f65ad422..3be791151 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -113,6 +113,8 @@ static constexpr PalDevice supportedPalDevices[] = { {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::Phoenix2}, {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::HawkPoint1}, {11, 0, 3, Pal::GfxIpLevel::GfxIp11_0, "gfx1103", Pal::AsicRevision::HawkPoint2}, + {11, 5, 0, Pal::GfxIpLevel::GfxIp11_5, "gfx1150", Pal::AsicRevision::Strix1}, + {11, 5, 1, Pal::GfxIpLevel::GfxIp11_5, "gfx1151", Pal::AsicRevision::StrixHalo}, }; static std::tuple findIsa(Pal::AsicRevision asicRevision, diff --git a/rocclr/device/pal/palsettings.cpp b/rocclr/device/pal/palsettings.cpp index a5d08c50b..ee85a1cd8 100644 --- a/rocclr/device/pal/palsettings.cpp +++ b/rocclr/device/pal/palsettings.cpp @@ -170,6 +170,8 @@ bool Settings::create(const Pal::DeviceProperties& palProp, amd::Os::getAppPathAndFileName(appName, appPathAndName); switch (palProp.revision) { + case Pal::AsicRevision::StrixHalo: + case Pal::AsicRevision::Strix1: // Fall through for Navi3x ... case Pal::AsicRevision::Navi33: case Pal::AsicRevision::Navi32: From 031addf37834c911afd4a2403f8ec9eb1364202b Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Thu, 16 Nov 2023 01:11:09 +0000 Subject: [PATCH 02/27] SWDEV-431367 - fix float compare for atomicMax/Min where -0.0 < +0.0 Change-Id: Ief99aac3775b0a0c6372215e5a0a9ecbbdca4ca5 --- .../include/hip/amd_detail/amd_hip_atomic.h | 42 +++++++++++++++---- 1 file changed, 34 insertions(+), 8 deletions(-) diff --git a/hipamd/include/hip/amd_detail/amd_hip_atomic.h b/hipamd/include/hip/amd_detail/amd_hip_atomic.h index c02a57b07..e35a79abd 100644 --- a/hipamd/include/hip/amd_detail/amd_hip_atomic.h +++ b/hipamd/include/hip/amd_detail/amd_hip_atomic.h @@ -612,11 +612,17 @@ float atomicMin(float* addr, float val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMin(addr, val); #else + typedef union u_hold { + float a; + unsigned int b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x80000000U == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value > val) { + while (!done && (value > val || (neg_zero && value == 0.0f))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -625,7 +631,7 @@ float atomicMin(float* addr, float val) { unsigned int *uaddr = (unsigned int *)addr; unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __uint_as_float(value) > val) { + while (!done && (__uint_as_float(value) > val || (neg_zero && __uint_as_float(value) == 0.0f))) { done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } @@ -658,11 +664,17 @@ double atomicMin(double* addr, double val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMin(addr, val); #else + typedef union u_hold { + double a; + unsigned long long b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x8000000000000000ULL == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value > val) { + while (!done && (value > val || (neg_zero && value == 0.0))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -671,7 +683,8 @@ double atomicMin(double* addr, double val) { unsigned long long *uaddr = (unsigned long long *)addr; unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __longlong_as_double(value) > val) { + while (!done && + (__longlong_as_double(value) > val || (neg_zero && __longlong_as_double(value) == 0.0))) { done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } @@ -856,11 +869,17 @@ float atomicMax(float* addr, float val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMax(addr, val); #else + typedef union u_hold { + float a; + unsigned int b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x80000000U == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) float value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value < val) { + while (!done && (value < val || (neg_zero && value == 0.0f))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -869,7 +888,7 @@ float atomicMax(float* addr, float val) { unsigned int *uaddr = (unsigned int *)addr; unsigned int value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __uint_as_float(value) < val) { + while (!done && (__uint_as_float(value) < val || (neg_zero && __uint_as_float(value) == 0.0f))) { done = __atomic_compare_exchange_n(uaddr, &value, __float_as_uint(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } @@ -902,11 +921,17 @@ double atomicMax(double* addr, double val) { #if defined(__AMDGCN_UNSAFE_FP_ATOMICS__) return unsafeAtomicMax(addr, val); #else + typedef union u_hold { + double a; + unsigned long long b; + } u_hold_t; + u_hold_t u{val}; + bool neg_zero = 0x8000000000000000ULL == u.b; #if __has_builtin(__hip_atomic_load) && \ __has_builtin(__hip_atomic_compare_exchange_strong) double value = __hip_atomic_load(addr, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); bool done = false; - while (!done && value < val) { + while (!done && (value < val || (neg_zero && value == 0.0))) { done = __hip_atomic_compare_exchange_strong(addr, &value, val, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); } @@ -915,7 +940,8 @@ double atomicMax(double* addr, double val) { unsigned long long *uaddr = (unsigned long long *)addr; unsigned long long value = __atomic_load_n(uaddr, __ATOMIC_RELAXED); bool done = false; - while (!done && __longlong_as_double(value) < val) { + while (!done && + (__longlong_as_double(value) < val || (neg_zero && __longlong_as_double(value) == 0.0))) { done = __atomic_compare_exchange_n(uaddr, &value, __double_as_longlong(val), false, __ATOMIC_RELAXED, __ATOMIC_RELAXED); } From de1c5b2db4b25b1a7f91aff39eda7d46e0ae14d8 Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Thu, 18 Jan 2024 21:51:22 +0800 Subject: [PATCH 03/27] SWDEV-441065 - Fix device accessable host memory copy path Change-Id: Ia6b9a428da7b54819fb3be928a33cdffcd04ecdf Signed-off-by: Lang Yu --- hipamd/src/hip_memory.cpp | 121 +++++++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 33 deletions(-) diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 64ed22985..418ee0884 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -1820,28 +1820,43 @@ hipError_t ihipMemcpyDtoHCommand(amd::Command*& command, void* srcDevice, void* amd::Memory* srcMemory; amd::BufferRect srcRect; amd::BufferRect dstRect; + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset); + hipError_t status = ihipMemcpyDtoHValidate(srcDevice, dstHost, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, srcMemory, srcRect, dstRect); if (status != hipSuccess) { return status; } + amd::Coord3D srcStart(srcRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); - amd::ReadMemoryCommand* readCommand = + if (dstMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, + *srcMemory, *dstMemory, srcOrigin, dstOrigin, + copyRegion, srcRect, dstRect, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::ReadMemoryCommand* readCommand = new amd::ReadMemoryCommand(*stream, CL_COMMAND_READ_BUFFER_RECT, amd::Command::EventWaitList{}, *srcMemory, srcStart, copyRegion, dstHost, srcRect, dstRect, copyMetadata); + if (readCommand == nullptr) { + return hipErrorOutOfMemory; + } - if (readCommand == nullptr) { - return hipErrorOutOfMemory; + if (!readCommand->validatePeerMemory()) { + delete readCommand; + return hipErrorInvalidValue; + } + command = readCommand; } - if (!readCommand->validatePeerMemory()) { - delete readCommand; - return hipErrorInvalidValue; - } - command = readCommand; return hipSuccess; } @@ -1884,6 +1899,8 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo amd::Memory* dstMemory; amd::BufferRect srcRect; amd::BufferRect dstRect; + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset); hipError_t status = ihipMemcpyHtoDValidate(srcHost, dstDevice, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstRowPitch, dstSlicePitch, @@ -1891,21 +1908,33 @@ hipError_t ihipMemcpyHtoDCommand(amd::Command*& command, const void* srcHost, vo if (status != hipSuccess) { return status; } + amd::Coord3D dstStart(dstRect.start_, 0, 0); amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); - amd::WriteMemoryCommand* writeCommand = new amd::WriteMemoryCommand( + if (srcMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_BUFFER_RECT, amd::Command::EventWaitList{}, + *srcMemory, *dstMemory, srcOrigin, dstOrigin, + copyRegion, srcRect, dstRect, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::WriteMemoryCommand *writeCommand = new amd::WriteMemoryCommand( *stream, CL_COMMAND_WRITE_BUFFER_RECT, amd::Command::EventWaitList{}, *dstMemory, dstStart, copyRegion, srcHost, dstRect, srcRect, copyMetadata); + if (writeCommand == nullptr) { + return hipErrorOutOfMemory; + } - if (writeCommand == nullptr) { - return hipErrorOutOfMemory; + if (!writeCommand->validatePeerMemory()) { + delete writeCommand; + return hipErrorInvalidValue; + } + command = writeCommand; } - if (!writeCommand->validatePeerMemory()) { - delete writeCommand; - return hipErrorInvalidValue; - } - command = writeCommand; return hipSuccess; } @@ -2068,6 +2097,8 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hi hip::Stream* stream, bool isAsync = false) { amd::Image* dstImage; size_t start = 0; //!< Start offset for the copy region + size_t sOffset = 0; + amd::Memory* srcMemory = getMemoryObject(srcHost, sOffset); hipError_t status = ihipMemcpyHtoAValidate(srcHost, dstArray, srcOrigin, dstOrigin, copyRegion, srcRowPitch, srcSlicePitch, dstImage, start); @@ -2076,20 +2107,31 @@ hipError_t ihipMemcpyHtoACommand(amd::Command*& command, const void* srcHost, hi } amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); - amd::WriteMemoryCommand* writeMemCmd = new amd::WriteMemoryCommand( + if (srcMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_BUFFER_TO_IMAGE, amd::Command::EventWaitList{}, + *srcMemory, *dstImage, srcOrigin, dstOrigin, + copyRegion, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::WriteMemoryCommand* writeMemCmd = new amd::WriteMemoryCommand( *stream, CL_COMMAND_WRITE_IMAGE, amd::Command::EventWaitList{}, *dstImage, dstOrigin, copyRegion, static_cast(srcHost) + start, srcRowPitch, srcSlicePitch, copyMetadata); + if (writeMemCmd == nullptr) { + return hipErrorOutOfMemory; + } - if (writeMemCmd == nullptr) { - return hipErrorOutOfMemory; + if (!writeMemCmd->validatePeerMemory()) { + delete writeMemCmd; + return hipErrorInvalidValue; + } + command = writeMemCmd; } - if (!writeMemCmd->validatePeerMemory()) { - delete writeMemCmd; - return hipErrorInvalidValue; - } - command = writeMemCmd; return hipSuccess; } @@ -2127,8 +2169,9 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray_t srcArray, vo hip::Stream* stream, bool isAsync = false) { amd::Image* srcImage; amd::BufferRect dstRect; - amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); size_t start = 0; //!< Start offset for the copy region + size_t dOffset = 0; + amd::Memory* dstMemory = getMemoryObject(dstHost, dOffset); hipError_t status = ihipMemcpyAtoHValidate(srcArray, dstHost, srcOrigin, dstOrigin, copyRegion, dstRowPitch, dstSlicePitch, srcImage, start); @@ -2136,20 +2179,32 @@ hipError_t ihipMemcpyAtoHCommand(amd::Command*& command, hipArray_t srcArray, vo return status; } - amd::ReadMemoryCommand* readMemCmd = new amd::ReadMemoryCommand( + amd::CopyMetadata copyMetadata(isAsync, amd::CopyMetadata::CopyEnginePreference::SDMA); + if (dstMemory) { + amd::CopyMemoryCommand *copyCommand = new amd::CopyMemoryCommand( + *stream, CL_COMMAND_COPY_IMAGE_TO_BUFFER, amd::Command::EventWaitList{}, + *srcImage, *dstMemory, srcOrigin, dstOrigin, copyRegion, copyMetadata); + if (copyCommand == nullptr) { + return hipErrorOutOfMemory; + } + command = copyCommand; + } else { + amd::ReadMemoryCommand* readMemCmd = new amd::ReadMemoryCommand( *stream, CL_COMMAND_READ_IMAGE, amd::Command::EventWaitList{}, *srcImage, srcOrigin, copyRegion, static_cast(dstHost) + start, dstRowPitch, dstSlicePitch, copyMetadata); - if (readMemCmd == nullptr) { - return hipErrorOutOfMemory; - } + if (readMemCmd == nullptr) { + return hipErrorOutOfMemory; + } - if (!readMemCmd->validatePeerMemory()) { - delete readMemCmd; - return hipErrorInvalidValue; + if (!readMemCmd->validatePeerMemory()) { + delete readMemCmd; + return hipErrorInvalidValue; + } + command = readMemCmd; } - command = readMemCmd; + return hipSuccess; } From 4254e3854a180c8fd892c21c8c8c73905bcc7fbe Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Thu, 18 Jan 2024 23:24:12 +0000 Subject: [PATCH 04/27] SWDEV-441201,SWDEV-441727 - Move BlitProgram creation after trap handler build Change-Id: I89199ccc4bdf79940dc6693e07c03115c52bf613 --- rocclr/device/pal/paldevice.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/rocclr/device/pal/paldevice.cpp b/rocclr/device/pal/paldevice.cpp index 3be791151..4d9df9d11 100644 --- a/rocclr/device/pal/paldevice.cpp +++ b/rocclr/device/pal/paldevice.cpp @@ -2603,15 +2603,6 @@ bool Device::createBlitProgram() { } } - blitProgram_ = new BlitProgram(context_); - // Create blit programs - if (blitProgram_ == nullptr || !blitProgram_->create(this, extraBlits, ocl20)) { - delete blitProgram_; - blitProgram_ = nullptr; - LogError("Couldn't create blit kernels!"); - result = false; - } - if (settings().useLightning_) { const std::string TrapHandlerAsm = TrapHandlerCode; // Create a program for trap handler @@ -2637,6 +2628,15 @@ bool Device::createBlitProgram() { DevLogPrintfError("Trap handler creation failed\n"); } } + + blitProgram_ = new BlitProgram(context_); + // Create blit programs + if (blitProgram_ == nullptr || !blitProgram_->create(this, extraBlits, ocl20)) { + delete blitProgram_; + blitProgram_ = nullptr; + LogError("Couldn't create blit kernels!"); + result = false; + } return result; } From 7edfd70d0f45566caa33fbe171b89af7388938a9 Mon Sep 17 00:00:00 2001 From: Lang Yu Date: Wed, 3 Jan 2024 08:51:02 +0800 Subject: [PATCH 05/27] SWDEV-439419 - Unlock host ptr when device ptr equals host ptr hsa_amd_memory_lock_to_pool() and hsa_amd_memory_unlock() should be called balanced. Change-Id: I8b1549861bff752aabbb6399d717d1e346079a38 Signed-off-by: Lang Yu --- rocclr/device/rocm/rocmemory.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/rocclr/device/rocm/rocmemory.cpp b/rocclr/device/rocm/rocmemory.cpp index 2ef82fec0..9887d2333 100644 --- a/rocclr/device/rocm/rocmemory.cpp +++ b/rocclr/device/rocm/rocmemory.cpp @@ -682,15 +682,12 @@ void Buffer::destroy() { } if (deviceMemory_ != nullptr) { + bool needUnlockHostMem = false; if (deviceMemory_ != owner()->getHostMem()) { // if they are identical, the host pointer will be // deallocated later on => avoid double deallocation if (isHostMemDirectAccess()) { - if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { - if (dev().agent_profile() != HSA_PROFILE_FULL) { - hsa_amd_memory_unlock(owner()->getHostMem()); - } - } + needUnlockHostMem = true; } else { dev().memFree(deviceMemory_, size()); const_cast(dev()).updateFreeMemory(size(), true); @@ -705,6 +702,15 @@ void Buffer::destroy() { } else if ((memFlags & CL_MEM_ALLOC_HOST_PTR) && (owner()->getContext().devices().size() == 1)) { dev().hostFree(deviceMemory_, size()); + } else if (isHostMemDirectAccess()) { + needUnlockHostMem = true; + } + } + + if (needUnlockHostMem) { + if (memFlags & (CL_MEM_USE_HOST_PTR | CL_MEM_ALLOC_HOST_PTR)) { + if (dev().agent_profile() != HSA_PROFILE_FULL) + hsa_amd_memory_unlock(owner()->getHostMem()); } } } From 551cdcd4cfc1a9bb8b8ebfd315320629baf05655 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Thu, 18 Jan 2024 00:44:39 +0000 Subject: [PATCH 06/27] SWDEV-425605 - Add new comgr compile to reloc use AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE action to compile source to realoc. Currently we have source->bc, link->bc and bc->realoc. This new action replaces the three steps with one. Change-Id: I6ba551b8d04c7e06f41c4324026e4dcd2db1970f --- hipamd/src/hiprtc/hiprtcComgrHelper.cpp | 90 +++++++++++++++++++++++-- hipamd/src/hiprtc/hiprtcComgrHelper.hpp | 7 +- hipamd/src/hiprtc/hiprtcInternal.cpp | 76 +++------------------ hipamd/src/hiprtc/hiprtcInternal.hpp | 1 - 4 files changed, 100 insertions(+), 74 deletions(-) diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp index debecbe91..ce039ac4c 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp @@ -585,6 +585,86 @@ bool createAction(amd_comgr_action_info_t& action, std::vector& opt return AMD_COMGR_STATUS_SUCCESS; } +bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, + std::vector& compileOptions, std::string& buildLog, + std::vector& exe) { + amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_HIP; + amd_comgr_action_info_t action; + amd_comgr_data_set_t reloc; + amd_comgr_data_set_t output; + amd_comgr_data_set_t input = compileInputs; + + if (auto res = createAction(action, compileOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { + return false; + } + + if (auto res = amd::Comgr::create_data_set(&reloc); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + return false; + } + + if (auto res = amd::Comgr::create_data_set(&output); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE, action, + input, reloc); + res != AMD_COMGR_STATUS_SUCCESS) { + extractBuildLog(reloc, buildLog); + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + if (!extractBuildLog(reloc, buildLog)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + amd::Comgr::destroy_action_info(action); + if (auto res = createAction(action, compileOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, action, + reloc, output); + res != AMD_COMGR_STATUS_SUCCESS) { + extractBuildLog(output, buildLog); + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (!extractBuildLog(output, buildLog)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (!extractByteCodeBinary(output, AMD_COMGR_DATA_KIND_EXECUTABLE, exe)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + // Clean up + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return true; +} + bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode) { @@ -646,8 +726,7 @@ bool linkLLVMBitcode(const amd_comgr_data_set_t linkInputs, const std::string& i return false; } - if (auto res = - amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); res != AMD_COMGR_STATUS_SUCCESS) { amd::Comgr::destroy_action_info(action); amd::Comgr::destroy_data_set(output); @@ -915,9 +994,9 @@ bool fillMangledNames(std::vector& dataVec, std::map(it.first.data()); + char* data = const_cast(it.first.data()); if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, NULL)) { amd::Comgr::release_data(dataObject); @@ -925,7 +1004,8 @@ bool fillMangledNames(std::vector& dataVec, std::map mName(new char[Size]()); - if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { + if (auto res = + amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { amd::Comgr::release_data(dataObject); return false; } diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp index d34c9264c..ff51f668a 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp @@ -41,6 +41,9 @@ bool extractByteCodeBinary(const amd_comgr_data_set_t inDataSet, bool createAction(amd_comgr_action_info_t& action, std::vector& options, const std::string& isa, const amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_NONE); +bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, + std::vector& compileOptions, std::string& buildLog, + std::vector& exe); bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode); @@ -54,8 +57,8 @@ bool dumpIsaFromBC(const amd_comgr_data_set_t isaInputs, const std::string& isa, std::vector& exeOptions, std::string name, std::string& buildLog); bool demangleName(const std::string& mangledName, std::string& demangledName); std::string handleMangledName(std::string loweredName); -bool fillMangledNames(std::vector& executable, std::map& mangledNames, - bool isBitcode); +bool fillMangledNames(std::vector& executable, + std::map& mangledNames, bool isBitcode); void GenerateUniqueFileName(std::string& name); } // namespace helpers } // namespace hiprtc diff --git a/hipamd/src/hiprtc/hiprtcInternal.cpp b/hipamd/src/hiprtc/hiprtcInternal.cpp index 46a034c82..1323dc914 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.cpp +++ b/hipamd/src/hiprtc/hiprtcInternal.cpp @@ -116,7 +116,6 @@ bool RTCProgram::findIsa() { // RTC Compile Program Member Functions void RTCProgram::AppendOptions(const std::string app_env_var, std::vector* options) { - if (options == nullptr) { LogError("Append options passed is nullptr."); return; @@ -261,10 +260,6 @@ bool RTCCompileProgram::transformOptions(std::vector& compile_optio i = "--offload-arch=" + val; continue; } - if (i == "--save-temps") { - settings_.dumpISA = true; - continue; - } } // Removed consumed options @@ -300,78 +295,27 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg compileOpts.reserve(compile_options_.size() + options.size() + 2); compileOpts.insert(compileOpts.end(), options.begin(), options.end()); - if (!fgpu_rdc_) { - compileOpts.push_back("-Xclang"); - compileOpts.push_back("-disable-llvm-passes"); - } - if (!transformOptions(compileOpts)) { LogError("Error in hiprtc: unable to transform options"); return false; } - if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { - LogError("Error in hiprtc: unable to compile source to bitcode"); - return false; - } - - if (fgpu_rdc_ && !mangled_names_.empty()) { - if (!fillMangledNames(LLVMBitcode_, mangled_names_, true)) { - LogError("Error in hiprtc: unable to fill mangled names"); + if (fgpu_rdc_) { + if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { + LogError("Error in hiprtc: unable to compile source to bitcode"); return false; } - - return true; - } - - std::string linkFileName = "linked"; - if (!addCodeObjData(link_input_, LLVMBitcode_, linkFileName, AMD_COMGR_DATA_KIND_BC)) { - LogError("Error in hiprtc: unable to add linked code object"); - return false; - } - - std::vector LinkedLLVMBitcode; - if (!linkLLVMBitcode(link_input_, isa_, link_options_, build_log_, LinkedLLVMBitcode)) { - LogError("Error in hiprtc: unable to add device libs to linked bitcode"); - return false; - } - - std::string linkedFileName = "LLVMBitcode.bc"; - if (!addCodeObjData(exec_input_, LinkedLLVMBitcode, linkedFileName, AMD_COMGR_DATA_KIND_BC)) { - LogError("Error in hiprtc: unable to add device libs linked code object"); - return false; - } - - std::vector exe_options; - // Find the options passed by the app which can be used during BC to Relocatable phase. - if (!findExeOptions(options, exe_options)) { - LogError("Error in hiprtc: unable to find executable options"); - return false; - } - - std::vector exeOpts(exe_options_); - exeOpts.reserve(exeOpts.size() + exe_options.size() + 2); - // Add these below options by default for optimizations during BC to Relocatable phase. - exeOpts.push_back("-mllvm"); - exeOpts.push_back("-amdgpu-internalize-symbols"); - // User provided options are appended at the end since they can override the above - // default options if necessary - exeOpts.insert(exeOpts.end(), exe_options.begin(), exe_options.end()); - - if (settings_.dumpISA) { - if (!dumpIsaFromBC(exec_input_, isa_, exeOpts, name_, build_log_)) { - LogError("Error in hiprtc: unable to dump isa code"); + } else { + LogInfo("Using the new path of comgr"); + if (!compileToExecutable(compile_input_, isa_, compileOpts, build_log_, executable_)) { + LogError("Failing to compile to realloc"); return false; } } - if (!createExecutable(exec_input_, isa_, exeOpts, build_log_, executable_)) { - LogError("Error in hiprtc: unable to create executable"); - return false; - } - if (!mangled_names_.empty()) { - if (!fillMangledNames(executable_, mangled_names_, false)) { + auto& compile_step_output = fgpu_rdc_ ? LLVMBitcode_ : executable_; + if (!fillMangledNames(compile_step_output, mangled_names_, fgpu_rdc_)) { LogError("Error in hiprtc: unable to fill mangled names"); return false; } @@ -380,6 +324,7 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg return true; } + void RTCCompileProgram::stripNamedExpression(std::string& strippedName) { if (strippedName.back() == ')') { strippedName.pop_back(); @@ -453,7 +398,6 @@ RTCLinkProgram::RTCLinkProgram(std::string name) : RTCProgram(name) { bool RTCLinkProgram::AddLinkerOptions(unsigned int num_options, hiprtcJIT_option* options_ptr, void** options_vals_ptr) { for (size_t opt_idx = 0; opt_idx < num_options; ++opt_idx) { - switch (options_ptr[opt_idx]) { case HIPRTC_JIT_MAX_REGISTERS: link_args_.max_registers_ = *(reinterpret_cast(&options_vals_ptr[opt_idx])); diff --git a/hipamd/src/hiprtc/hiprtcInternal.hpp b/hipamd/src/hiprtc/hiprtcInternal.hpp index a1965d1b1..7e4d12399 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.hpp +++ b/hipamd/src/hiprtc/hiprtcInternal.hpp @@ -107,7 +107,6 @@ static void crashWithMessage(std::string message) { } struct Settings { - bool dumpISA{false}; bool offloadArchProvided{false}; }; From 251727e2c9bb01d67195173d72a70de90fa67d56 Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Fri, 8 Dec 2023 07:29:57 +0000 Subject: [PATCH 07/27] SWDEV-422207 - Added debug env to dump graph during Instantiation Change-Id: Ibde2ae5b8d240f3986bcd168facc513a319c0f17 --- hipamd/src/hip_graph.cpp | 19 +++++++++++++++---- rocclr/utils/flags.hpp | 2 ++ 2 files changed, 17 insertions(+), 4 deletions(-) diff --git a/hipamd/src/hip_graph.cpp b/hipamd/src/hip_graph.cpp index 7cfb1babd..9e35fe037 100644 --- a/hipamd/src/hip_graph.cpp +++ b/hipamd/src/hip_graph.cpp @@ -32,6 +32,7 @@ std::vector g_captureStreams; amd::Monitor g_captureStreamsLock{"StreamCaptureGlobalList"}; amd::Monitor g_streamSetLock{"StreamCaptureset"}; std::unordered_set g_allCapturingStreams; +hipError_t ihipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags); inline hipError_t ihipGraphAddNode(hip::GraphNode* graphNode, hip::Graph* graph, hip::GraphNode* const* pDependencies, size_t numDependencies, @@ -1223,6 +1224,16 @@ hipError_t ihipGraphInstantiate(hip::GraphExec** pGraphExec, hip::Graph* graph, flags); if (*pGraphExec != nullptr) { graph->SetGraphInstantiated(true); + if (DEBUG_HIP_GRAPH_DOT_PRINT) { + static int i = 1; + std::string filename = + "graph_" + std::to_string(amd::Os::getProcessId()) + "_dot_print_" + std::to_string(i++); + hipError_t status = + ihipGraphDebugDotPrint(reinterpret_cast(graph), filename.c_str(), 0); + if (status == hipSuccess) { + LogPrintfInfo("[hipGraph] graph dump:%s", filename.c_str()); + } + } return (*pGraphExec)->Init(); } else { return hipErrorOutOfMemory; @@ -2550,13 +2561,10 @@ hipError_t hipGraphKernelNodeCopyAttributes(hipGraphNode_t hSrc, hipGraphNode_t } hipError_t ihipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags) { - if (graph == nullptr || path == nullptr) { - return hipErrorInvalidValue; - } std::ofstream fout; fout.open(path, std::ios::out); if (fout.fail()) { - ClPrint(amd::LOG_INFO, amd::LOG_API, "[hipGraph] Error during opening of file : %s", path); + LogPrintfError("[hipGraph] Error during opening of file : %s", path); return hipErrorOperatingSystem; } fout << "digraph dot {" << std::endl; @@ -2568,6 +2576,9 @@ hipError_t ihipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned i hipError_t hipGraphDebugDotPrint(hipGraph_t graph, const char* path, unsigned int flags) { HIP_INIT_API(hipGraphDebugDotPrint, graph, path, flags); + if (graph == nullptr || path == nullptr) { + return hipErrorInvalidValue; + } HIP_RETURN(ihipGraphDebugDotPrint(graph, path, flags)); } diff --git a/rocclr/utils/flags.hpp b/rocclr/utils/flags.hpp index 51ebdd0d3..cd1f40c42 100644 --- a/rocclr/utils/flags.hpp +++ b/rocclr/utils/flags.hpp @@ -245,6 +245,8 @@ release(cstring, HIPRTC_LINK_OPTIONS_APPEND, "", \ "Set link options needed for hiprtc compilation") \ release(bool, HIP_VMEM_MANAGE_SUPPORT, true, \ "Virtual Memory Management Support") \ +release(bool, DEBUG_HIP_GRAPH_DOT_PRINT, false, \ + "Enable/Disable graph debug dot print dump") \ namespace amd { From 28565666b7de1c53330a515a20cc211c7afea3bf Mon Sep 17 00:00:00 2001 From: kjayapra-amd Date: Fri, 15 Dec 2023 10:40:18 -0500 Subject: [PATCH 08/27] SWDEV-437832 - Adding device property to check if the device is accelerator. Change-Id: I8349e99c03422c268bbb60a8c143bd492d9cec09 --- hipamd/src/hip_device.cpp | 2 ++ rocclr/device/device.hpp | 2 ++ rocclr/device/rocm/rocdevice.cpp | 13 +++++++++++++ 3 files changed, 17 insertions(+) diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index 5657fd637..2053461af 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -463,6 +463,8 @@ hipError_t ihipGetDeviceProperties(hipDeviceProp_tR0600* props, int device) { deviceProps.timelineSemaphoreInteropSupported = 0; deviceProps.unifiedFunctionPointers = 0; + deviceProps.integrated = info.accelerator_; + *props = deviceProps; return hipSuccess; } diff --git a/rocclr/device/device.hpp b/rocclr/device/device.hpp index d00dfb4f6..2dc7b09ee 100644 --- a/rocclr/device/device.hpp +++ b/rocclr/device/device.hpp @@ -624,6 +624,8 @@ struct Info : public amd::EmbeddedObject { //! global CU mask which will be applied to all queues created on this device std::vector globalCUMask_; + bool accelerator_; //!< Accelerator or discrete graphics card. + //! AQL Barrier Value Packet support bool aqlBarrierValue_; diff --git a/rocclr/device/rocm/rocdevice.cpp b/rocclr/device/rocm/rocdevice.cpp index 3b6b4d4d5..8758c6291 100644 --- a/rocclr/device/rocm/rocdevice.cpp +++ b/rocclr/device/rocm/rocdevice.cpp @@ -1836,6 +1836,19 @@ bool Device::populateOCLDeviceConstants() { std::numeric_limits::max(); // gfx10+ does not share SGPRs between waves } + uint8_t memory_properties[8]; + // Get the memory property from ROCr. + if (HSA_STATUS_SUCCESS != hsa_agent_get_info(bkendDevice_, + (hsa_agent_info_t) HSA_AMD_AGENT_INFO_MEMORY_PROPERTIES, + memory_properties)) { + LogError("HSA_AGENT_INFO_AMD_MEMORY_PROPERTIES query failed"); + } + + // Check if the device is APU + if (hsa_flag_isset64(memory_properties, HSA_AMD_MEMORY_PROPERTY_AGENT_IS_APU)) { + info_.accelerator_ = 1; + } + return true; } From fdcd9d5b8e3c8f46f5ad2b62cbeeffd3b38bff53 Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Fri, 19 Jan 2024 15:56:15 +0000 Subject: [PATCH 09/27] SWDEV-442126 - Fix use_after_free case in ExtractFatBinaryUsingCOMGR Change-Id: Ie07cf82025b65e4f95d4ce3cf46a6b8081f29b6a --- hipamd/src/hip_fatbin.cpp | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/hipamd/src/hip_fatbin.cpp b/hipamd/src/hip_fatbin.cpp index 9d5591d05..5d0d00ac9 100644 --- a/hipamd/src/hip_fatbin.cpp +++ b/hipamd/src/hip_fatbin.cpp @@ -115,7 +115,7 @@ void ListAllDeviceWithNoCOFromBundle(const std::unordered_map& devices) { - amd_comgr_data_t data_object; + amd_comgr_data_t data_object {0}; amd_comgr_status_t comgr_status = AMD_COMGR_STATUS_SUCCESS; hipError_t hip_status = hipSuccess; amd_comgr_code_object_info_t* query_list_array = nullptr; @@ -266,12 +266,6 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vectordeviceId()]->program_ = new amd::Program(*(device->asContext())); } - - if ((comgr_status = amd_comgr_release_data(data_object)) != AMD_COMGR_STATUS_SUCCESS) { - LogPrintfError("Releasing COMGR data failed with status %d ", comgr_status); - return hipErrorInvalidValue; - } - } while(0); // Clean up file and memory resouces if hip_status failed for some reason. @@ -292,7 +286,9 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector Date: Mon, 22 Jan 2024 11:49:01 -0500 Subject: [PATCH 10/27] Revert "SWDEV-425605 - Add new comgr compile to reloc" This reverts commit 551cdcd4cfc1a9bb8b8ebfd315320629baf05655. Reason for revert: breaks windows-psdb Change-Id: I8b2fcaf0d0be09a9364e386127cabafb68a67e0d --- hipamd/src/hiprtc/hiprtcComgrHelper.cpp | 90 ++----------------------- hipamd/src/hiprtc/hiprtcComgrHelper.hpp | 7 +- hipamd/src/hiprtc/hiprtcInternal.cpp | 76 ++++++++++++++++++--- hipamd/src/hiprtc/hiprtcInternal.hpp | 1 + 4 files changed, 74 insertions(+), 100 deletions(-) diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp index ce039ac4c..debecbe91 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp @@ -585,86 +585,6 @@ bool createAction(amd_comgr_action_info_t& action, std::vector& opt return AMD_COMGR_STATUS_SUCCESS; } -bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, - std::vector& compileOptions, std::string& buildLog, - std::vector& exe) { - amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_HIP; - amd_comgr_action_info_t action; - amd_comgr_data_set_t reloc; - amd_comgr_data_set_t output; - amd_comgr_data_set_t input = compileInputs; - - if (auto res = createAction(action, compileOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { - return false; - } - - if (auto res = amd::Comgr::create_data_set(&reloc); res != AMD_COMGR_STATUS_SUCCESS) { - amd::Comgr::destroy_action_info(action); - return false; - } - - if (auto res = amd::Comgr::create_data_set(&output); res != AMD_COMGR_STATUS_SUCCESS) { - amd::Comgr::destroy_action_info(action); - amd::Comgr::destroy_data_set(reloc); - return false; - } - - if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE, action, - input, reloc); - res != AMD_COMGR_STATUS_SUCCESS) { - extractBuildLog(reloc, buildLog); - amd::Comgr::destroy_action_info(action); - amd::Comgr::destroy_data_set(reloc); - amd::Comgr::destroy_data_set(output); - return false; - } - - if (!extractBuildLog(reloc, buildLog)) { - amd::Comgr::destroy_action_info(action); - amd::Comgr::destroy_data_set(reloc); - amd::Comgr::destroy_data_set(output); - return false; - } - - amd::Comgr::destroy_action_info(action); - if (auto res = createAction(action, compileOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { - amd::Comgr::destroy_action_info(action); - amd::Comgr::destroy_data_set(reloc); - amd::Comgr::destroy_data_set(output); - return false; - } - - if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, action, - reloc, output); - res != AMD_COMGR_STATUS_SUCCESS) { - extractBuildLog(output, buildLog); - amd::Comgr::destroy_action_info(action); - amd::Comgr::destroy_data_set(output); - amd::Comgr::destroy_data_set(reloc); - return false; - } - - if (!extractBuildLog(output, buildLog)) { - amd::Comgr::destroy_action_info(action); - amd::Comgr::destroy_data_set(output); - amd::Comgr::destroy_data_set(reloc); - return false; - } - - if (!extractByteCodeBinary(output, AMD_COMGR_DATA_KIND_EXECUTABLE, exe)) { - amd::Comgr::destroy_action_info(action); - amd::Comgr::destroy_data_set(output); - amd::Comgr::destroy_data_set(reloc); - return false; - } - - // Clean up - amd::Comgr::destroy_action_info(action); - amd::Comgr::destroy_data_set(output); - amd::Comgr::destroy_data_set(reloc); - return true; -} - bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode) { @@ -726,7 +646,8 @@ bool linkLLVMBitcode(const amd_comgr_data_set_t linkInputs, const std::string& i return false; } - if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); + if (auto res = + amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); res != AMD_COMGR_STATUS_SUCCESS) { amd::Comgr::destroy_action_info(action); amd::Comgr::destroy_data_set(output); @@ -994,9 +915,9 @@ bool fillMangledNames(std::vector& dataVec, std::map(it.first.data()); + char *data = const_cast(it.first.data()); if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, NULL)) { amd::Comgr::release_data(dataObject); @@ -1004,8 +925,7 @@ bool fillMangledNames(std::vector& dataVec, std::map mName(new char[Size]()); - if (auto res = - amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { + if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { amd::Comgr::release_data(dataObject); return false; } diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp index ff51f668a..d34c9264c 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp @@ -41,9 +41,6 @@ bool extractByteCodeBinary(const amd_comgr_data_set_t inDataSet, bool createAction(amd_comgr_action_info_t& action, std::vector& options, const std::string& isa, const amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_NONE); -bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, - std::vector& compileOptions, std::string& buildLog, - std::vector& exe); bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode); @@ -57,8 +54,8 @@ bool dumpIsaFromBC(const amd_comgr_data_set_t isaInputs, const std::string& isa, std::vector& exeOptions, std::string name, std::string& buildLog); bool demangleName(const std::string& mangledName, std::string& demangledName); std::string handleMangledName(std::string loweredName); -bool fillMangledNames(std::vector& executable, - std::map& mangledNames, bool isBitcode); +bool fillMangledNames(std::vector& executable, std::map& mangledNames, + bool isBitcode); void GenerateUniqueFileName(std::string& name); } // namespace helpers } // namespace hiprtc diff --git a/hipamd/src/hiprtc/hiprtcInternal.cpp b/hipamd/src/hiprtc/hiprtcInternal.cpp index 1323dc914..46a034c82 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.cpp +++ b/hipamd/src/hiprtc/hiprtcInternal.cpp @@ -116,6 +116,7 @@ bool RTCProgram::findIsa() { // RTC Compile Program Member Functions void RTCProgram::AppendOptions(const std::string app_env_var, std::vector* options) { + if (options == nullptr) { LogError("Append options passed is nullptr."); return; @@ -260,6 +261,10 @@ bool RTCCompileProgram::transformOptions(std::vector& compile_optio i = "--offload-arch=" + val; continue; } + if (i == "--save-temps") { + settings_.dumpISA = true; + continue; + } } // Removed consumed options @@ -295,27 +300,78 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg compileOpts.reserve(compile_options_.size() + options.size() + 2); compileOpts.insert(compileOpts.end(), options.begin(), options.end()); + if (!fgpu_rdc_) { + compileOpts.push_back("-Xclang"); + compileOpts.push_back("-disable-llvm-passes"); + } + if (!transformOptions(compileOpts)) { LogError("Error in hiprtc: unable to transform options"); return false; } - if (fgpu_rdc_) { - if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { - LogError("Error in hiprtc: unable to compile source to bitcode"); + if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { + LogError("Error in hiprtc: unable to compile source to bitcode"); + return false; + } + + if (fgpu_rdc_ && !mangled_names_.empty()) { + if (!fillMangledNames(LLVMBitcode_, mangled_names_, true)) { + LogError("Error in hiprtc: unable to fill mangled names"); return false; } - } else { - LogInfo("Using the new path of comgr"); - if (!compileToExecutable(compile_input_, isa_, compileOpts, build_log_, executable_)) { - LogError("Failing to compile to realloc"); + + return true; + } + + std::string linkFileName = "linked"; + if (!addCodeObjData(link_input_, LLVMBitcode_, linkFileName, AMD_COMGR_DATA_KIND_BC)) { + LogError("Error in hiprtc: unable to add linked code object"); + return false; + } + + std::vector LinkedLLVMBitcode; + if (!linkLLVMBitcode(link_input_, isa_, link_options_, build_log_, LinkedLLVMBitcode)) { + LogError("Error in hiprtc: unable to add device libs to linked bitcode"); + return false; + } + + std::string linkedFileName = "LLVMBitcode.bc"; + if (!addCodeObjData(exec_input_, LinkedLLVMBitcode, linkedFileName, AMD_COMGR_DATA_KIND_BC)) { + LogError("Error in hiprtc: unable to add device libs linked code object"); + return false; + } + + std::vector exe_options; + // Find the options passed by the app which can be used during BC to Relocatable phase. + if (!findExeOptions(options, exe_options)) { + LogError("Error in hiprtc: unable to find executable options"); + return false; + } + + std::vector exeOpts(exe_options_); + exeOpts.reserve(exeOpts.size() + exe_options.size() + 2); + // Add these below options by default for optimizations during BC to Relocatable phase. + exeOpts.push_back("-mllvm"); + exeOpts.push_back("-amdgpu-internalize-symbols"); + // User provided options are appended at the end since they can override the above + // default options if necessary + exeOpts.insert(exeOpts.end(), exe_options.begin(), exe_options.end()); + + if (settings_.dumpISA) { + if (!dumpIsaFromBC(exec_input_, isa_, exeOpts, name_, build_log_)) { + LogError("Error in hiprtc: unable to dump isa code"); return false; } } + if (!createExecutable(exec_input_, isa_, exeOpts, build_log_, executable_)) { + LogError("Error in hiprtc: unable to create executable"); + return false; + } + if (!mangled_names_.empty()) { - auto& compile_step_output = fgpu_rdc_ ? LLVMBitcode_ : executable_; - if (!fillMangledNames(compile_step_output, mangled_names_, fgpu_rdc_)) { + if (!fillMangledNames(executable_, mangled_names_, false)) { LogError("Error in hiprtc: unable to fill mangled names"); return false; } @@ -324,7 +380,6 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg return true; } - void RTCCompileProgram::stripNamedExpression(std::string& strippedName) { if (strippedName.back() == ')') { strippedName.pop_back(); @@ -398,6 +453,7 @@ RTCLinkProgram::RTCLinkProgram(std::string name) : RTCProgram(name) { bool RTCLinkProgram::AddLinkerOptions(unsigned int num_options, hiprtcJIT_option* options_ptr, void** options_vals_ptr) { for (size_t opt_idx = 0; opt_idx < num_options; ++opt_idx) { + switch (options_ptr[opt_idx]) { case HIPRTC_JIT_MAX_REGISTERS: link_args_.max_registers_ = *(reinterpret_cast(&options_vals_ptr[opt_idx])); diff --git a/hipamd/src/hiprtc/hiprtcInternal.hpp b/hipamd/src/hiprtc/hiprtcInternal.hpp index 7e4d12399..a1965d1b1 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.hpp +++ b/hipamd/src/hiprtc/hiprtcInternal.hpp @@ -107,6 +107,7 @@ static void crashWithMessage(std::string message) { } struct Settings { + bool dumpISA{false}; bool offloadArchProvided{false}; }; From 77e059ca0dbf14ff7591c043fb9b89cb3079eca5 Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Mon, 22 Jan 2024 16:58:12 +0000 Subject: [PATCH 11/27] SWDEV-433312 - Return invalidArgument in cuMemSetAccess for hipMemAccessFlagsProtRead Change-Id: I8e5c460ac310ad32849359720a40fc5131d4f32c --- hipamd/src/hip_vm.cpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/hipamd/src/hip_vm.cpp b/hipamd/src/hip_vm.cpp index 2335c426b..031c00183 100644 --- a/hipamd/src/hip_vm.cpp +++ b/hipamd/src/hip_vm.cpp @@ -288,6 +288,10 @@ hipError_t hipMemSetAccess(void* ptr, size_t size, const hipMemAccessDesc* desc, HIP_RETURN(hipErrorInvalidValue) } + if (desc[desc_idx].flags == hipMemAccessFlagsProtRead) { + HIP_RETURN(hipErrorInvalidValue) + } + auto& dev = g_devices[desc[desc_idx].location.id]; amd::Device::VmmAccess access_flags = static_cast(desc[desc_idx].flags); From b181dbb094ac5d5328b81775afeaa58a88be1229 Mon Sep 17 00:00:00 2001 From: Vikram Date: Thu, 11 Jan 2024 11:48:51 -0500 Subject: [PATCH 12/27] SWDEV-420140 - Prefer forward slashes for path seperators in compiler RT path with windows Certain build systems such as NMake seem to interpret backslashes as escapes. Change-Id: Ifc04d9427148e83084efb1af94a1ce85cb2a6c4e --- hipamd/hip-config-amd.cmake | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/hipamd/hip-config-amd.cmake b/hipamd/hip-config-amd.cmake index b27029be8..00f05dc9c 100755 --- a/hipamd/hip-config-amd.cmake +++ b/hipamd/hip-config-amd.cmake @@ -178,6 +178,12 @@ if( CLANGRT_Error ) else() # Add support for __fp16 and _Float16, explicitly link with compiler-rt if( "${CLANGRT_BUILTINS_FETCH_EXIT_CODE}" STREQUAL "0" ) + # The HIP_CXX_COMPILER by default prefers backward slashes for path seperators on windows. + # Prefer forward slashes here to avoid escaping issues on certain build systems. + if(WIN32) + string(REPLACE "\\" "/" CLANGRT_BUILTINS ${CLANGRT_BUILTINS}) + endif() + # CLANG_RT Builtins found Successfully Set interface link libraries property set_property(TARGET hip::host APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}") set_property(TARGET hip::device APPEND PROPERTY INTERFACE_LINK_LIBRARIES "${CLANGRT_BUILTINS}") From 4440bac22d79573b0911a149b9ef6978fad5f2d8 Mon Sep 17 00:00:00 2001 From: Ioannis Assiouras Date: Mon, 22 Jan 2024 13:51:37 +0000 Subject: [PATCH 13/27] SWDEV-442421 - Fixed case where hipIpcGetMemHandle erroneously returns hipSuccess Change-Id: Ie322a1a0165111d2ca129e68f49b54fe93aa088e --- rocclr/device/device.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/rocclr/device/device.cpp b/rocclr/device/device.cpp index b9640e890..c52c2e042 100644 --- a/rocclr/device/device.cpp +++ b/rocclr/device/device.cpp @@ -855,7 +855,7 @@ bool Device::IpcCreate(void* dev_ptr, size_t* mem_size, void* handle, size_t* me auto dev_mem = static_cast(amd_mem_obj->getDeviceMemory(*this)); auto result = dev_mem->ExportHandle(handle); - return true; + return result; } // ================================================================================================ From c87ac8c513e06793376a49b4f9e37e39bbdbf9eb Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Mon, 22 Jan 2024 16:00:33 -0500 Subject: [PATCH 14/27] SWDEV-311271 - Add extra logic to reduce memory usage Add logic to prevent significant grow of reserved memory in mempool. Change-Id: If034433c26e904e3037200a593ef338896d7f16d --- hipamd/src/hip_mempool_impl.cpp | 33 +++++++++++++++++++++++++++------ 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/hipamd/src/hip_mempool_impl.cpp b/hipamd/src/hip_mempool_impl.cpp index 6971c0fa9..3afe065bb 100644 --- a/hipamd/src/hip_mempool_impl.cpp +++ b/hipamd/src/hip_mempool_impl.cpp @@ -44,18 +44,21 @@ void Heap::AddMemory(amd::Memory* memory, const MemoryTimestamp& ts) { amd::Memory* Heap::FindMemory(size_t size, hip::Stream* stream, bool opportunistic, void* dptr) { amd::Memory* memory = nullptr; auto start = allocations_.lower_bound({size, nullptr}); - // Runtime can accept an allocation with 12.5% on the size threshold - uint32_t i = 0; - for (auto it = start; (it != allocations_.end()) && (it->first.first <= (size / 8.0) * 9);) { - i++; + for (auto it = start; it != allocations_.end();) { bool check_address = (dptr == nullptr); if (it->first.second->getSvmPtr() == dptr) { // If the search is done for the specified address then runtime must wait it->second.Wait(); check_address = true; } + // Runtime can accept an allocation with 12.5% on the size threshold + bool opp_mode = opportunistic; + if (it->first.first > (size / 8.0) * 9) { + // Disable opportunistic mode for more aggressive search + opp_mode = false; + } // Check if size can match and it's safe to use this resource. - if (check_address && (it->second.IsSafeFind(stream, opportunistic))) { + if (check_address && (it->second.IsSafeFind(stream, opp_mode))) { memory = it->first.second; total_size_ -= memory->getSize(); // Remove found allocation from the map @@ -219,6 +222,22 @@ void* MemoryPool::AllocateMemory(size_t size, hip::Stream* stream, void* dptr) { bool MemoryPool::FreeMemory(amd::Memory* memory, hip::Stream* stream) { amd::ScopedLock lock(lock_pool_ops_); + // If the free heap grows over the busy heap, then force release + if (free_heap_.GetTotalSize() > busy_heap_.GetTotalSize()) { + // Use event base release to reduce memory pressure + constexpr size_t kBytesToHold = 0; + free_heap_.ReleaseAllMemory(kBytesToHold); + + // If free mmeory is less than 12.5% of total, then force wait release + size_t free = 0; + size_t total = 0; + hipError_t err = hipMemGetInfo(&free, &total); + if ((err == hipSuccess) && (free < (total >> 3))) { + constexpr bool kSafeRelease = true; + free_heap_.ReleaseAllMemory(free_heap_.GetTotalSize() >> 1, kSafeRelease); + } + } + MemoryTimestamp ts; // Remove memory object from the busy pool if (!busy_heap_.RemoveMemory(memory, &ts)) { @@ -327,6 +346,7 @@ hipError_t MemoryPool::SetAttribute(hipMemPoolAttr attr, void* value) { return hipErrorInvalidValue; } free_heap_.SetMaxTotalSize(reset); + busy_heap_.SetMaxTotalSize(reset); break; case hipMemPoolAttrUsedMemCurrent: // Should be GetAttribute only @@ -372,7 +392,8 @@ hipError_t MemoryPool::GetAttribute(hipMemPoolAttr attr, void* value) { break; case hipMemPoolAttrReservedMemHigh: // High watermark of all allocated memory in OS, since the last reset - *reinterpret_cast(value) = busy_heap_.GetTotalSize() + free_heap_.GetMaxTotalSize(); + *reinterpret_cast(value) = busy_heap_.GetMaxTotalSize() + + free_heap_.GetMaxTotalSize(); break; case hipMemPoolAttrUsedMemCurrent: // Total currently used memory by the pool From 1416b9f5302aeaa7bfea1b7985bb3a0f8676dc8e Mon Sep 17 00:00:00 2001 From: Alex Xie Date: Wed, 29 Nov 2023 16:26:40 -0500 Subject: [PATCH 15/27] SWDEV-433820 - Optimize queue initialization (ROCM) Change-Id: I0619a9f66ae3d64a0f3fc36384f46adb302e725d --- rocclr/device/rocm/rocvirtual.cpp | 55 ++++++++++++++----------------- 1 file changed, 24 insertions(+), 31 deletions(-) diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 0046f130b..74bc741db 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -2857,50 +2857,43 @@ bool VirtualGPU::createVirtualQueue(uint deviceQueueSize) } uint64_t vqVA = reinterpret_cast(vqMem->getDeviceMemory()); - uint64_t pattern = 0; - amd::Coord3D origin(0, 0, 0); - amd::Coord3D region(virtualQueue_->getSize(), 1, 1); - if (!dev().xferMgr().fillBuffer(*vqMem, &pattern, sizeof(pattern), region, origin, region)) { - return false; - } + // Use shadow to prepare the data structure in host. + auto shadow = std::make_unique(allocSize); + + std::memset(&shadow[0], 0, allocSize); - AmdVQueueHeader header = {}; + AmdVQueueHeader* header = reinterpret_cast(&shadow[0]); // Initialize the virtual queue header - header.aql_slot_num = numSlots; - header.event_slot_num = dev().settings().numDeviceEvents_; - header.event_slot_mask = vqVA + eventMaskOffs; - header.event_slots = vqVA + eventsOffs; - header.aql_slot_mask = vqVA + slotMaskOffs; - header.wait_size = dev().settings().numWaitEvents_; - header.arg_size = dev().info().maxParameterSize_ + 64; - header.mask_groups = maskGroups_; - - amd::Coord3D origin_header(0); - amd::Coord3D region_header(sizeof(AmdVQueueHeader)); - - if (!dev().xferMgr().writeBuffer(&header, *vqMem, origin_header, region_header)) { - return false; - } + header->aql_slot_num = numSlots; + header->event_slot_num = dev().settings().numDeviceEvents_; + header->event_slot_mask = vqVA + eventMaskOffs; + header->event_slots = vqVA + eventsOffs; + header->aql_slot_mask = vqVA + slotMaskOffs; + header->wait_size = dev().settings().numWaitEvents_; + header->arg_size = dev().info().maxParameterSize_ + 64; + header->mask_groups = maskGroups_; // Go over all slots and perform initialization - AmdAqlWrap slot = {}; size_t offset = sizeof(AmdVQueueHeader); for (uint i = 0; i < numSlots; ++i) { + AmdAqlWrap * slot = reinterpret_cast(&shadow[0] + offset); uint64_t argStart = vqVA + argOffs + i * singleArgSize; - amd::Coord3D origin_slot(offset); - amd::Coord3D region_slot(sizeof(AmdAqlWrap)); - - slot.aql.kernarg_address = reinterpret_cast(argStart); - slot.wait_list = argStart + dev().info().maxParameterSize_ + 64; - if (!dev().xferMgr().writeBuffer(&slot, *vqMem, origin_slot, region_slot)) { - return false; - } + slot->aql.kernarg_address = reinterpret_cast(argStart); + slot->wait_list = argStart + dev().info().maxParameterSize_ + 64; offset += sizeof(AmdAqlWrap); } + amd::Coord3D origin (0, 0, 0); + amd::Coord3D region (allocSize, 1, 1); + + // copy the data structure from host to GPU + if (!dev().xferMgr().writeBuffer(&shadow[0], *vqMem, origin, region)) { + return false; + } + deviceQueueSize_ = deviceQueueSize; schedulerThreads_ = numSlots / (DeviceQueueMaskSize * maskGroups_); From 7ff366342ee7155c6244502dc8d304bbf23fbfc5 Mon Sep 17 00:00:00 2001 From: Ranjith Ramakrishnan Date: Thu, 18 Jan 2024 12:58:40 -0800 Subject: [PATCH 16/27] SWDEV-441937 - Use the disable linemarker option for generating the precompiled header file hipRTC The precompiled header files have hard coded paths in comments. Using the disable linemarker option(-P) will skip the generation of comments Change-Id: Ifb134052996c343f5405e954784b4b2c286c36b1 --- hipamd/src/hip_embed_pch.sh | 2 +- hipamd/src/hiprtc/CMakeLists.txt | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/hipamd/src/hip_embed_pch.sh b/hipamd/src/hip_embed_pch.sh index 2d998407a..9b01da872 100755 --- a/hipamd/src/hip_embed_pch.sh +++ b/hipamd/src/hip_embed_pch.sh @@ -203,7 +203,7 @@ __hipRTC_header_size: EOF set -x - $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib --hip-version=4.4 -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -o $tmp/hiprtc && + $LLVM_DIR/bin/clang -O3 --hip-path=$HIP_INC_DIR/.. -std=c++14 -nogpulib --hip-version=4.4 -isystem $HIP_INC_DIR -isystem $HIP_BUILD_INC_DIR -isystem $HIP_AMD_INC_DIR --cuda-device-only -D__HIPCC_RTC__ -x hip $tmp/hipRTC_header.h -E -P -o $tmp/hiprtc && cat $macroFile >> $tmp/hiprtc && $LLVM_DIR/bin/llvm-mc -o $tmp/hiprtc_header.o $tmp/hipRTC_header.mcin --filetype=obj && $LLVM_DIR/bin/clang $tmp/hiprtc_header.o -o $rtc_shared_lib_out -shared && diff --git a/hipamd/src/hiprtc/CMakeLists.txt b/hipamd/src/hiprtc/CMakeLists.txt index b1745a02f..6565e25ec 100644 --- a/hipamd/src/hiprtc/CMakeLists.txt +++ b/hipamd/src/hiprtc/CMakeLists.txt @@ -168,7 +168,7 @@ generate_hiprtc_mcin("${HIPRTC_GEN_MCIN}" "${HIPRTC_GEN_PREPROCESSED}") # Note: second command appends define macros at build time. add_custom_command( OUTPUT ${HIPRTC_GEN_PREPROCESSED} - COMMAND $ -O3 --rocm-path=${PROJECT_SOURCE_DIR}/include/.. -std=c++17 -nogpulib --hip-version=${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR} -isystem ${HIP_COMMON_INCLUDE_DIR} -isystem ${PROJECT_SOURCE_DIR}/include -isystem ${PROJECT_BINARY_DIR}/include -isystem ${CMAKE_CURRENT_SOURCE_DIR}/include --cuda-device-only -D__HIPCC_RTC__ -DHIP_VERSION_MAJOR=${HIP_LIB_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_LIB_VERSION_MINOR} -x hip ${HIPRTC_GEN_HEADER} -E -o ${HIPRTC_GEN_PREPROCESSED} + COMMAND $ -O3 --rocm-path=${PROJECT_SOURCE_DIR}/include/.. -std=c++17 -nogpulib --hip-version=${HIP_LIB_VERSION_MAJOR}.${HIP_LIB_VERSION_MINOR} -isystem ${HIP_COMMON_INCLUDE_DIR} -isystem ${PROJECT_SOURCE_DIR}/include -isystem ${PROJECT_BINARY_DIR}/include -isystem ${CMAKE_CURRENT_SOURCE_DIR}/include --cuda-device-only -D__HIPCC_RTC__ -DHIP_VERSION_MAJOR=${HIP_LIB_VERSION_MAJOR} -DHIP_VERSION_MINOR=${HIP_LIB_VERSION_MINOR} -x hip ${HIPRTC_GEN_HEADER} -E -P -o ${HIPRTC_GEN_PREPROCESSED} COMMAND ${CMAKE_COMMAND} -DHIPRTC_ADD_MACROS=1 -DHIPRTC_HEADERS="${HIPRTC_HEADERS}" -DHIPRTC_PREPROCESSED_FILE=${HIPRTC_GEN_PREPROCESSED} -P ${HIPRTC_CMAKE} DEPENDS clang ${HIPRTC_GEN_HEADER}) add_custom_command( From 768168142b2f4312727b92ce67a601a2ce89ca3c Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Mon, 22 Jan 2024 18:12:33 -0500 Subject: [PATCH 17/27] SWDEV-311271 - Move mempool clean-up into hipEventSynchronize Generic event synchronize can be called from mempool implementation directly. Change-Id: Id66ab9cdab486390df4dd405a4ac291eb0bb568f --- hipamd/src/hip_event.cpp | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/hipamd/src/hip_event.cpp b/hipamd/src/hip_event.cpp index d233e4243..f527766e4 100644 --- a/hipamd/src/hip_event.cpp +++ b/hipamd/src/hip_event.cpp @@ -85,8 +85,6 @@ hipError_t Event::synchronize() { event_->awaitCompletion(); } } - // Release freed memory for all memory pools on the device - hip_device->ReleaseFreedMemory(); return hipSuccess; } @@ -443,7 +441,12 @@ hipError_t hipEventSynchronize(hipEvent_t event) { if (hip::Stream::StreamCaptureOngoing(e->GetCaptureStream()) == true) { HIP_RETURN(hipErrorStreamCaptureUnsupported); } - HIP_RETURN(e->synchronize()); + + hipError_t status = e->synchronize(); + // Release freed memory for all memory pools on the device + g_devices[e->deviceId()]->ReleaseFreedMemory(); + + HIP_RETURN(status); } hipError_t ihipEventQuery(hipEvent_t event) { From b5224f25c8e1b9ba23b52c0eb3325608a3f0b6b1 Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Thu, 18 Jan 2024 00:44:39 +0000 Subject: [PATCH 18/27] SWDEV-425605 - Add new comgr compile to reloc use AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE action to compile source to realoc. Currently we have source->bc, link->bc and bc->realoc. This new action replaces the three steps with one. Change-Id: I8089cbef681e079702fefc2d2085a23bc3578d02 --- hipamd/src/hiprtc/hiprtcComgrHelper.cpp | 91 +++++++++++++++++++++++-- hipamd/src/hiprtc/hiprtcComgrHelper.hpp | 8 ++- hipamd/src/hiprtc/hiprtcInternal.cpp | 77 +++------------------ hipamd/src/hiprtc/hiprtcInternal.hpp | 16 ++--- 4 files changed, 109 insertions(+), 83 deletions(-) diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp index debecbe91..d213afe0f 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.cpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.cpp @@ -585,6 +585,87 @@ bool createAction(amd_comgr_action_info_t& action, std::vector& opt return AMD_COMGR_STATUS_SUCCESS; } +bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, + std::vector& compileOptions, + std::vector& linkOptions, std::string& buildLog, + std::vector& exe) { + amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_HIP; + amd_comgr_action_info_t action; + amd_comgr_data_set_t reloc; + amd_comgr_data_set_t output; + amd_comgr_data_set_t input = compileInputs; + + if (auto res = createAction(action, compileOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { + return false; + } + + if (auto res = amd::Comgr::create_data_set(&reloc); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + return false; + } + + if (auto res = amd::Comgr::create_data_set(&output); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_COMPILE_SOURCE_TO_RELOCATABLE, action, + input, reloc); + res != AMD_COMGR_STATUS_SUCCESS) { + extractBuildLog(reloc, buildLog); + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + if (!extractBuildLog(reloc, buildLog)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + amd::Comgr::destroy_action_info(action); + if (auto res = createAction(action, linkOptions, isa, lang); res != AMD_COMGR_STATUS_SUCCESS) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(reloc); + amd::Comgr::destroy_data_set(output); + return false; + } + + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_RELOCATABLE_TO_EXECUTABLE, action, + reloc, output); + res != AMD_COMGR_STATUS_SUCCESS) { + extractBuildLog(output, buildLog); + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (!extractBuildLog(output, buildLog)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + if (!extractByteCodeBinary(output, AMD_COMGR_DATA_KIND_EXECUTABLE, exe)) { + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return false; + } + + // Clean up + amd::Comgr::destroy_action_info(action); + amd::Comgr::destroy_data_set(output); + amd::Comgr::destroy_data_set(reloc); + return true; +} + bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode) { @@ -646,8 +727,7 @@ bool linkLLVMBitcode(const amd_comgr_data_set_t linkInputs, const std::string& i return false; } - if (auto res = - amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); + if (auto res = amd::Comgr::do_action(AMD_COMGR_ACTION_LINK_BC_TO_BC, action, linkInputs, output); res != AMD_COMGR_STATUS_SUCCESS) { amd::Comgr::destroy_action_info(action); amd::Comgr::destroy_data_set(output); @@ -915,9 +995,9 @@ bool fillMangledNames(std::vector& dataVec, std::map(it.first.data()); + char* data = const_cast(it.first.data()); if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, NULL)) { amd::Comgr::release_data(dataObject); @@ -925,7 +1005,8 @@ bool fillMangledNames(std::vector& dataVec, std::map mName(new char[Size]()); - if (auto res = amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { + if (auto res = + amd::Comgr::map_name_expression_to_symbol_name(dataObject, &Size, data, mName.get())) { amd::Comgr::release_data(dataObject); return false; } diff --git a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp index d34c9264c..05e1c013d 100644 --- a/hipamd/src/hiprtc/hiprtcComgrHelper.hpp +++ b/hipamd/src/hiprtc/hiprtcComgrHelper.hpp @@ -41,6 +41,10 @@ bool extractByteCodeBinary(const amd_comgr_data_set_t inDataSet, bool createAction(amd_comgr_action_info_t& action, std::vector& options, const std::string& isa, const amd_comgr_language_t lang = AMD_COMGR_LANGUAGE_NONE); +bool compileToExecutable(const amd_comgr_data_set_t compileInputs, const std::string& isa, + std::vector& compileOptions, + std::vector& linkOptions, std::string& buildLog, + std::vector& exe); bool compileToBitCode(const amd_comgr_data_set_t compileInputs, const std::string& isa, std::vector& compileOptions, std::string& buildLog, std::vector& LLVMBitcode); @@ -54,8 +58,8 @@ bool dumpIsaFromBC(const amd_comgr_data_set_t isaInputs, const std::string& isa, std::vector& exeOptions, std::string name, std::string& buildLog); bool demangleName(const std::string& mangledName, std::string& demangledName); std::string handleMangledName(std::string loweredName); -bool fillMangledNames(std::vector& executable, std::map& mangledNames, - bool isBitcode); +bool fillMangledNames(std::vector& executable, + std::map& mangledNames, bool isBitcode); void GenerateUniqueFileName(std::string& name); } // namespace helpers } // namespace hiprtc diff --git a/hipamd/src/hiprtc/hiprtcInternal.cpp b/hipamd/src/hiprtc/hiprtcInternal.cpp index 46a034c82..b4d10e0ee 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.cpp +++ b/hipamd/src/hiprtc/hiprtcInternal.cpp @@ -116,7 +116,6 @@ bool RTCProgram::findIsa() { // RTC Compile Program Member Functions void RTCProgram::AppendOptions(const std::string app_env_var, std::vector* options) { - if (options == nullptr) { LogError("Append options passed is nullptr."); return; @@ -261,10 +260,6 @@ bool RTCCompileProgram::transformOptions(std::vector& compile_optio i = "--offload-arch=" + val; continue; } - if (i == "--save-temps") { - settings_.dumpISA = true; - continue; - } } // Removed consumed options @@ -300,78 +295,28 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg compileOpts.reserve(compile_options_.size() + options.size() + 2); compileOpts.insert(compileOpts.end(), options.begin(), options.end()); - if (!fgpu_rdc_) { - compileOpts.push_back("-Xclang"); - compileOpts.push_back("-disable-llvm-passes"); - } - if (!transformOptions(compileOpts)) { LogError("Error in hiprtc: unable to transform options"); return false; } - if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { - LogError("Error in hiprtc: unable to compile source to bitcode"); - return false; - } - - if (fgpu_rdc_ && !mangled_names_.empty()) { - if (!fillMangledNames(LLVMBitcode_, mangled_names_, true)) { - LogError("Error in hiprtc: unable to fill mangled names"); + if (fgpu_rdc_) { + if (!compileToBitCode(compile_input_, isa_, compileOpts, build_log_, LLVMBitcode_)) { + LogError("Error in hiprtc: unable to compile source to bitcode"); return false; } - - return true; - } - - std::string linkFileName = "linked"; - if (!addCodeObjData(link_input_, LLVMBitcode_, linkFileName, AMD_COMGR_DATA_KIND_BC)) { - LogError("Error in hiprtc: unable to add linked code object"); - return false; - } - - std::vector LinkedLLVMBitcode; - if (!linkLLVMBitcode(link_input_, isa_, link_options_, build_log_, LinkedLLVMBitcode)) { - LogError("Error in hiprtc: unable to add device libs to linked bitcode"); - return false; - } - - std::string linkedFileName = "LLVMBitcode.bc"; - if (!addCodeObjData(exec_input_, LinkedLLVMBitcode, linkedFileName, AMD_COMGR_DATA_KIND_BC)) { - LogError("Error in hiprtc: unable to add device libs linked code object"); - return false; - } - - std::vector exe_options; - // Find the options passed by the app which can be used during BC to Relocatable phase. - if (!findExeOptions(options, exe_options)) { - LogError("Error in hiprtc: unable to find executable options"); - return false; - } - - std::vector exeOpts(exe_options_); - exeOpts.reserve(exeOpts.size() + exe_options.size() + 2); - // Add these below options by default for optimizations during BC to Relocatable phase. - exeOpts.push_back("-mllvm"); - exeOpts.push_back("-amdgpu-internalize-symbols"); - // User provided options are appended at the end since they can override the above - // default options if necessary - exeOpts.insert(exeOpts.end(), exe_options.begin(), exe_options.end()); - - if (settings_.dumpISA) { - if (!dumpIsaFromBC(exec_input_, isa_, exeOpts, name_, build_log_)) { - LogError("Error in hiprtc: unable to dump isa code"); + } else { + LogInfo("Using the new path of comgr"); + if (!compileToExecutable(compile_input_, isa_, compileOpts, link_options_, build_log_, + executable_)) { + LogError("Failing to compile to realloc"); return false; } } - if (!createExecutable(exec_input_, isa_, exeOpts, build_log_, executable_)) { - LogError("Error in hiprtc: unable to create executable"); - return false; - } - if (!mangled_names_.empty()) { - if (!fillMangledNames(executable_, mangled_names_, false)) { + auto& compile_step_output = fgpu_rdc_ ? LLVMBitcode_ : executable_; + if (!fillMangledNames(compile_step_output, mangled_names_, fgpu_rdc_)) { LogError("Error in hiprtc: unable to fill mangled names"); return false; } @@ -380,6 +325,7 @@ bool RTCCompileProgram::compile(const std::vector& options, bool fg return true; } + void RTCCompileProgram::stripNamedExpression(std::string& strippedName) { if (strippedName.back() == ')') { strippedName.pop_back(); @@ -453,7 +399,6 @@ RTCLinkProgram::RTCLinkProgram(std::string name) : RTCProgram(name) { bool RTCLinkProgram::AddLinkerOptions(unsigned int num_options, hiprtcJIT_option* options_ptr, void** options_vals_ptr) { for (size_t opt_idx = 0; opt_idx < num_options; ++opt_idx) { - switch (options_ptr[opt_idx]) { case HIPRTC_JIT_MAX_REGISTERS: link_args_.max_registers_ = *(reinterpret_cast(&options_vals_ptr[opt_idx])); diff --git a/hipamd/src/hiprtc/hiprtcInternal.hpp b/hipamd/src/hiprtc/hiprtcInternal.hpp index a1965d1b1..5e196b7de 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.hpp +++ b/hipamd/src/hiprtc/hiprtcInternal.hpp @@ -76,8 +76,9 @@ static amd::Monitor g_hiprtcInitlock{"hiprtcInit lock"}; #define HIPRTC_INIT_API_INTERNAL(...) \ amd::Thread* thread = amd::Thread::current(); \ if (!VDI_CHECK_THREAD(thread)) { \ - ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, "An internal error has occurred." \ - " This may be due to insufficient memory."); \ + ClPrint(amd::LOG_NONE, amd::LOG_ALWAYS, \ + "An internal error has occurred." \ + " This may be due to insufficient memory."); \ HIPRTC_RETURN(HIPRTC_ERROR_INTERNAL_ERROR); \ } \ amd::ScopedLock lock(g_hiprtcInitlock); \ @@ -107,7 +108,6 @@ static void crashWithMessage(std::string message) { } struct Settings { - bool dumpISA{false}; bool offloadArchProvided{false}; }; @@ -156,10 +156,8 @@ class RTCCompileProgram : public RTCProgram { bool addBuiltinHeader(); bool transformOptions(std::vector& compile_options); bool findExeOptions(const std::vector& options, - std::vector& exe_options); - void AppendCompileOptions() { - AppendOptions(HIPRTC_COMPILE_OPTIONS_APPEND, &compile_options_); - } + std::vector& exe_options); + void AppendCompileOptions() { AppendOptions(HIPRTC_COMPILE_OPTIONS_APPEND, &compile_options_); } RTCCompileProgram() = delete; RTCCompileProgram(RTCCompileProgram&) = delete; @@ -288,9 +286,7 @@ class RTCLinkProgram : public RTCProgram { bool AddLinkerData(void* image_ptr, size_t image_size, std::string link_file_name, hiprtcJITInputType input_type); bool LinkComplete(void** bin_out, size_t* size_out); - void AppendLinkerOptions() { - AppendOptions(HIPRTC_LINK_OPTIONS_APPEND, &link_options_); - } + void AppendLinkerOptions() { AppendOptions(HIPRTC_LINK_OPTIONS_APPEND, &link_options_); } }; // Thread Local Storage Variables Aggregator Class From e657b5a5fd441fd60c56c020e820b93a8e2a78a7 Mon Sep 17 00:00:00 2001 From: German Andryeyev Date: Mon, 22 Jan 2024 15:45:01 -0500 Subject: [PATCH 19/27] SWDEV-441482 - Add missing entry point hipDestroyExternalSemaphore_fn was missing initialization Change-Id: Ieab019afd0a0c62a2aa534fb262436edda3499db --- hipamd/src/hip_api_trace.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/hipamd/src/hip_api_trace.cpp b/hipamd/src/hip_api_trace.cpp index 2ec7ab86d..919e148f4 100644 --- a/hipamd/src/hip_api_trace.cpp +++ b/hipamd/src/hip_api_trace.cpp @@ -808,6 +808,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) { ptrDispatchTable->hipCtxSetSharedMemConfig_fn = hip::hipCtxSetSharedMemConfig; ptrDispatchTable->hipCtxSynchronize_fn = hip::hipCtxSynchronize; ptrDispatchTable->hipDestroyExternalMemory_fn = hip::hipDestroyExternalMemory; + ptrDispatchTable->hipDestroyExternalSemaphore_fn = hip::hipDestroyExternalSemaphore; ptrDispatchTable->hipDestroySurfaceObject_fn = hip::hipDestroySurfaceObject; ptrDispatchTable->hipDestroyTextureObject_fn = hip::hipDestroyTextureObject; ptrDispatchTable->hipDeviceCanAccessPeer_fn = hip::hipDeviceCanAccessPeer; From 74edd40d26b049d7e9fd39faade8a6a83915f6df Mon Sep 17 00:00:00 2001 From: Jatin Chaudhary Date: Fri, 19 Jan 2024 17:30:44 +0000 Subject: [PATCH 20/27] SWDEV-425605 - remove exe_options, it was unused Change-Id: I0651f6a836d2f063caf651520de96d7675428771 --- hipamd/src/hiprtc/hiprtcInternal.cpp | 2 -- hipamd/src/hiprtc/hiprtcInternal.hpp | 1 - 2 files changed, 3 deletions(-) diff --git a/hipamd/src/hiprtc/hiprtcInternal.cpp b/hipamd/src/hiprtc/hiprtcInternal.cpp index b4d10e0ee..89aa9064e 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.cpp +++ b/hipamd/src/hiprtc/hiprtcInternal.cpp @@ -170,8 +170,6 @@ RTCCompileProgram::RTCCompileProgram(std::string name_) : RTCProgram(name_), fgp compile_options_.push_back("-fms-compatibility"); #endif AppendCompileOptions(); - - exe_options_.push_back("-O3"); } bool RTCCompileProgram::addSource(const std::string& source, const std::string& name) { diff --git a/hipamd/src/hiprtc/hiprtcInternal.hpp b/hipamd/src/hiprtc/hiprtcInternal.hpp index 5e196b7de..aae8b64f4 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.hpp +++ b/hipamd/src/hiprtc/hiprtcInternal.hpp @@ -131,7 +131,6 @@ class RTCProgram { std::vector executable_; amd_comgr_data_set_t exec_input_; - std::vector exe_options_; }; class RTCCompileProgram : public RTCProgram { From eeece5432afe836a0a2491977b1e63cdad4216eb Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Wed, 3 Jan 2024 15:06:54 +0000 Subject: [PATCH 21/27] SWDEV-439637 - Fix undefined symbol error(github id- #3382) Change-Id: Ic7812e21618f48c737ce2c51b22a153099cc5d66 --- hipamd/include/hip/amd_detail/hip_api_trace.hpp | 2 ++ hipamd/src/amdhip.def | 1 - hipamd/src/hip_api_trace.cpp | 2 ++ hipamd/src/hip_hcc.map.in | 1 - hipamd/src/hip_table_interface.cpp | 3 +++ 5 files changed, 7 insertions(+), 2 deletions(-) diff --git a/hipamd/include/hip/amd_detail/hip_api_trace.hpp b/hipamd/include/hip/amd_detail/hip_api_trace.hpp index 7bae717bf..21f37258d 100644 --- a/hipamd/include/hip/amd_detail/hip_api_trace.hpp +++ b/hipamd/include/hip/amd_detail/hip_api_trace.hpp @@ -936,6 +936,7 @@ typedef hipError_t (*t_hipGraphExecExternalSemaphoresWaitNodeSetParams)(hipGraph typedef hipError_t (*t_hipGraphAddNode)(hipGraphNode_t *pGraphNode, hipGraph_t graph, const hipGraphNode_t *pDependencies, size_t numDependencies, hipGraphNodeParams *nodeParams); +typedef hipError_t (*t_hipExtGetLastError)(); // HIP Compiler dispatch table struct HipCompilerDispatchTable { @@ -1396,4 +1397,5 @@ struct HipDispatchTable { t_hipGraphExecExternalSemaphoresSignalNodeSetParams hipGraphExecExternalSemaphoresSignalNodeSetParams_fn; t_hipGraphExecExternalSemaphoresWaitNodeSetParams hipGraphExecExternalSemaphoresWaitNodeSetParams_fn; t_hipGraphAddNode hipGraphAddNode_fn; + t_hipExtGetLastError hipExtGetLastError_fn; }; diff --git a/hipamd/src/amdhip.def b/hipamd/src/amdhip.def index 2133815d8..f7ac5df29 100644 --- a/hipamd/src/amdhip.def +++ b/hipamd/src/amdhip.def @@ -427,7 +427,6 @@ hipMemcpy2DAsync_spt hipMemcpyFromSymbolAsync_spt hipMemcpyToSymbolAsync_spt hipMemcpyFromArray_spt -hipMemcpy2DToArray_spt hipMemcpy2DFromArrayAsync_spt hipMemcpy2DToArrayAsync_spt hipDrvGetErrorName diff --git a/hipamd/src/hip_api_trace.cpp b/hipamd/src/hip_api_trace.cpp index 919e148f4..c06682fbd 100644 --- a/hipamd/src/hip_api_trace.cpp +++ b/hipamd/src/hip_api_trace.cpp @@ -756,6 +756,7 @@ hipError_t hipGraphExternalSemaphoresWaitNodeSetParams( hipGraphNode_t hNode, const hipExternalSemaphoreWaitNodeParams* nodeParams); hipError_t hipModuleLaunchCooperativeKernelMultiDevice(hipFunctionLaunchParams* launchParamsList, unsigned int numDevices, unsigned int flags); +hipError_t hipExtGetLastError(); } // namespace hip namespace hip { @@ -1225,6 +1226,7 @@ void UpdateDispatchTable(HipDispatchTable* ptrDispatchTable) { ptrDispatchTable->hipGetStreamDeviceId_fn = hip::hipGetStreamDeviceId; ptrDispatchTable->hipDrvGraphAddMemsetNode_fn = hip::hipDrvGraphAddMemsetNode; ptrDispatchTable->hipGetDevicePropertiesR0000_fn = hip::hipGetDevicePropertiesR0000; + ptrDispatchTable->hipExtGetLastError_fn = hip::hipExtGetLastError; } #if HIP_ROCPROFILER_REGISTER > 0 diff --git a/hipamd/src/hip_hcc.map.in b/hipamd/src/hip_hcc.map.in index b8f0b3da6..fc7d91e8b 100644 --- a/hipamd/src/hip_hcc.map.in +++ b/hipamd/src/hip_hcc.map.in @@ -486,7 +486,6 @@ global: hipMemcpyFromSymbolAsync_spt; hipMemcpyToSymbolAsync_spt; hipMemcpyFromArray_spt; - hipMemcpy2DToArray_spt; hipMemcpy2DFromArrayAsync_spt; hipMemcpy2DToArrayAsync_spt; hipDrvGetErrorName; diff --git a/hipamd/src/hip_table_interface.cpp b/hipamd/src/hip_table_interface.cpp index 8324e4237..81d681949 100644 --- a/hipamd/src/hip_table_interface.cpp +++ b/hipamd/src/hip_table_interface.cpp @@ -1711,3 +1711,6 @@ hipError_t hipLaunchHostFunc_spt(hipStream_t stream, hipHostFn_t fn, void* userD extern "C" int hipGetStreamDeviceId(hipStream_t stream) { return hip::GetHipDispatchTable()->hipGetStreamDeviceId_fn(stream); } +hipError_t hipExtGetLastError() { + return hip::GetHipDispatchTable()->hipExtGetLastError_fn(); +} From a99b163eba57947e77e583494537717a52bc3803 Mon Sep 17 00:00:00 2001 From: Anusha GodavarthySurya Date: Mon, 8 Jan 2024 17:42:55 +0000 Subject: [PATCH 22/27] SWDEV-439628 - hipGraphExecKernelNodeSetParams to update graph kernel node params with graph performance optimizations. During hipGraphExecKernelNodeSetParams kernel function can also be updated. Hence size required for kernel parameters differs from what is allocated during graphInstantiation. So, create new 128KB kernel pool and allocate kernel args from the pool. If the pool is full create new 128KB pool. Release kernel pools when graph exec object is destroyed. Change-Id: I9567946d63400c79cbfd4c5439c654c92557ceae --- hipamd/src/hip_context.cpp | 6 ++-- hipamd/src/hip_device.cpp | 8 +++-- hipamd/src/hip_graph.cpp | 10 +++++- hipamd/src/hip_graph_internal.cpp | 55 +++++++++++++++++++++++++++---- hipamd/src/hip_graph_internal.hpp | 36 ++++++++++++-------- hipamd/src/hip_internal.hpp | 4 +-- 6 files changed, 91 insertions(+), 28 deletions(-) diff --git a/hipamd/src/hip_context.cpp b/hipamd/src/hip_context.cpp index 47569f066..4b7f5c809 100644 --- a/hipamd/src/hip_context.cpp +++ b/hipamd/src/hip_context.cpp @@ -90,7 +90,7 @@ void setCurrentDevice(unsigned int index) { hip::Stream* getStream(hipStream_t stream, bool wait) { if (stream == nullptr) { - return getNullStream(); + return getNullStream(wait); } else { hip::Stream* hip_stream = reinterpret_cast(stream); if (wait && !(hip_stream->Flags() & hipStreamNonBlocking)) { @@ -128,9 +128,9 @@ int getDeviceID(amd::Context& ctx) { } // ================================================================================================ -hip::Stream* getNullStream() { +hip::Stream* getNullStream(bool wait ) { Device* device = getCurrentDevice(); - return device ? device->NullStream() : nullptr; + return device ? device->NullStream(wait) : nullptr; } hipError_t hipInit(unsigned int flags) { diff --git a/hipamd/src/hip_device.cpp b/hipamd/src/hip_device.cpp index 2053461af..bcf6830f1 100644 --- a/hipamd/src/hip_device.cpp +++ b/hipamd/src/hip_device.cpp @@ -30,7 +30,7 @@ namespace hip { // ================================================================================================ -hip::Stream* Device::NullStream() { +hip::Stream* Device::NullStream(bool wait) { if (null_stream_ == nullptr) { null_stream_ = new Stream(this, Stream::Priority::Normal, 0, true); } @@ -38,8 +38,10 @@ hip::Stream* Device::NullStream() { if (null_stream_ == nullptr) { return nullptr; } - // Wait for all active streams before executing commands on the default - iHipWaitActiveStreams(null_stream_); + if (wait == true) { + // Wait for all active streams before executing commands on the default + iHipWaitActiveStreams(null_stream_); + } return null_stream_; } diff --git a/hipamd/src/hip_graph.cpp b/hipamd/src/hip_graph.cpp index 9e35fe037..89b3190fd 100644 --- a/hipamd/src/hip_graph.cpp +++ b/hipamd/src/hip_graph.cpp @@ -1552,7 +1552,15 @@ hipError_t hipGraphExecKernelNodeSetParams(hipGraphExec_t hGraphExec, hipGraphNo if (clonedNode == nullptr) { HIP_RETURN(hipErrorInvalidValue); } - HIP_RETURN(reinterpret_cast(clonedNode)->SetParams(pNodeParams)); + hipError_t status = reinterpret_cast(clonedNode)->SetParams(pNodeParams); + if(status != hipSuccess) { + HIP_RETURN(status); + } + if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { + status = reinterpret_cast(hGraphExec) + ->UpdateAQLPacket(reinterpret_cast(clonedNode)); + } + HIP_RETURN(status); } hipError_t hipGraphChildGraphNodeGetGraph(hipGraphNode_t node, hipGraph_t* pGraph) { diff --git a/hipamd/src/hip_graph_internal.cpp b/hipamd/src/hip_graph_internal.cpp index ee70f8b1b..f3a5be088 100644 --- a/hipamd/src/hip_graph_internal.cpp +++ b/hipamd/src/hip_graph_internal.cpp @@ -332,6 +332,9 @@ hipError_t GraphExec::CreateStreams(uint32_t num_streams) { } parallel_streams_.push_back(stream); } + // Don't wait for other streams to finish. + // Capture stream is to capture AQL packet. + capture_stream_ = hip::getNullStream(false); return hipSuccess; } @@ -353,13 +356,10 @@ hipError_t GraphExec::CaptureAQLPackets() { hipError_t status = hipSuccess; if (parallelLists_.size() == 1) { size_t kernArgSizeForGraph = 0; - hip::Stream* stream = nullptr; // GPU packet capture is enabled for kernel nodes. Calculate the kernel // arg size required for all graph kernel nodes to allocate for (const auto& list : parallelLists_) { - stream = GetAvailableStreams(); for (auto& node : list) { - node->SetStream(stream, this); if (node->GetType() == hipGraphNodeTypeKernel) { kernArgSizeForGraph += reinterpret_cast(node)->GetKerArgSize(); } @@ -386,7 +386,6 @@ hipError_t GraphExec::CaptureAQLPackets() { for (auto& node : topoOrder_) { if (node->GetType() == hipGraphNodeTypeKernel) { auto kernelNode = reinterpret_cast(node); - status = node->CreateCommand(node->GetQueue()); // From the kernel pool allocate the kern arg size required for the current kernel node. address kernArgOffset = allocKernArg(kernelNode->GetKernargSegmentByteSize(), kernelNode->GetKernargSegmentAlignment()); @@ -394,7 +393,7 @@ hipError_t GraphExec::CaptureAQLPackets() { return hipErrorMemoryAllocation; } // Form GPU packet capture for the kernel node. - kernelNode->CaptureAndFormPacket(kernArgOffset); + kernelNode->CaptureAndFormPacket(capture_stream_, kernArgOffset) ; } } @@ -408,7 +407,7 @@ hipError_t GraphExec::CaptureAQLPackets() { address dev_ptr = kernarg_pool_graph_ + kernarg_pool_size_graph_ - sizeof(int); *dev_ptr = host_val; if (device->info().hdpMemFlushCntl == nullptr) { - amd::Command* command = new amd::Marker(*stream, true); + amd::Command* command = new amd::Marker(*capture_stream_, true); if (command != nullptr) { command->enqueue(); command->release(); @@ -426,6 +425,50 @@ hipError_t GraphExec::CaptureAQLPackets() { return status; } +hipError_t GraphExec::UpdateAQLPacket(hip::GraphKernelNode* node) { + if (parallelLists_.size() == 1) { + size_t pool_new_usage = 0; + address result = nullptr; + if (!kernarg_graph_.empty()) { + // 1. Allocate memory for the kernel args + size_t kernArgSizeForNode = 0; + kernArgSizeForNode = node->GetKerArgSize(); + + result = amd::alignUp(kernarg_graph_.back() + kernarg_graph_cur_offset_, + node->GetKernargSegmentAlignment()); + pool_new_usage = (result + kernArgSizeForNode) - kernarg_graph_.back(); + } + if (pool_new_usage != 0 && pool_new_usage <= kernarg_graph_size_) { + kernarg_graph_cur_offset_ = pool_new_usage; + } else { + address kernarg_graph; + auto device = g_devices[ihipGetDevice()]->devices()[0]; + if (device->info().largeBar_) { + kernarg_graph = reinterpret_cast
(device->deviceLocalAlloc(kernarg_graph_size_)); + } else { + kernarg_graph = reinterpret_cast
( + device->hostAlloc(kernarg_graph_size_, 0, amd::Device::MemorySegment::kKernArg)); + } + kernarg_graph_.push_back(kernarg_graph); + kernarg_graph_cur_offset_ = 0; + + // 1. Allocate memory for the kernel args + size_t kernArgSizeForNode = 0; + kernArgSizeForNode = node->GetKerArgSize(); + result = amd::alignUp(kernarg_graph_.back() + kernarg_graph_cur_offset_, + node->GetKernargSegmentAlignment()); + const size_t pool_new_usage = (result + kernArgSizeForNode) - kernarg_graph_.back(); + if (pool_new_usage <= kernarg_graph_size_) { + kernarg_graph_cur_offset_ = pool_new_usage; + } + } + + // 2. copy kernel args / create new AQL packet + node->CaptureAndFormPacket(capture_stream_, result); + } + return hipSuccess; +} + hipError_t FillCommands(std::vector>& parallelLists, std::unordered_map>& nodeWaitLists, std::vector& topoOrder, Graph* clonedGraph, diff --git a/hipamd/src/hip_graph_internal.hpp b/hipamd/src/hip_graph_internal.hpp index 7f6543cb9..2229b75d8 100644 --- a/hipamd/src/hip_graph_internal.hpp +++ b/hipamd/src/hip_graph_internal.hpp @@ -543,7 +543,7 @@ struct Graph { graphInstantiated_ = graphInstantiate; } }; - +struct GraphKernelNode; struct GraphExec { std::vector> parallelLists_; // Topological order of the graph doesn't include nodes embedded as part of the child graph @@ -551,6 +551,7 @@ struct GraphExec { std::unordered_map> nodeWaitLists_; struct Graph* clonedGraph_; std::vector parallel_streams_; + hip::Stream* capture_stream_; uint currentQueueIndex_; std::unordered_map clonedNodes_; amd::Command* lastEnqueuedCommand_; @@ -563,6 +564,10 @@ struct GraphExec { address kernarg_pool_graph_ = nullptr; uint32_t kernarg_pool_size_graph_ = 0; uint32_t kernarg_pool_cur_graph_offset_ = 0; + std::vector
kernarg_graph_; + uint32_t kernarg_graph_cur_offset_ = 0; + uint32_t kernarg_graph_size_ = 128 * Ki; + public: GraphExec(std::vector& topoOrder, std::vector>& lists, std::unordered_map>& nodeWaitLists, struct Graph*& clonedGraph, @@ -591,6 +596,9 @@ struct GraphExec { auto device = g_devices[ihipGetDevice()]->devices()[0]; if (DEBUG_CLR_GRAPH_PACKET_CAPTURE) { device->hostFree(kernarg_pool_graph_, kernarg_pool_size_graph_); + for (auto& element : kernarg_graph_) { + device->hostFree(element, kernarg_graph_size_); + } } amd::ScopedLock lock(graphExecSetLock_); graphExecSet_.erase(this); @@ -636,6 +644,7 @@ struct GraphExec { hipError_t Run(hipStream_t stream); // Capture GPU Packets from graph commands hipError_t CaptureAQLPackets(); + hipError_t UpdateAQLPacket(hip::GraphKernelNode* node); }; struct ChildGraphNode : public GraphNode { @@ -793,19 +802,20 @@ class GraphKernelNode : public GraphNode { out << "];"; } - void CaptureAndFormPacket(address kernArgOffset) { - for (auto& command : commands_) { - reinterpret_cast(command)->setCapturingState( - true, GetAqlPacket(), kernArgOffset); - - // Enqueue command to capture GPU Packet. The packet is not submitted to the device. - // The packet is stored in gpuPacket_ and submitted during graph launch. - command->submit(*(command->queue())->vdev()); - // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes - SetKernelName(reinterpret_cast(command)->kernel().name()); - command->release(); - } + void CaptureAndFormPacket(hip::Stream* capture_stream, address kernArgOffset) { + hipError_t status = CreateCommand(capture_stream); + for (auto& command : commands_) { + reinterpret_cast(command)->setCapturingState( + true, GetAqlPacket(), kernArgOffset); + + // Enqueue command to capture GPU Packet. The packet is not submitted to the device. + // The packet is stored in gpuPacket_ and submitted during graph launch. + command->submit(*(command->queue())->vdev()); + // Need to ensure if the command is NDRangeKernelCommand if we capture non kernel nodes + SetKernelName(reinterpret_cast(command)->kernel().name()); + command->release(); } + } std::string GetLabel(hipGraphDebugDotFlags flag) { hipFunction_t func = getFunc(kernelParams_, ihipGetDevice()); diff --git a/hipamd/src/hip_internal.hpp b/hipamd/src/hip_internal.hpp index 88b12d845..d3d38178f 100644 --- a/hipamd/src/hip_internal.hpp +++ b/hipamd/src/hip_internal.hpp @@ -487,7 +487,7 @@ class stream_per_thread { void setFlags(unsigned int flags) { flags_ = flags; } void Reset(); - hip::Stream* NullStream(); + hip::Stream* NullStream(bool wait = true); Stream* GetNullStream() const {return null_stream_;}; void SetActiveStatus() { @@ -572,7 +572,7 @@ class stream_per_thread { /// Get default stream associated with the ROCclr context extern hip::Stream* getNullStream(amd::Context&); /// Get default stream of the thread - extern hip::Stream* getNullStream(); + extern hip::Stream* getNullStream(bool wait = true); /// Get device ID associated with the ROCclr context int getDeviceID(amd::Context& ctx); /// Check if stream is valid From 47f90e1199a06d7352b9c648b20c56cda30d49f3 Mon Sep 17 00:00:00 2001 From: Ajay Date: Thu, 11 Jan 2024 12:08:09 -0800 Subject: [PATCH 23/27] SWDEV-440718 - Suffix hip bin name with _6. Link to amd_comgr_2.dll Change-Id: I5d21af0cc91f2082099273f75ebe1c3279b6365d --- hipamd/src/CMakeLists.txt | 3 +++ hipamd/src/hiprtc/hiprtcInternal.cpp | 3 ++- rocclr/device/comgrctx.cpp | 6 ++++-- 3 files changed, 9 insertions(+), 3 deletions(-) diff --git a/hipamd/src/CMakeLists.txt b/hipamd/src/CMakeLists.txt index b038fd72d..b2bf28bba 100644 --- a/hipamd/src/CMakeLists.txt +++ b/hipamd/src/CMakeLists.txt @@ -59,6 +59,9 @@ list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_LIST_DIR}/cmake") if(BUILD_SHARED_LIBS) add_library(amdhip64 SHARED) + if(WIN32) + set_target_properties(amdhip64 PROPERTIES RUNTIME_OUTPUT_NAME "amdhip64_${HIP_VERSION_MAJOR}") + endif() # Windows doesn't have a strip utility, so CMAKE_STRIP won't be set. if((CMAKE_BUILD_TYPE STREQUAL "Release") AND NOT ("${CMAKE_STRIP}" STREQUAL "")) add_custom_command(TARGET amdhip64 POST_BUILD COMMAND ${CMAKE_STRIP} $) diff --git a/hipamd/src/hiprtc/hiprtcInternal.cpp b/hipamd/src/hiprtc/hiprtcInternal.cpp index 89aa9064e..ac02aa107 100644 --- a/hipamd/src/hiprtc/hiprtcInternal.cpp +++ b/hipamd/src/hiprtc/hiprtcInternal.cpp @@ -59,7 +59,8 @@ RTCProgram::RTCProgram(std::string name) : name_(name) { bool RTCProgram::findIsa() { const char* libName; #ifdef _WIN32 - libName = "amdhip64.dll"; + std::string dll_name = std::string("amdhip64_" + std::to_string(HIP_VERSION_MAJOR) + ".dll"); + libName = dll_name.c_str(); #else libName = "libamdhip64.so"; #endif diff --git a/rocclr/device/comgrctx.cpp b/rocclr/device/comgrctx.cpp index 67184ca94..1a8afe60a 100644 --- a/rocclr/device/comgrctx.cpp +++ b/rocclr/device/comgrctx.cpp @@ -57,9 +57,11 @@ bool Comgr::LoadLib(bool is_versioned) { cep_.handle = Os::loadLibrary(comgr_lib_name); #endif } else { - static constexpr const char* comgr_lib_name = + std::string comgr_major_dll = "amd_comgr_" + + std::to_string(AMD_COMGR_INTERFACE_VERSION_MAJOR) + ".dll"; + static const char* comgr_lib_name = LP64_SWITCH(WINDOWS_SWITCH("amd_comgr32.dll", "libamd_comgr32.so.2"), - WINDOWS_SWITCH("amd_comgr.dll", "libamd_comgr.so.2")); + WINDOWS_SWITCH(comgr_major_dll.c_str(), "libamd_comgr.so.2")); cep_.handle = Os::loadLibrary(comgr_lib_name); } if (nullptr == cep_.handle) { From dc16383865c19c7d2c580c3e282307d860a6a3e9 Mon Sep 17 00:00:00 2001 From: Satyanvesh Dittakavi Date: Mon, 22 Jan 2024 07:27:58 +0000 Subject: [PATCH 24/27] SWDEV-440029 - Fix hipMemcpyFrom/ToSymbol directions allowed Change-Id: Ib0b3d737cd5d297ac2d3c1ba549902b96e29c18c --- hipamd/src/hip_memory.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/hipamd/src/hip_memory.cpp b/hipamd/src/hip_memory.cpp index 418ee0884..8dc0e176b 100644 --- a/hipamd/src/hip_memory.cpp +++ b/hipamd/src/hip_memory.cpp @@ -1311,8 +1311,8 @@ hipError_t hipMemcpyToSymbol_common(const void* symbol, const void* src, size_t size_t offset, hipMemcpyKind kind, hipStream_t stream=nullptr) { CHECK_STREAM_CAPTURING(); - if (kind != hipMemcpyHostToDevice && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyHostToDevice && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { HIP_RETURN(hipErrorInvalidMemcpyDirection); } @@ -1345,8 +1345,8 @@ hipError_t hipMemcpyFromSymbol_common(void* dst, const void* symbol, size_t size size_t offset, hipMemcpyKind kind, hipStream_t stream=nullptr) { CHECK_STREAM_CAPTURING(); - if (kind != hipMemcpyDeviceToHost && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyDeviceToHost && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { HIP_RETURN(hipErrorInvalidMemcpyDirection); } @@ -1379,8 +1379,8 @@ hipError_t hipMemcpyToSymbolAsync_common(const void* symbol, const void* src, si size_t offset, hipMemcpyKind kind, hipStream_t stream) { STREAM_CAPTURE(hipMemcpyToSymbolAsync, stream, symbol, src, sizeBytes, offset, kind); - if (kind != hipMemcpyHostToDevice && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyHostToDevice && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { return hipErrorInvalidMemcpyDirection; } @@ -1412,8 +1412,8 @@ hipError_t hipMemcpyFromSymbolAsync_common(void* dst, const void* symbol, size_t size_t offset, hipMemcpyKind kind, hipStream_t stream) { STREAM_CAPTURE(hipMemcpyFromSymbolAsync, stream, dst, symbol, sizeBytes, offset, kind); - if (kind != hipMemcpyDeviceToHost && (kind != hipMemcpyDeviceToDevice || - kind != hipMemcpyDeviceToDeviceNoCU)) { + if (kind != hipMemcpyDeviceToHost && kind != hipMemcpyDeviceToDevice && + kind != hipMemcpyDeviceToDeviceNoCU) { return hipErrorInvalidMemcpyDirection; } From 84ff2dace7eda46c97832f4f9b928d52a6585a15 Mon Sep 17 00:00:00 2001 From: cadolphe Date: Wed, 4 Oct 2023 17:07:08 -0400 Subject: [PATCH 25/27] SWDEV-368741 - Check for uninstalled hsa-amd-aqlprofile to avoid Segmentation Fault Change-Id: I0c0c7a2ea7c35237332ddd7c2461e4d9ff8e2f1c --- opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp | 4 +++- rocclr/device/rocm/rocvirtual.cpp | 5 +++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp b/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp index 0e7de54e0..84ac050c7 100644 --- a/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp +++ b/opencl/tests/ocltst/module/runtime/OCLPerfCounters.cpp @@ -760,8 +760,10 @@ void OCLPerfCounters::run(void) { _wrapper->clSetDeviceClockModeAMD(global_device, setClockModeInput, &setClockModeOutput); - _wrapper->clGetPerfCounterInfoAMD(perfCounter, CL_PERFCOUNTER_DATA, + error_ = _wrapper->clGetPerfCounterInfoAMD(perfCounter, CL_PERFCOUNTER_DATA, sizeof(cl_ulong), &result, NULL); + CHECK_RESULT(error_ != CL_SUCCESS, + "clGetPerfCounterInfoAMD failed (Hint (Linux): install hsa-amd-aqlprofile)\n"); err = _wrapper->clReleasePerfCounterAMD(perfCounter); CHECK_RESULT(err != CL_SUCCESS, "Release PerfCounter failed\n"); diff --git a/rocclr/device/rocm/rocvirtual.cpp b/rocclr/device/rocm/rocvirtual.cpp index 74bc741db..42eb546ea 100644 --- a/rocclr/device/rocm/rocvirtual.cpp +++ b/rocclr/device/rocm/rocvirtual.cpp @@ -3636,6 +3636,11 @@ void VirtualGPU::submitPerfCounter(amd::PerfCounterCommand& vcmd) { // one to get the profile object amd::PerfCounter* amdCounter = static_cast(counters[0]); PerfCounter* counter = static_cast(amdCounter->getDeviceCounter()); + if (counter == nullptr) { + LogError("Invalid Performance Counter"); + vcmd.setStatus(CL_INVALID_OPERATION); + return; + } PerfCounterProfile* profileRef = counter->profileRef(); // create the AQL packet for stop profiling From 8ff39a54fc790454b95b325eb2d9cdfa06ba7968 Mon Sep 17 00:00:00 2001 From: Sameer Sahasrabuddhe Date: Wed, 24 Jan 2024 13:16:50 +0530 Subject: [PATCH 26/27] SWDEV-295298: hide warp sync builtins with a macro in ROCm 6.1 Change-Id: Ie2efd233c0bcf8ad0e06223ee854fe4bd1060443 --- hipamd/include/hip/amd_detail/amd_warp_functions.h | 4 +++- .../include/hip/amd_detail/amd_warp_sync_functions.h | 11 +++++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/hipamd/include/hip/amd_detail/amd_warp_functions.h b/hipamd/include/hip/amd_detail/amd_warp_functions.h index 64c274013..98f8896cd 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_functions.h @@ -103,14 +103,16 @@ unsigned long long int __ballot64(int predicate) { return __builtin_amdgcn_uicmp(predicate, 0, ICMP_NE); } +// See amd_warp_sync_functions.h for an explanation of this preprocessor flag. +#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS // Since threads in a wave do not make independent progress, __activemask() // always returns the exact active mask, i.e, all active threads in the wave. - __device__ inline unsigned long long __activemask() { return __ballot(true); } +#endif // HIP_ENABLE_WARP_SYNC_BUILTINS __device__ static inline unsigned int __lane_id() { return __builtin_amdgcn_mbcnt_hi( diff --git a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h index 5ce2581a8..b8c67a897 100644 --- a/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h +++ b/hipamd/include/hip/amd_detail/amd_warp_sync_functions.h @@ -22,6 +22,15 @@ THE SOFTWARE. #pragma once +// Warp sync builtins (with explicit mask argument) introduced in ROCm 6.1 as a +// preview to allow end-users to adapt to the new interface involving 64-bit +// masks. These are disabled by default, and can be enabled by setting the macro +// below. The builtins will be enabled unconditionally in ROCm 6.2. +// +// This arrangement also applies to the __activemask() builtin defined in +// amd_warp_functions.h. +#ifdef HIP_ENABLE_WARP_SYNC_BUILTINS + #if !defined(__HIPCC_RTC__) #include "amd_warp_functions.h" #include "hip_assert.h" @@ -259,3 +268,5 @@ T __shfl_xor_sync(MaskT mask, T var, int laneMask, #undef __hip_do_sync #undef __hip_check_mask + +#endif // HIP_ENABLE_WARP_SYNC_BUILTINS From bb236033dbcc17cc29568ff963353b33c7321623 Mon Sep 17 00:00:00 2001 From: ipanfilo Date: Fri, 9 Feb 2024 10:56:18 -0600 Subject: [PATCH 27/27] SWDEV-438299 - Remove unused variables --- hipamd/src/hip_fatbin.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/hipamd/src/hip_fatbin.cpp b/hipamd/src/hip_fatbin.cpp index 5d0d00ac9..ac8775597 100644 --- a/hipamd/src/hip_fatbin.cpp +++ b/hipamd/src/hip_fatbin.cpp @@ -1,5 +1,5 @@ /* -Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. +Copyright (c) 2023 - 2024 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal @@ -118,7 +118,6 @@ hipError_t FatBinaryInfo::ExtractFatBinaryUsingCOMGR(const std::vector