From d36463103daed09a40cdea235041a6ab79ff280c Mon Sep 17 00:00:00 2001 From: Ben Wibking Date: Tue, 7 Nov 2023 00:19:10 -0500 Subject: [PATCH] replace AMREX_DEVICE_COMPILE with AMREX_IF_ON_DEVICE and AMREX_IF_ON_HOST (#3591) ## Summary This adds the macros `AMREX_IF_ON_DEVICE((code_for_device))` and `AMREX_IF_ON_HOST((code_for_host))` that are compatible with single-pass host/device compilation (as used by `nvc++ -cuda`), as well as backward compatible with all other compilers. This also replaces all uses of `AMREX_DEVICE_COMPILE` with these macros. Fixes https://github.com/AMReX-Codes/amrex/issues/3586. ## Additional background Single-pass compilation evalutes the preprocessor macros once for each source file. This means that preprocessor conditionals cannot be used to choose between host and device code. In particular, NVHPC with `-cuda` does not support `__CUDA_ARCH__`, instead requiring the use of the `if target` construct. This creates portable macros that work for either single-pass or two-pass compilation, but requires restructuring of any code that uses AMREX_DEVICE_COMPILE so that the code appears as a macro argument. This PR will allow using NVHPC with `-cuda` as the unified host/device compiler for AMReX. In the future, single-pass compilers for other backends may be available, e.g., SYCL (https://dl.acm.org/doi/abs/10.1145/3585341.3585351). AMReX can be configured to build with `nvc++ -cuda` using CMake: ``` cmake .. -DAMReX_GPU_BACKEND=CUDA -DCMAKE_C_COMPILER=nvc -DCMAKE_CXX_COMPILER=nvc++ -DCMAKE_CUDA_COMPILER=nvc++ -DCMAKE_CUDA_COMPILER_ID=NVCXX -DCMAKE_CUDA_ARCHITECTURES=80 -DCMAKE_CUDA_COMPILER_FORCED=ON -DCMAKE_CUDA_COMPILE_FEATURES=cuda_std_17 -DAMReX_GPU_RDC=OFF -DCMAKE_CXX_FLAGS="-cuda --gcc-toolchain=$(which gcc)" -DCMAKE_CUDA_FLAGS="-cuda --gcc-toolchain=$(which gcc)" -DAMReX_ENABLE_TESTS=ON -DCMAKE_CUDA_HOST_LINK_LAUNCHER=nvc++ -DCMAKE_CUDA_LINK_EXECUTABLE=" -o " ``` CMake hacks (https://github.com/NVIDIA/cub/blob/0fc3c3701632a4be906765b73be20a9ad0da603d/cmake/CubCompilerHacks.cmake) are tested with CMake 3.22.1 and NVHPC 23.5, 23.7, and 23.9 (earlier versions do not work). However, it currently fails to link the executables for the tests due to a [compiler/linker bug](https://forums.developer.nvidia.com/t/nvc-cuda-fails-to-link-code-when-using-device-curand-functions/270401/5). (Note that by default, `nvcc` preserves denormals, whereas `nvc++` does not. Also, `nvc++` generates relocatable device code by default, whereas `nvcc` does not.) ## Checklist The proposed changes: - [ ] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate --------- Co-authored-by: Weiqun Zhang --- Docs/sphinx_documentation/source/GPU.rst | 9 +- Src/Base/AMReX.H | 60 +++---- Src/Base/AMReX_Algorithm.H | 178 +++++++++++--------- Src/Base/AMReX_Array4.H | 46 ++--- Src/Base/AMReX_FabArray.H | 7 +- Src/Base/AMReX_GpuAtomic.H | 204 ++++++++++++----------- Src/Base/AMReX_GpuLaunch.H | 29 ++-- Src/Base/AMReX_GpuQualifiers.H | 29 ++++ Src/Base/AMReX_GpuRange.H | 52 +++--- Src/Base/AMReX_GpuUtility.H | 10 +- Src/Base/AMReX_Math.H | 76 +++++---- Src/Base/AMReX_Random.H | 109 ++++++------ Src/Base/AMReX_TableData.H | 100 +++++------ Src/Base/Parser/AMReX_IParser.H | 28 +--- Src/Base/Parser/AMReX_Parser.H | 35 ++-- Src/Base/Parser/AMReX_Parser_Y.H | 4 +- Src/EB/AMReX_EB2_GeometryShop.H | 13 +- 17 files changed, 518 insertions(+), 471 deletions(-) diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst index 90dbcc9f26..aff060e916 100644 --- a/Docs/sphinx_documentation/source/GPU.rst +++ b/Docs/sphinx_documentation/source/GPU.rst @@ -489,11 +489,10 @@ GPU support. When AMReX is compiled with ``USE_OMP_OFFLOAD=TRUE``, ``AMREX_USE_OMP_OFFLOAD`` is defined. -In addition to AMReX's preprocessor macros, CUDA provides the -``__CUDA_ARCH__`` macro which is only defined when in device code. -HIP and Sycl provide similar macros. -``AMREX_DEVICE_COMPILE`` should be used when a ``__host__ __device__`` -function requires separate code for the CPU and GPU implementations. +The macros ``AMREX_IF_ON_DEVICE((code_for_device))`` and +``AMREX_IF_ON_HOST((code_for_host))`` should be used when a +``__host__ __device__`` function requires separate code for the +CPU and GPU implementations. .. =================================================================== diff --git a/Src/Base/AMReX.H b/Src/Base/AMReX.H index c539a1d8e7..2b88553bcd 100644 --- a/Src/Base/AMReX.H +++ b/Src/Base/AMReX.H @@ -113,16 +113,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Error (const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Error", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Error %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Error", msg);)) } //! Print out warning message to cerr. @@ -132,15 +131,12 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Warning (const char * msg) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); -#else - if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); } -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - Warning_host(msg); + AMREX_IF_ON_DEVICE((if (msg) { AMREX_DEVICE_PRINTF("Warning %s\n", msg); })) #endif + AMREX_IF_ON_HOST((Warning_host(msg);)) } //! Print out message to cerr and exit via abort(). @@ -148,16 +144,15 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Abort (const char * msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(msg); + AMREX_IF_ON_DEVICE((amrex::ignore_unused(msg);)) #else - if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } - AMREX_DEVICE_ASSERT(0); -#endif -#else - Error_host("Abort", msg); + AMREX_IF_ON_DEVICE(( + if (msg) { AMREX_DEVICE_PRINTF("Abort %s\n", msg); } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Error_host("Abort", msg);)) } /** @@ -170,22 +165,21 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void Assert (const char* EX, const char* file, int line, const char* msg = nullptr) { -#if AMREX_DEVICE_COMPILE #if defined(NDEBUG) - amrex::ignore_unused(EX,file,line,msg); -#else - if (msg) { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", - EX, file, line, msg); - } else { - AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", - EX, file, line); - } - AMREX_DEVICE_ASSERT(0); -#endif + AMREX_IF_ON_DEVICE((amrex::ignore_unused(EX,file,line,msg);)) #else - Assert_host(EX,file,line,msg); + AMREX_IF_ON_DEVICE(( + if (msg) { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d, Msg: %s", + EX, file, line, msg); + } else { + AMREX_DEVICE_PRINTF("Assertion `%s' failed, file \"%s\", line %d", + EX, file, line); + } + AMREX_DEVICE_ASSERT(0); + )) #endif + AMREX_IF_ON_HOST((Assert_host(EX,file,line,msg);)) } /** diff --git a/Src/Base/AMReX_Algorithm.H b/Src/Base/AMReX_Algorithm.H index b418f3cc1c..31889df442 100644 --- a/Src/Base/AMReX_Algorithm.H +++ b/Src/Base/AMReX_Algorithm.H @@ -161,51 +161,52 @@ namespace amrex AMREX_GPU_HOST_DEVICE ItType upper_bound (ItType first, ItType last, const ValType& val) { -#if AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0){ - auto it = first; - const auto step = count/2; - it += step; - if (!(val < *it)){ - first = ++it; - count -= step + 1; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0){ + auto it = first; + const auto step = count/2; + it += step; + if (!(val < *it)){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - else{ - count = step; - } - } - - return first; -#else - return std::upper_bound(first, last, val); -#endif + return first; + )) + AMREX_IF_ON_HOST(( + return std::upper_bound(first, last, val); + )) } template AMREX_GPU_HOST_DEVICE ItType lower_bound (ItType first, ItType last, const ValType& val) { -#ifdef AMREX_DEVICE_COMPILE - std::ptrdiff_t count = last-first; - while(count>0) - { - auto it = first; - const auto step = count/2; - it += step; - if (*it < val){ - first = ++it; - count -= step + 1; - } - else{ - count = step; + AMREX_IF_ON_DEVICE(( + std::ptrdiff_t count = last-first; + while(count>0) + { + auto it = first; + const auto step = count/2; + it += step; + if (*it < val){ + first = ++it; + count -= step + 1; + } + else{ + count = step; + } } - } - return first; -#else - return std::lower_bound(first, last, val); -#endif + return first; + )) + AMREX_IF_ON_HOST(( + return std::lower_bound(first, last, val); + )) } namespace detail { @@ -239,83 +240,100 @@ int builtin_clz_wrapper (clzll_tag, T x) noexcept return static_cast(__builtin_clzll(x) - (sizeof(unsigned long long) * CHAR_BIT - sizeof(T) * CHAR_BIT)); } -#ifdef AMREX_USE_CUDA - -// likewise with CUDA, there are __clz functions that take (signed) int and long long int -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clz_tag, T x) noexcept -{ - return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); -} - -template ::type> -AMREX_GPU_DEVICE AMREX_FORCE_INLINE -int clz_wrapper (clzll_tag, T x) noexcept -{ - return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); } -#endif -} +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> = 0> +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept; AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint8_t x) noexcept +int clz_generic (std::uint8_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else static constexpr int clz_lookup[16] = { 4, 3, 2, 2, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0 }; auto upper = x >> 4; auto lower = x & 0xF; return upper ? clz_lookup[upper] : 4 + clz_lookup[lower]; -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint16_t x) noexcept +int clz_generic (std::uint16_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint8_t(x >> 8); auto lower = std::uint8_t(x & 0xFF); return upper ? clz(upper) : 8 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint32_t x) noexcept +int clz_generic (std::uint32_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint16_t(x >> 16); auto lower = std::uint16_t(x & 0xFFFF); return upper ? clz(upper) : 16 + clz(lower); -#endif } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE -int clz (std::uint64_t x) noexcept +int clz_generic (std::uint64_t x) noexcept { -#if (AMREX_DEVICE_COMPILE && defined(AMREX_USE_CUDA)) // all supported cuda versions have __clz - return detail::clz_wrapper(detail::clz_tag{}, x); -#elif (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) - return detail::builtin_clz_wrapper(detail::clz_tag{}, x); -#else auto upper = std::uint32_t(x >> 32); auto lower = std::uint32_t(x & 0xFFFFFFFF); return upper ? clz(upper) : 32 + clz(lower); +} + +#if defined AMREX_USE_CUDA + +namespace detail { + // likewise with CUDA, there are __clz functions that take (signed) int and long long int + template ::type> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clz_tag, T x) noexcept + { + return __clz((int) x) - (sizeof(int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } + + template ::type> + AMREX_GPU_DEVICE AMREX_FORCE_INLINE + int clz_wrapper (clzll_tag, T x) noexcept + { + return __clzll((long long int) x) - (sizeof(long long int) * CHAR_BIT - sizeof(T) * CHAR_BIT); + } +} + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ + AMREX_IF_ON_DEVICE((return detail::clz_wrapper(detail::clz_tag{}, x);)) +#if AMREX_HAS_BUILTIN_CLZ + AMREX_IF_ON_HOST((return detail::builtin_clz_wrapper(detail::clz_tag{}, x);)) +#else + AMREX_IF_ON_HOST((return clz_generic(x);)) #endif } +#else // !defined AMREX_USE_CUDA + +template ,std::uint8_t> || + std::is_same_v,std::uint16_t> || + std::is_same_v,std::uint32_t> || + std::is_same_v,std::uint64_t>, int> > +AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE +int clz (T x) noexcept +{ +#if (!AMREX_DEVICE_COMPILE && AMREX_HAS_BUILTIN_CLZ) + return detail::builtin_clz_wrapper(detail::clz_tag{}, x); +#else + return clz_generic(x); +#endif +} + +#endif // defined AMREX_USE_CUDA + } #endif diff --git a/Src/Base/AMReX_Array4.H b/Src/Base/AMReX_Array4.H index b2ff0fcb54..8d7a4a44f3 100644 --- a/Src/Base/AMReX_Array4.H +++ b/Src/Base/AMReX_Array4.H @@ -42,13 +42,14 @@ namespace amrex { U& operator[] (int n) const noexcept { #if defined(AMREX_DEBUG) || defined(AMREX_BOUND_CHECK) if (n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); -#else - std::stringstream ss; - ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" %d is out of bound (0:%d)", n, ncomp-1); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " " << n << " is out of bound: (0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } #endif return p[n*stride]; @@ -233,21 +234,22 @@ namespace amrex { { if (i=end.x || j=end.y || k=end.z || n < 0 || n >= ncomp) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", - i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, - begin.z, end.z-1, ncomp-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n - << ") is out of bound (" - << begin.x << ":" << end.x-1 << "," - << begin.y << ":" << end.y-1 << "," - << begin.z << ":" << end.z-1 << "," - << "0:" << ncomp-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,0:%d)\n", + i, j, k, n, begin.x, end.x-1, begin.y, end.y-1, + begin.z, end.z-1, ncomp-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n + << ") is out of bound (" + << begin.x << ":" << end.x-1 << "," + << begin.y << ":" << end.y-1 << "," + << begin.z << ":" << end.z-1 << "," + << "0:" << ncomp-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif diff --git a/Src/Base/AMReX_FabArray.H b/Src/Base/AMReX_FabArray.H index a8839a4bcc..e507dab153 100644 --- a/Src/Base/AMReX_FabArray.H +++ b/Src/Base/AMReX_FabArray.H @@ -198,11 +198,8 @@ struct MultiArray4 { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Array4 const& operator[] (int li) const noexcept { -#if AMREX_DEVICE_COMPILE - return dp[li]; -#else - return hp[li]; -#endif + AMREX_IF_ON_DEVICE((return dp[li];)) + AMREX_IF_ON_HOST((return hp[li];)) } #ifdef AMREX_USE_GPU diff --git a/Src/Base/AMReX_GpuAtomic.H b/Src/Base/AMReX_GpuAtomic.H index 55fc351156..deea6ae932 100644 --- a/Src/Base/AMReX_GpuAtomic.H +++ b/Src/Base/AMReX_GpuAtomic.H @@ -132,17 +132,17 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Add_device (T* const sum, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAdd(sum, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; sycl::atomic_ref a{*sum}; return a.fetch_add(value); #else - amrex::ignore_unused(sum, value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicAdd(sum, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(sum, value); + return T(); // should never get here, but have to return something + )) #endif } @@ -175,7 +175,7 @@ namespace detail { #endif -#if defined(AMREX_USE_CUDA) && (__CUDA_ARCH__ < 600) +#if defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 600) AMREX_GPU_DEVICE AMREX_FORCE_INLINE double Add_device (double* const sum, double const value) noexcept @@ -195,17 +195,16 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Add (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE #ifdef AMREX_USE_SYCL - return Add_device(sum, value); -#else - return Add_device(sum, value); -#endif + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #else - auto old = *sum; - *sum += value; - return old; + AMREX_IF_ON_DEVICE((return Add_device(sum, value);)) #endif + AMREX_IF_ON_HOST(( + auto old = *sum; + *sum += value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -252,18 +251,19 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE bool If (T* const add, T const value, Op&& op, Cond&& cond) noexcept { -#if AMREX_DEVICE_COMPILE - return If_device(add, value, std::forward(op), std::forward(cond)); -#else - T old = *add; - T const tmp = op(old, value); - if (cond(tmp)) { - *add = tmp; - return true; - } else { - return false; - } -#endif + AMREX_IF_ON_DEVICE(( + return If_device(add, value, std::forward(op), std::forward(cond)); + )) + AMREX_IF_ON_HOST(( + T old = *add; + T const tmp = op(old, value); + if (cond(tmp)) { + *add = tmp; + return true; + } else { + return false; + } + )) } //////////////////////////////////////////////////////////////////////// @@ -278,14 +278,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (T* sum, T value) noexcept { -#if AMREX_DEVICE_COMPILE -#ifdef AMREX_USE_SYCL +#if defined(__SYCL_DEVICE_ONLY__) Add_device(sum, value); #else - Add_device(sum, value); -#endif -#else - *sum += value; + AMREX_IF_ON_DEVICE((Add_device(sum, value);)) + AMREX_IF_ON_HOST((*sum += value;)) #endif } @@ -293,14 +290,11 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE void AddNoRet (float* const sum, float const value) noexcept { -#if AMREX_DEVICE_COMPILE #pragma clang diagnostic push #pragma clang diagnostic ignored "-Wdeprecated-declarations" - atomicAddNoRet(sum, value); + AMREX_IF_ON_DEVICE((atomicAddNoRet(sum, value);)) #pragma clang diagnostic pop -#else - *sum += value; -#endif + AMREX_IF_ON_HOST((*sum += value;)) } #endif @@ -314,18 +308,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Min_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMin(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_min(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMin(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -357,13 +351,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Min (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Min_device(m, value); -#else - auto const old = *m; - *m = (*m) < value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Min_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) < value ? (*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -376,18 +371,18 @@ namespace detail { AMREX_GPU_DEVICE AMREX_FORCE_INLINE T Max_device (T* const m, T const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicMax(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_max(value); #else - amrex::ignore_unused(m,value); - return T(); // should never get here, but have to return something + AMREX_IF_ON_DEVICE(( return atomicMax(m, value); )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(m,value); + return T(); // should never get here, but have to return something + )) #endif } @@ -419,13 +414,14 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Max (T* const m, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - return Max_device(m, value); -#else - auto const old = *m; - *m = (*m) > value ? (*m) : value; - return old; -#endif + AMREX_IF_ON_DEVICE(( + return Max_device(m, value); + )) + AMREX_IF_ON_HOST(( + auto const old = *m; + *m = (*m) > value ? (*m) : value; + return old; + )) } //////////////////////////////////////////////////////////////////////// @@ -435,19 +431,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalOr (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicOr(m, value); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_or(value); #else - int const old = *m; - *m = (*m) || value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicOr(m, value); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) || value; + return old; + )) #endif } @@ -458,19 +456,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int LogicalAnd (int* const m, int const value) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicAnd(m, value ? ~0x0 : 0); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*m}; return a.fetch_and(value ? ~0x0 : 0); #else - int const old = *m; - *m = (*m) && value; - return old; + AMREX_IF_ON_DEVICE(( + return atomicAnd(m, value ? ~0x0 : 0); + )) + AMREX_IF_ON_HOST(( + int const old = *m; + *m = (*m) && value; + return old; + )) #endif } @@ -482,19 +482,21 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T Exch (T* address, T val) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicExch(address, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; sycl::atomic_ref a{*address}; return a.exchange(val); #else - auto const old = *address; - *address = val; - return old; + AMREX_IF_ON_DEVICE(( + return atomicExch(address, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = val; + return old; + )) #endif } @@ -506,10 +508,7 @@ namespace detail { AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T CAS (T* const address, T compare, T const val) noexcept { // cannot be T const compare because of compare_exchange_strong -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return atomicCAS(address, compare, val); -#elif defined(__SYCL_DEVICE_ONLY__) +#if defined(__SYCL_DEVICE_ONLY__) constexpr auto mo = sycl::memory_order::relaxed; constexpr auto ms = sycl::memory_scope::device; constexpr auto as = sycl::access::address_space::global_space; @@ -517,9 +516,14 @@ namespace detail { a.compare_exchange_strong(compare, val); return compare; #else - auto const old = *address; - *address = (old == compare ? val : old); - return old; + AMREX_IF_ON_DEVICE(( + return atomicCAS(address, compare, val); + )) + AMREX_IF_ON_HOST(( + auto const old = *address; + *address = (old == compare ? val : old); + return old; + )) #endif } } @@ -527,17 +531,21 @@ namespace detail { namespace HostDevice::Atomic { template - AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE - void Add (T* const sum, T const value) noexcept + AMREX_FORCE_INLINE + void Add_Host (T* const sum, T const value) noexcept { -#if AMREX_DEVICE_COMPILE - Gpu::Atomic::AddNoRet(sum,value); -#else #ifdef AMREX_USE_OMP #pragma omp atomic update #endif *sum += value; -#endif + } + + template + AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE + void Add (T* const sum, T const value) noexcept + { + AMREX_IF_ON_DEVICE((Gpu::Atomic::AddNoRet(sum,value);)) + AMREX_IF_ON_HOST((Add_Host(sum,value);)) } } diff --git a/Src/Base/AMReX_GpuLaunch.H b/Src/Base/AMReX_GpuLaunch.H index c1870d2ef5..c4ba7dd86b 100644 --- a/Src/Base/AMReX_GpuLaunch.H +++ b/Src/Base/AMReX_GpuLaunch.H @@ -103,20 +103,21 @@ namespace Gpu { inline Box getThreadBox (const Box& bx, Long offset) noexcept { -#if AMREX_DEVICE_COMPILE - const auto len = bx.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += bx.smallEnd(); - return (bx & Box(iv,iv,bx.type())); -#else - amrex::ignore_unused(offset); - return bx; -#endif + AMREX_IF_ON_DEVICE(( + const auto len = bx.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += bx.smallEnd(); + return (bx & Box(iv,iv,bx.type())); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return bx; + )) } // ************************************************ diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H index 1c0b573176..4fba23a849 100644 --- a/Src/Base/AMReX_GpuQualifiers.H +++ b/Src/Base/AMReX_GpuQualifiers.H @@ -8,6 +8,12 @@ #include #endif +#if defined(AMREX_USE_CUDA) && (defined(AMREX_CXX_PGI) || defined(AMREX_CXX_NVHPC)) +#include +#define AMREX_IF_ON_DEVICE(CODE) NV_IF_TARGET(NV_IS_DEVICE, CODE) +#define AMREX_IF_ON_HOST(CODE) NV_IF_TARGET(NV_IS_HOST, CODE) +#endif + #define AMREX_GPU_HOST __host__ #define AMREX_GPU_DEVICE __device__ #define AMREX_GPU_GLOBAL __global__ @@ -31,6 +37,29 @@ #define AMREX_DEVICE_COMPILE (__CUDA_ARCH__ || __HIP_DEVICE_COMPILE__ || __SYCL_DEVICE_ONLY__) +// Remove surrounding parentheses if present +#define AMREX_IMPL_STRIP_PARENS(X) AMREX_IMPL_ESC(AMREX_IMPL_ISH X) +#define AMREX_IMPL_ISH(...) AMREX_IMPL_ISH __VA_ARGS__ +#define AMREX_IMPL_ESC(...) AMREX_IMPL_ESC_(__VA_ARGS__) +#define AMREX_IMPL_ESC_(...) AMREX_IMPL_VAN_##__VA_ARGS__ +#define AMREX_IMPL_VAN_AMREX_IMPL_ISH + +#if !defined(AMREX_IF_ON_DEVICE) && !defined(AMREX_IF_ON_HOST) +#if (defined(AMREX_USE_CUDA) && defined(__CUDA_ARCH__)) || \ + (defined(AMREX_USE_HIP) && defined(__HIP_DEVICE_COMPILE__)) || \ + (defined(AMREX_USE_SYCL) && defined(__SYCL_DEVICE_ONLY__)) +#define AMREX_IF_ON_DEVICE(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#define AMREX_IF_ON_HOST(CODE) \ + {} +#else +#define AMREX_IF_ON_DEVICE(CODE) \ + {} +#define AMREX_IF_ON_HOST(CODE) \ + { AMREX_IMPL_STRIP_PARENS(CODE) } +#endif +#endif + #ifdef AMREX_USE_SYCL # include #endif diff --git a/Src/Base/AMReX_GpuRange.H b/Src/Base/AMReX_GpuRange.H index b8d2ab89d0..be5071dbf8 100644 --- a/Src/Base/AMReX_GpuRange.H +++ b/Src/Base/AMReX_GpuRange.H @@ -32,31 +32,31 @@ Long at (T const& /*b*/, Long offset) noexcept { return offset; } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Long size (Box const& b) noexcept { -#if AMREX_DEVICE_COMPILE - return b.numPts(); -#else - amrex::ignore_unused(b); - return 1; -#endif + AMREX_IF_ON_DEVICE((return b.numPts();)) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(b); + return 1; + )) } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Box at (Box const& b, Long offset) noexcept { -#if AMREX_DEVICE_COMPILE - auto len = b.length3d(); - Long k = offset / (len[0]*len[1]); - Long j = (offset - k*(len[0]*len[1])) / len[0]; - Long i = (offset - k*(len[0]*len[1])) - j*len[0]; - IntVect iv{AMREX_D_DECL(static_cast(i), - static_cast(j), - static_cast(k))}; - iv += b.smallEnd(); - return Box(iv,iv,b.type()); -#else - amrex::ignore_unused(offset); - return b; -#endif + AMREX_IF_ON_DEVICE(( + auto len = b.length3d(); + Long k = offset / (len[0]*len[1]); + Long j = (offset - k*(len[0]*len[1])) / len[0]; + Long i = (offset - k*(len[0]*len[1])) - j*len[0]; + IntVect iv{AMREX_D_DECL(static_cast(i), + static_cast(j), + static_cast(k))}; + iv += b.smallEnd(); + return Box(iv,iv,b.type()); + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(offset); + return b; + )) } template @@ -92,13 +92,15 @@ struct range_impl [[nodiscard]] AMREX_GPU_HOST_DEVICE iterator begin () const noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); -#elif defined (__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) return iterator(m_b, m_gid, m_grange); #else - return iterator(m_b,0,1); + AMREX_IF_ON_DEVICE(( + return iterator(m_b, blockDim.x*blockIdx.x+threadIdx.x, blockDim.x*gridDim.x); + )) + AMREX_IF_ON_HOST(( + return iterator(m_b,0,1); + )) #endif } diff --git a/Src/Base/AMReX_GpuUtility.H b/Src/Base/AMReX_GpuUtility.H index ce98556fc5..4adc111f5e 100644 --- a/Src/Base/AMReX_GpuUtility.H +++ b/Src/Base/AMReX_GpuUtility.H @@ -26,8 +26,9 @@ namespace Gpu { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k));)) + AMREX_IF_ON_HOST((return a(i,j,k);)) #else return a(i,j,k); #endif @@ -36,8 +37,9 @@ namespace Gpu { template AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE T LDG (Array4 const& a, int i, int j, int k, int n) noexcept { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return __ldg(a.ptr(i,j,k,n)); +#if defined(AMREX_USE_CUDA) + AMREX_IF_ON_DEVICE((return __ldg(a.ptr(i,j,k,n));)) + AMREX_IF_ON_HOST((return a(i,j,k,n);)) #else return a(i,j,k,n); #endif diff --git a/Src/Base/AMReX_Math.H b/Src/Base/AMReX_Math.H index 769b9bf50f..506289d03d 100644 --- a/Src/Base/AMReX_Math.H +++ b/Src/Base/AMReX_Math.H @@ -68,11 +68,9 @@ double cospi (double x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospi(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospi(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -82,11 +80,9 @@ float cospi (float x) { #if defined(AMREX_USE_SYCL) return sycl::cospi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::cospif(x); #else - return std::cos(pi()*x); + AMREX_IF_ON_DEVICE(( return ::cospif(x); )) + AMREX_IF_ON_HOST(( return std::cos(pi()*x); )) #endif } @@ -96,11 +92,9 @@ double sinpi (double x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpi(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpi(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); )) #endif } @@ -110,14 +104,32 @@ float sinpi (float x) { #if defined(AMREX_USE_SYCL) return sycl::sinpi(x); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return ::sinpif(x); #else - return std::sin(pi()*x); + AMREX_IF_ON_DEVICE(( return ::sinpif(x); )) + AMREX_IF_ON_HOST(( return std::sin(pi()*x); )) #endif } +namespace detail { + AMREX_FORCE_INLINE void sincos (double x, double* sinx, double* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincos(x, sinx, cosx); +#else + *sinx = std::sin(x); + *cosx = std::cos(x); +#endif + } + + AMREX_FORCE_INLINE void sincosf (float x, float* sinx, float* cosx) { +#if defined(_GNU_SOURCE) && !defined(__APPLE__) + ::sincosf(x, sinx, cosx); +#else + *sinx = std::sin(x); + *cosx = std::cos(x); +#endif + } +} + //! Return sine and cosine of given number AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincos (double x) @@ -125,13 +137,9 @@ std::pair sincos (double x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincos(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincos(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincos(x, &r.first, &r.second); )) #endif return r; } @@ -143,13 +151,9 @@ std::pair sincos (float x) std::pair r; #if defined(AMREX_USE_SYCL) r.first = sycl::sincos(x, sycl::private_ptr(&r.second)); -#elif defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) || \ - (defined(_GNU_SOURCE) && !defined(__APPLE__)) - ::sincosf(x, &r.first, &r.second); #else - r.first = std::sin(x); - r.second = std::cos(x); + AMREX_IF_ON_DEVICE(( ::sincosf(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( detail::sincosf(x, &r.first, &r.second); )) #endif return r; } @@ -159,11 +163,11 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (double x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospi(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospi(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } @@ -173,11 +177,11 @@ AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE std::pair sincospi (float x) { std::pair r; -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) || \ - defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - ::sincospif(x, &r.first, &r.second); -#else +#if defined(AMREX_USE_SYCL) r = sincos(pi()*x); +#else + AMREX_IF_ON_DEVICE(( ::sincospif(x, &r.first, &r.second); )) + AMREX_IF_ON_HOST(( r = sincos(pi()*x); )) #endif return r; } diff --git a/Src/Base/AMReX_Random.H b/Src/Base/AMReX_Random.H index 675c12082d..50b2c2693b 100644 --- a/Src/Base/AMReX_Random.H +++ b/Src/Base/AMReX_Random.H @@ -23,24 +23,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real Random (RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return 1.0f - curand_uniform(random_engine.rand_state); +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::uniform distr; + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return 1.0 - curand_uniform_double(random_engine.rand_state); -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return 1.0f - hiprand_uniform(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0f - hiprand_uniform(random_engine.rand_state); , + return 1.0f - curand_uniform(random_engine.rand_state); + ) + )) #else - return 1.0 - hiprand_uniform_double(random_engine.rand_state); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return 1.0 - hiprand_uniform_double(random_engine.rand_state); , + return 1.0 - curand_uniform_double(random_engine.rand_state); + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::uniform distr; - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - amrex::ignore_unused(random_engine); - return Random(); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random(); + )) #endif } @@ -56,24 +61,29 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE Real RandomNormal (Real mean, Real stddev, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) -#ifdef BL_USE_FLOAT - return stddev * curand_normal(random_engine.rand_state) + mean; +#if defined (__SYCL_DEVICE_ONLY__) + mkl::rng::device::gaussian distr(mean, stddev); + return mkl::rng::device::generate(distr, *random_engine.engine); #else - return stddev * curand_normal_double(random_engine.rand_state) + mean; -#endif -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) #ifdef BL_USE_FLOAT - return stddev * hiprand_normal(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal(random_engine.rand_state) + mean; , + return stddev * curand_normal(random_engine.rand_state) + mean; + ) + )) #else - return stddev * hiprand_normal_double(random_engine.rand_state) + mean; + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return stddev * hiprand_normal_double(random_engine.rand_state) + mean; , + return stddev * curand_normal_double(random_engine.rand_state) + mean; + ) + )) #endif -#elif defined (__SYCL_DEVICE_ONLY__) - mkl::rng::device::gaussian distr(mean, stddev); - return mkl::rng::device::generate(distr, *random_engine.engine); -#else - amrex::ignore_unused(random_engine); - return RandomNormal(mean, stddev); + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomNormal(mean, stddev); + )) #endif } @@ -91,16 +101,20 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int RandomPoisson (Real lambda, RandomEngine const& random_engine) { -#if defined(__CUDA_ARCH__) && defined(AMREX_USE_CUDA) - return curand_poisson(random_engine.rand_state, lambda); -#elif defined(__HIP_DEVICE_COMPILE__) && defined(AMREX_USE_HIP) - return hiprand_poisson(random_engine.rand_state, lambda); -#elif defined (__SYCL_DEVICE_ONLY__) +#if defined (__SYCL_DEVICE_ONLY__) mkl::rng::device::poisson distr(lambda); return mkl::rng::device::generate(distr, *random_engine.engine); #else - amrex::ignore_unused(random_engine); - return RandomPoisson(lambda); + AMREX_IF_ON_DEVICE(( + AMREX_HIP_OR_CUDA( + return hiprand_poisson(random_engine.rand_state, lambda); , + return curand_poisson(random_engine.rand_state, lambda); + ) + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return RandomPoisson(lambda); + )) #endif } @@ -116,22 +130,23 @@ namespace amrex AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE unsigned int Random_int (unsigned int n, RandomEngine const& random_engine) { -#if AMREX_DEVICE_COMPILE #if defined(__SYCL_DEVICE_ONLY__) mkl::rng::device::uniform distr(0,n); return mkl::rng::device::generate(distr, *random_engine.engine); #else - unsigned int rand; - constexpr unsigned int RAND_M = 4294967295; // 2**32-1 - do { - AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, - rand = curand(random_engine.rand_state) ); - } while (rand > (RAND_M - RAND_M % n)); - return rand % n; -#endif -#else - amrex::ignore_unused(random_engine); - return Random_int(n); + AMREX_IF_ON_DEVICE(( + unsigned int rand; + constexpr unsigned int RAND_M = 4294967295; // 2**32-1 + do { + AMREX_HIP_OR_CUDA( rand = hiprand(random_engine.rand_state);, + rand = curand(random_engine.rand_state) ); + } while (rand > (RAND_M - RAND_M % n)); + return rand % n; + )) + AMREX_IF_ON_HOST(( + amrex::ignore_unused(random_engine); + return Random_int(n); + )) #endif } diff --git a/Src/Base/AMReX_TableData.H b/Src/Base/AMReX_TableData.H index b7572e2a1c..842225e53f 100644 --- a/Src/Base/AMReX_TableData.H +++ b/Src/Base/AMReX_TableData.H @@ -57,16 +57,17 @@ struct Table1D void index_assert (int i) const { if (i < begin || i >= end) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", - i, begin, end-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << ") is out of bound (" - << begin << ":" << end-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d) is out of bound (%d:%d)\n", + i, begin, end-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << ") is out of bound (" + << begin << ":" << end-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -120,17 +121,18 @@ struct Table2D { if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", - i, j, begin[0], end[0]-1, begin[1], end[1]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d) is out of bound (%d:%d,%d:%d)\n", + i, j, begin[0], end[0]-1, begin[1], end[1]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -188,19 +190,20 @@ struct Table3D if (i < begin[0] || i >= end[0] || j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d)\n", - i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] << ":" << end[2]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d)\n", + i, j, k, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif @@ -262,20 +265,21 @@ struct Table4D j < begin[1] || j >= end[1] || k < begin[2] || k >= end[2] || n < begin[3] || n >= end[3]) { -#if AMREX_DEVICE_COMPILE - AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", - i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, - begin[2], end[2]-1, begin[3], end[3]-1); - amrex::Abort(); -#else - std::stringstream ss; - ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" - << begin[0] << ":" << end[0]-1 - << "," << begin[1] << ":" << end[1]-1 - << "," << begin[2] << ":" << end[2]-1 - << "," << begin[3] << ":" << end[3]-1 << ")"; - amrex::Abort(ss.str()); -#endif + AMREX_IF_ON_DEVICE(( + AMREX_DEVICE_PRINTF(" (%d,%d,%d,%d) is out of bound (%d:%d,%d:%d,%d:%d,%d:%d)\n", + i, j, k, n, begin[0], end[0]-1, begin[1], end[1]-1, + begin[2], end[2]-1, begin[3], end[3]-1); + amrex::Abort(); + )) + AMREX_IF_ON_HOST(( + std::stringstream ss; + ss << " (" << i << "," << j << "," << k << "," << n << ") is out of bound (" + << begin[0] << ":" << end[0]-1 + << "," << begin[1] << ":" << end[1]-1 + << "," << begin[2] << ":" << end[2]-1 + << "," << begin[3] << ":" << end[3]-1 << ")"; + amrex::Abort(ss.str()); + )) } } #endif diff --git a/Src/Base/Parser/AMReX_IParser.H b/Src/Base/Parser/AMReX_IParser.H index 69f40252b0..025da853c2 100644 --- a/Src/Base/Parser/AMReX_IParser.H +++ b/Src/Base/Parser/AMReX_IParser.H @@ -20,11 +20,8 @@ struct IParserExecutor [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int operator() () const noexcept { -#if AMREX_DEVICE_COMPILE - return iparser_exe_eval(m_device_executor, nullptr); -#else - return iparser_exe_eval(m_host_executor, nullptr); -#endif + AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, nullptr);)) + AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, nullptr);)) } template @@ -33,30 +30,21 @@ struct IParserExecutor operator() (Ts... var) const noexcept { amrex::GpuArray l_var{var...}; -#if AMREX_DEVICE_COMPILE - return iparser_exe_eval(m_device_executor, l_var.data()); -#else - return iparser_exe_eval(m_host_executor, l_var.data()); -#endif + AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, l_var.data());)) + AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, l_var.data());)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE int operator() (GpuArray const& var) const noexcept { -#if AMREX_DEVICE_COMPILE - return iparser_exe_eval(m_device_executor, var.data()); -#else - return iparser_exe_eval(m_host_executor, var.data()); -#endif + AMREX_IF_ON_DEVICE((return iparser_exe_eval(m_device_executor, var.data());)) + AMREX_IF_ON_HOST((return iparser_exe_eval(m_host_executor, var.data());)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE explicit operator bool () const { -#if AMREX_DEVICE_COMPILE - return m_device_executor != nullptr; -#else - return m_host_executor != nullptr; -#endif + AMREX_IF_ON_DEVICE((return m_device_executor != nullptr;)) + AMREX_IF_ON_HOST((return m_host_executor != nullptr;)) } char* m_host_executor = nullptr; diff --git a/Src/Base/Parser/AMReX_Parser.H b/Src/Base/Parser/AMReX_Parser.H index b74de94195..456910f873 100644 --- a/Src/Base/Parser/AMReX_Parser.H +++ b/Src/Base/Parser/AMReX_Parser.H @@ -21,11 +21,8 @@ struct ParserExecutor [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE double operator() () const noexcept { -#if AMREX_DEVICE_COMPILE - return parser_exe_eval(m_device_executor, nullptr); -#else - return parser_exe_eval(m_host_executor, nullptr); -#endif + AMREX_IF_ON_DEVICE((return parser_exe_eval(m_device_executor, nullptr);)) + AMREX_IF_ON_HOST((return parser_exe_eval(m_host_executor, nullptr);)) } template @@ -34,11 +31,8 @@ struct ParserExecutor operator() (Ts... var) const noexcept { amrex::GpuArray l_var{var...}; -#if AMREX_DEVICE_COMPILE - return parser_exe_eval(m_device_executor, l_var.data()); -#else - return parser_exe_eval(m_host_executor, l_var.data()); -#endif + AMREX_IF_ON_DEVICE((return parser_exe_eval(m_device_executor, l_var.data());)) + AMREX_IF_ON_HOST((return parser_exe_eval(m_host_executor, l_var.data());)) } template @@ -47,30 +41,21 @@ struct ParserExecutor operator() (Ts... var) const noexcept { amrex::GpuArray l_var{var...}; -#if AMREX_DEVICE_COMPILE - return static_cast(parser_exe_eval(m_device_executor, l_var.data())); -#else - return static_cast(parser_exe_eval(m_host_executor, l_var.data())); -#endif + AMREX_IF_ON_DEVICE((return static_cast(parser_exe_eval(m_device_executor, l_var.data()));)) + AMREX_IF_ON_HOST((return static_cast(parser_exe_eval(m_host_executor, l_var.data()));)) } [[nodiscard]] AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE double operator() (GpuArray const& var) const noexcept { -#if AMREX_DEVICE_COMPILE - return parser_exe_eval(m_device_executor, var.data()); -#else - return parser_exe_eval(m_host_executor, var.data()); -#endif + AMREX_IF_ON_DEVICE((return parser_exe_eval(m_device_executor, var.data());)) + AMREX_IF_ON_HOST((return parser_exe_eval(m_host_executor, var.data());)) } AMREX_GPU_HOST_DEVICE AMREX_FORCE_INLINE explicit operator bool () const { -#if AMREX_DEVICE_COMPILE - return m_device_executor != nullptr; -#else - return m_host_executor != nullptr; -#endif + AMREX_IF_ON_DEVICE((return m_device_executor != nullptr;)) + AMREX_IF_ON_HOST((return m_host_executor != nullptr;)) } char* m_host_executor = nullptr; diff --git a/Src/Base/Parser/AMReX_Parser_Y.H b/Src/Base/Parser/AMReX_Parser_Y.H index 792f796554..e84cf9e0d5 100644 --- a/Src/Base/Parser/AMReX_Parser_Y.H +++ b/Src/Base/Parser/AMReX_Parser_Y.H @@ -350,7 +350,7 @@ template AMREX_GPU_HOST_DEVICE AMREX_NO_INLINE T parser_math_comp_ellint_1 (T a) { -#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) +#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) && !defined(__NVCOMPILER) return std::comp_ellint_1(a); #else amrex::ignore_unused(a); @@ -363,7 +363,7 @@ template AMREX_GPU_HOST_DEVICE AMREX_NO_INLINE T parser_math_comp_ellint_2 (T a) { -#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) +#if defined(__GNUC__) && !defined(__clang__) && !defined(__CUDA_ARCH__) && !defined(__NVCOMPILER) return std::comp_ellint_2(a); #else amrex::ignore_unused(a); diff --git a/Src/EB/AMReX_EB2_GeometryShop.H b/Src/EB/AMReX_EB2_GeometryShop.H index ee353c1395..33931b28c5 100644 --- a/Src/EB/AMReX_EB2_GeometryShop.H +++ b/Src/EB/AMReX_EB2_GeometryShop.H @@ -28,13 +28,12 @@ AMREX_GPU_HOST_DEVICE Real IF_f (F const& f, GpuArray const& p) noexcept { -#if AMREX_DEVICE_COMPILE - amrex::ignore_unused(f,p); - amrex::Error("EB2::GeometryShop: how did this happen?"); - return 0.0; -#else - return f({AMREX_D_DECL(p[0],p[1],p[2])}); -#endif + AMREX_IF_ON_DEVICE(( + amrex::ignore_unused(f,p); + amrex::Error("EB2::GeometryShop: how did this happen?"); + return 0.0; + )) + AMREX_IF_ON_HOST((return f({AMREX_D_DECL(p[0],p[1],p[2])});)) } template