From da5f6998587ec439dd03818b773a8530f965e00f Mon Sep 17 00:00:00 2001 From: Weiqun Zhang Date: Thu, 1 Aug 2024 02:05:52 +0000 Subject: [PATCH] GPU Device Variable on Intel GPUs This adds GPU device variable support on Intel GPUs using Intel oneAPI compiler's experimental feature. To make the user interface consistent, we have add a macro AMREX_DEVICE_GLOBAL_VARIABLE. For example, the user can define a device variable as follows for all GPUs and CPUs. AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, my_dg1); // amrex::Real my_dg1; AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, 4, my_dg2); // amrex::Real my_dg2[4]; Below are their declarations. extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, my_dg1); extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, 4, my_dg2); GPU and CPU kernels can use the global variables if they see the declarations. We have also added two functions from copying data from and to device global variables. //! Copy `nbytes` bytes from host to device global variable. `offset` is the //! offset in bytes from the start of the device global variable. template void memcpy_from_host_to_device_global_async (T& dg, const void* src, std::size_t nbytes, std::size_t offset = 0) //! Copy `nbytes` bytes from device global variable to host. `offset` is the //! offset in bytes from the start of the device global variable. template void memcpy_from_device_global_to_host_async (void* dst, T const& dg, std::size_t nbytes, std::size_t offset = 0) --- Src/Base/AMReX_GpuDevice.H | 47 ++++++++++++++++++++++++++++++ Src/Base/AMReX_GpuLaunch.nolint.H | 7 ++--- Src/Base/AMReX_GpuQualifiers.H | 18 ++++++++++++ Tests/DeviceGlobal/CMakeLists.txt | 9 ++++++ Tests/DeviceGlobal/GNUmakefile | 23 +++++++++++++++ Tests/DeviceGlobal/Make.package | 2 ++ Tests/DeviceGlobal/global_vars.H | 7 +++++ Tests/DeviceGlobal/global_vars.cpp | 7 +++++ Tests/DeviceGlobal/init.cpp | 27 +++++++++++++++++ Tests/DeviceGlobal/main.cpp | 20 +++++++++++++ Tests/DeviceGlobal/work.cpp | 40 +++++++++++++++++++++++++ 11 files changed, 203 insertions(+), 4 deletions(-) create mode 100644 Tests/DeviceGlobal/CMakeLists.txt create mode 100644 Tests/DeviceGlobal/GNUmakefile create mode 100644 Tests/DeviceGlobal/Make.package create mode 100644 Tests/DeviceGlobal/global_vars.H create mode 100644 Tests/DeviceGlobal/global_vars.cpp create mode 100644 Tests/DeviceGlobal/init.cpp create mode 100644 Tests/DeviceGlobal/main.cpp create mode 100644 Tests/DeviceGlobal/work.cpp diff --git a/Src/Base/AMReX_GpuDevice.H b/Src/Base/AMReX_GpuDevice.H index 7c17c918a70..a7aef5a9243 100644 --- a/Src/Base/AMReX_GpuDevice.H +++ b/Src/Base/AMReX_GpuDevice.H @@ -14,6 +14,7 @@ #include #include #include +#include #include #define AMREX_GPU_MAX_STREAMS 8 @@ -318,6 +319,52 @@ dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept void hypreSynchronize (); #endif +//! Copy `nbytes` bytes from host to device global variable. `offset` is the +//! offset in bytes from the start of the device global variable. +template +void memcpy_from_host_to_device_global_async (T& dg, const void* src, + std::size_t nbytes, + std::size_t offset = 0) +{ +#if defined(AMREX_USE_CUDA) + AMREX_CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(dg, src, nbytes, offset, + cudaMemcpyHostToDevice, + Device::gpuStream())); +#elif defined(AMREX_USE_HIP) + AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes, offset, + hipMemcpyHostToDevice, + Device::gpuStream())); +#elif defined(AMREX_USE_SYCL) + Device::streamQueue().memcpy(dg, src, nbytes, offset); +#else + auto* p = (char*)(&dg); + std::memcpy(p+offset, src, nbytes); +#endif +} + +//! Copy `nbytes` bytes from device global variable to host. `offset` is the +//! offset in bytes from the start of the device global variable. +template +void memcpy_from_device_global_to_host_async (void* dst, T const& dg, + std::size_t nbytes, + std::size_t offset = 0) +{ +#if defined(AMREX_USE_CUDA) + AMREX_CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(dst, dg, nbytes, offset, + cudaMemcpyDeviceToHost, + Device::gpuStream())); +#elif defined(AMREX_USE_HIP) + AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes, offset, + hipMemcpyDeviceToHost, + Device::gpuStream())); +#elif defined(AMREX_USE_SYCL) + Device::streamQueue().memcpy(dst, dg, nbytes, offset); +#else + auto const* p = (char const*)(&dg); + std::memcpy(dst, p+offset, nbytes); +#endif +} + } #endif diff --git a/Src/Base/AMReX_GpuLaunch.nolint.H b/Src/Base/AMReX_GpuLaunch.nolint.H index c7df1737517..bb1bbb2453b 100644 --- a/Src/Base/AMReX_GpuLaunch.nolint.H +++ b/Src/Base/AMReX_GpuLaunch.nolint.H @@ -1,9 +1,8 @@ // Do not include this header anywhere other than AMReX_GpuLaunch.H. // The purpose of this file is to avoid clang-tidy. -#define AMREX_WRONG_NUM_ARGS(...) static_assert(false,"Wrong number of arguments to macro") -#define AMREX_GET_MACRO(_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME -#define AMREX_LAUNCH_DEVICE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ +#define AMREX_GET_LAUNCH_MACRO(_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME +#define AMREX_LAUNCH_DEVICE_LAMBDA(...) AMREX_GET_LAUNCH_MACRO(__VA_ARGS__,\ AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS, \ @@ -14,7 +13,7 @@ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) -#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\ +#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA(...) AMREX_GET_LAUNCH_MACRO(__VA_ARGS__,\ AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3, \ AMREX_WRONG_NUM_ARGS, \ AMREX_WRONG_NUM_ARGS, \ diff --git a/Src/Base/AMReX_GpuQualifiers.H b/Src/Base/AMReX_GpuQualifiers.H index 4fba23a849a..3e10bec54df 100644 --- a/Src/Base/AMReX_GpuQualifiers.H +++ b/Src/Base/AMReX_GpuQualifiers.H @@ -64,4 +64,22 @@ # include #endif +#define AMREX_WRONG_NUM_ARGS(...) static_assert(false,"Wrong number of arguments to macro") + +#define AMREX_GET_DGV_MACRO(_1,_2,_3,NAME,...) NAME +#define AMREX_DEVICE_GLOBAL_VARIABLE(...) AMREX_GET_DGV_MACRO(__VA_ARGS__,\ + AMREX_DGVARR, AMREX_DGV,\ + AMREX_WRONG_NUM_ARGS)(__VA_ARGS__) + +#ifdef AMREX_USE_SYCL +# define AMREX_DGV(type,name) SYCL_EXTERNAL sycl::ext::oneapi::experimental::device_global name +# define AMREX_DGVARR(type,num,name) SYCL_EXTERNAL sycl::ext::oneapi::experimental::device_global name +#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP) +# define AMREX_DGV(type,name) __device__ type name +# define AMREX_DGVARR(type,num,name) __device__ type name[num] +#else +# define AMREX_DGV(type,name) type name +# define AMREX_DGVARR(type,num,name) type name[num] +#endif + #endif diff --git a/Tests/DeviceGlobal/CMakeLists.txt b/Tests/DeviceGlobal/CMakeLists.txt new file mode 100644 index 00000000000..5c378f7fdd5 --- /dev/null +++ b/Tests/DeviceGlobal/CMakeLists.txt @@ -0,0 +1,9 @@ +foreach(D IN LISTS AMReX_SPACEDIM) + set(_sources main.cpp global_vars.cpp init.cpp work.cpp) + set(_input_files) + + setup_test(${D} _sources _input_files) + + unset(_sources) + unset(_input_files) +endforeach() diff --git a/Tests/DeviceGlobal/GNUmakefile b/Tests/DeviceGlobal/GNUmakefile new file mode 100644 index 00000000000..fd5fbd8f2c0 --- /dev/null +++ b/Tests/DeviceGlobal/GNUmakefile @@ -0,0 +1,23 @@ +AMREX_HOME ?= ../../ + +DEBUG = FALSE + +DIM = 3 + +COMP = gcc + +USE_CUDA = TRUE +USE_HIP = FALSE +USE_SYCL = FALSE + +USE_MPI = FALSE +USE_OMP = FALSE + +BL_NO_FORT = TRUE + +include $(AMREX_HOME)/Tools/GNUMake/Make.defs + +include ./Make.package +include $(AMREX_HOME)/Src/Base/Make.package + +include $(AMREX_HOME)/Tools/GNUMake/Make.rules diff --git a/Tests/DeviceGlobal/Make.package b/Tests/DeviceGlobal/Make.package new file mode 100644 index 00000000000..8df45d1f81b --- /dev/null +++ b/Tests/DeviceGlobal/Make.package @@ -0,0 +1,2 @@ +CEXE_sources += main.cpp init.cpp work.cpp global_vars.cpp + diff --git a/Tests/DeviceGlobal/global_vars.H b/Tests/DeviceGlobal/global_vars.H new file mode 100644 index 00000000000..88ce1f0c4fc --- /dev/null +++ b/Tests/DeviceGlobal/global_vars.H @@ -0,0 +1,7 @@ +#pragma once + +#include +#include + +extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, dg_x); +extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, 4, dg_y); diff --git a/Tests/DeviceGlobal/global_vars.cpp b/Tests/DeviceGlobal/global_vars.cpp new file mode 100644 index 00000000000..485f41f164c --- /dev/null +++ b/Tests/DeviceGlobal/global_vars.cpp @@ -0,0 +1,7 @@ + +#include "global_vars.H" + +// definitions of global variables + +AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, dg_x); +AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, 4, dg_y); diff --git a/Tests/DeviceGlobal/init.cpp b/Tests/DeviceGlobal/init.cpp new file mode 100644 index 00000000000..03975da33a9 --- /dev/null +++ b/Tests/DeviceGlobal/init.cpp @@ -0,0 +1,27 @@ + +#include "global_vars.H" + +void init () +{ + amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int i) + { + dg_x = 1; + for (int n = 0; n < 4; ++n) { + dg_y[n] = 100 + n; + } + }); + + amrex::Gpu::streamSynchronize(); +} + +void init2 () +{ + amrex::Gpu::PinnedVector pv{2,200,201,202,203}; + amrex::Gpu::memcpy_from_host_to_device_global_async + (dg_x, pv.data(), sizeof(amrex::Long)); + amrex::Gpu::memcpy_from_host_to_device_global_async + (dg_y, pv.data()+1, sizeof(amrex::Long)); + amrex::Gpu::memcpy_from_host_to_device_global_async + (dg_y, pv.data()+2, sizeof(amrex::Long)*3, sizeof(amrex::Long)); + amrex::Gpu::streamSynchronize(); +} diff --git a/Tests/DeviceGlobal/main.cpp b/Tests/DeviceGlobal/main.cpp new file mode 100644 index 00000000000..b3b67784722 --- /dev/null +++ b/Tests/DeviceGlobal/main.cpp @@ -0,0 +1,20 @@ +#include +#include + +void init(); +void work(); +void init2(); +void work2(); + +int main (int argc, char* argv[]) +{ + amrex::Initialize(argc,argv); + { + init(); + work(); + + init2(); + work2(); + } + amrex::Finalize(); +} diff --git a/Tests/DeviceGlobal/work.cpp b/Tests/DeviceGlobal/work.cpp new file mode 100644 index 00000000000..8350dad0661 --- /dev/null +++ b/Tests/DeviceGlobal/work.cpp @@ -0,0 +1,40 @@ + +#include "global_vars.H" + +void work () +{ + amrex::Gpu::PinnedVector pv; + pv.resize(5,0); + auto* p = pv.data(); + amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int) + { + p[0] = dg_x; + for (int n = 0; n < 4; ++n) { + p[1+n] = dg_y[n]; + } + }); + amrex::Gpu::streamSynchronize(); + AMREX_ALWAYS_ASSERT(pv[0] == 1 && + pv[1] == 100 && + pv[2] == 101 && + pv[3] == 102 && + pv[4] == 103); +} + +void work2 () +{ + amrex::Gpu::PinnedVector pv; + pv.resize(5,0); + amrex::Gpu::memcpy_from_device_global_to_host_async + (pv.data(), dg_x, sizeof(amrex::Long)); + amrex::Gpu::memcpy_from_device_global_to_host_async + (pv.data()+1, dg_y, sizeof(amrex::Long)); + amrex::Gpu::memcpy_from_device_global_to_host_async + (pv.data()+2, dg_y, sizeof(amrex::Long)*3, sizeof(amrex::Long)); + amrex::Gpu::streamSynchronize(); + AMREX_ALWAYS_ASSERT(pv[0] == 2 && + pv[1] == 200 && + pv[2] == 201 && + pv[3] == 202 && + pv[4] == 203); +}