Skip to content

Commit

Permalink
GPU Device Variable on Intel GPUs
Browse files Browse the repository at this point in the history
This adds GPU device variable support on Intel GPUs using Intel oneAPI
compiler's experimental feature.

To make the user interface consistent, we have add a macro
AMREX_DEVICE_GLOBAL_VARIABLE. For example, the user can define a device
variable as follows for all GPUs and CPUs.

    AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, my_dg1);  // amrex::Real my_dg1;
    AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, 4, my_dg2);  // amrex::Real my_dg2[4];

Below are their declarations.

    extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, my_dg1);
    extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Real, 4, my_dg2);

GPU and CPU kernels can use the global variables if they see the
declarations.

We have also added two functions from copying data from and to device global
variables.

    //! Copy `nbytes` bytes from host to device global variable. `offset` is the
    //! offset in bytes from the start of the device global variable.
    template <typename T>
    void memcpy_from_host_to_device_global_async (T& dg, const void* src,
                                                  std::size_t nbytes,
                                                  std::size_t offset = 0)

    //! Copy `nbytes` bytes from device global variable to host. `offset` is the
    //! offset in bytes from the start of the device global variable.
    template <typename T>
    void memcpy_from_device_global_to_host_async (void* dst, T const& dg,
                                                  std::size_t nbytes,
                                                  std::size_t offset = 0)
  • Loading branch information
WeiqunZhang committed Aug 1, 2024
1 parent 83ecf62 commit da5f699
Show file tree
Hide file tree
Showing 11 changed files with 203 additions and 4 deletions.
47 changes: 47 additions & 0 deletions Src/Base/AMReX_GpuDevice.H
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
#include <algorithm>
#include <array>
#include <cstdlib>
#include <cstring>
#include <memory>

#define AMREX_GPU_MAX_STREAMS 8
Expand Down Expand Up @@ -318,6 +319,52 @@ dtod_memcpy (void* p_d_dst, const void* p_d_src, const std::size_t sz) noexcept
void hypreSynchronize ();
#endif

//! Copy `nbytes` bytes from host to device global variable. `offset` is the
//! offset in bytes from the start of the device global variable.
template <typename T>
void memcpy_from_host_to_device_global_async (T& dg, const void* src,
std::size_t nbytes,
std::size_t offset = 0)
{
#if defined(AMREX_USE_CUDA)
AMREX_CUDA_SAFE_CALL(cudaMemcpyToSymbolAsync(dg, src, nbytes, offset,
cudaMemcpyHostToDevice,
Device::gpuStream()));
#elif defined(AMREX_USE_HIP)
AMREX_HIP_SAFE_CALL(hipMemcpyToSymbolAsync(dg, src, nbytes, offset,
hipMemcpyHostToDevice,
Device::gpuStream()));
#elif defined(AMREX_USE_SYCL)
Device::streamQueue().memcpy(dg, src, nbytes, offset);
#else
auto* p = (char*)(&dg);
std::memcpy(p+offset, src, nbytes);
#endif
}

//! Copy `nbytes` bytes from device global variable to host. `offset` is the
//! offset in bytes from the start of the device global variable.
template <typename T>
void memcpy_from_device_global_to_host_async (void* dst, T const& dg,
std::size_t nbytes,
std::size_t offset = 0)
{
#if defined(AMREX_USE_CUDA)
AMREX_CUDA_SAFE_CALL(cudaMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
cudaMemcpyDeviceToHost,
Device::gpuStream()));
#elif defined(AMREX_USE_HIP)
AMREX_HIP_SAFE_CALL(hipMemcpyFromSymbolAsync(dst, dg, nbytes, offset,
hipMemcpyDeviceToHost,
Device::gpuStream()));
#elif defined(AMREX_USE_SYCL)
Device::streamQueue().memcpy(dst, dg, nbytes, offset);
#else
auto const* p = (char const*)(&dg);
std::memcpy(dst, p+offset, nbytes);
#endif
}

}

#endif
7 changes: 3 additions & 4 deletions Src/Base/AMReX_GpuLaunch.nolint.H
Original file line number Diff line number Diff line change
@@ -1,9 +1,8 @@
// Do not include this header anywhere other than AMReX_GpuLaunch.H.
// The purpose of this file is to avoid clang-tidy.

#define AMREX_WRONG_NUM_ARGS(...) static_assert(false,"Wrong number of arguments to macro")
#define AMREX_GET_MACRO(_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
#define AMREX_LAUNCH_DEVICE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\
#define AMREX_GET_LAUNCH_MACRO(_1,_2,_3,_4,_5,_6,_7,_8,_9,NAME,...) NAME
#define AMREX_LAUNCH_DEVICE_LAMBDA(...) AMREX_GET_LAUNCH_MACRO(__VA_ARGS__,\
AMREX_GPU_LAUNCH_DEVICE_LAMBDA_RANGE_3, \
AMREX_WRONG_NUM_ARGS, \
AMREX_WRONG_NUM_ARGS, \
Expand All @@ -14,7 +13,7 @@
AMREX_WRONG_NUM_ARGS, \
AMREX_WRONG_NUM_ARGS)(__VA_ARGS__)

#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA(...) AMREX_GET_MACRO(__VA_ARGS__,\
#define AMREX_LAUNCH_HOST_DEVICE_LAMBDA(...) AMREX_GET_LAUNCH_MACRO(__VA_ARGS__,\
AMREX_GPU_LAUNCH_HOST_DEVICE_LAMBDA_RANGE_3, \
AMREX_WRONG_NUM_ARGS, \
AMREX_WRONG_NUM_ARGS, \
Expand Down
18 changes: 18 additions & 0 deletions Src/Base/AMReX_GpuQualifiers.H
Original file line number Diff line number Diff line change
Expand Up @@ -64,4 +64,22 @@
# include <sycl/sycl.hpp>
#endif

#define AMREX_WRONG_NUM_ARGS(...) static_assert(false,"Wrong number of arguments to macro")

#define AMREX_GET_DGV_MACRO(_1,_2,_3,NAME,...) NAME
#define AMREX_DEVICE_GLOBAL_VARIABLE(...) AMREX_GET_DGV_MACRO(__VA_ARGS__,\
AMREX_DGVARR, AMREX_DGV,\
AMREX_WRONG_NUM_ARGS)(__VA_ARGS__)

#ifdef AMREX_USE_SYCL
# define AMREX_DGV(type,name) SYCL_EXTERNAL sycl::ext::oneapi::experimental::device_global<type> name
# define AMREX_DGVARR(type,num,name) SYCL_EXTERNAL sycl::ext::oneapi::experimental::device_global<type[num]> name
#elif defined(AMREX_USE_CUDA) || defined(AMREX_USE_HIP)
# define AMREX_DGV(type,name) __device__ type name
# define AMREX_DGVARR(type,num,name) __device__ type name[num]
#else
# define AMREX_DGV(type,name) type name
# define AMREX_DGVARR(type,num,name) type name[num]
#endif

#endif
9 changes: 9 additions & 0 deletions Tests/DeviceGlobal/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
foreach(D IN LISTS AMReX_SPACEDIM)
set(_sources main.cpp global_vars.cpp init.cpp work.cpp)
set(_input_files)

setup_test(${D} _sources _input_files)

unset(_sources)
unset(_input_files)
endforeach()
23 changes: 23 additions & 0 deletions Tests/DeviceGlobal/GNUmakefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
AMREX_HOME ?= ../../

DEBUG = FALSE

DIM = 3

COMP = gcc

USE_CUDA = TRUE
USE_HIP = FALSE
USE_SYCL = FALSE

USE_MPI = FALSE
USE_OMP = FALSE

BL_NO_FORT = TRUE

include $(AMREX_HOME)/Tools/GNUMake/Make.defs

include ./Make.package
include $(AMREX_HOME)/Src/Base/Make.package

include $(AMREX_HOME)/Tools/GNUMake/Make.rules
2 changes: 2 additions & 0 deletions Tests/DeviceGlobal/Make.package
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
CEXE_sources += main.cpp init.cpp work.cpp global_vars.cpp

7 changes: 7 additions & 0 deletions Tests/DeviceGlobal/global_vars.H
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
#pragma once

#include <AMReX_Gpu.H>
#include <AMReX_INT.H>

extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, dg_x);
extern AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, 4, dg_y);
7 changes: 7 additions & 0 deletions Tests/DeviceGlobal/global_vars.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

#include "global_vars.H"

// definitions of global variables

AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, dg_x);
AMREX_DEVICE_GLOBAL_VARIABLE(amrex::Long, 4, dg_y);
27 changes: 27 additions & 0 deletions Tests/DeviceGlobal/init.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@

#include "global_vars.H"

void init ()
{
amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int i)
{
dg_x = 1;
for (int n = 0; n < 4; ++n) {
dg_y[n] = 100 + n;
}
});

amrex::Gpu::streamSynchronize();
}

void init2 ()
{
amrex::Gpu::PinnedVector<amrex::Long> pv{2,200,201,202,203};
amrex::Gpu::memcpy_from_host_to_device_global_async
(dg_x, pv.data(), sizeof(amrex::Long));
amrex::Gpu::memcpy_from_host_to_device_global_async
(dg_y, pv.data()+1, sizeof(amrex::Long));
amrex::Gpu::memcpy_from_host_to_device_global_async
(dg_y, pv.data()+2, sizeof(amrex::Long)*3, sizeof(amrex::Long));
amrex::Gpu::streamSynchronize();
}
20 changes: 20 additions & 0 deletions Tests/DeviceGlobal/main.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
#include <AMReX.H>
#include <AMReX_Print.H>

void init();
void work();
void init2();
void work2();

int main (int argc, char* argv[])
{
amrex::Initialize(argc,argv);
{
init();
work();

init2();
work2();
}
amrex::Finalize();
}
40 changes: 40 additions & 0 deletions Tests/DeviceGlobal/work.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@

#include "global_vars.H"

void work ()
{
amrex::Gpu::PinnedVector<amrex::Long> pv;
pv.resize(5,0);
auto* p = pv.data();
amrex::ParallelFor(1, [=] AMREX_GPU_DEVICE (int)
{
p[0] = dg_x;
for (int n = 0; n < 4; ++n) {
p[1+n] = dg_y[n];
}
});
amrex::Gpu::streamSynchronize();
AMREX_ALWAYS_ASSERT(pv[0] == 1 &&
pv[1] == 100 &&
pv[2] == 101 &&
pv[3] == 102 &&
pv[4] == 103);
}

void work2 ()
{
amrex::Gpu::PinnedVector<amrex::Long> pv;
pv.resize(5,0);
amrex::Gpu::memcpy_from_device_global_to_host_async
(pv.data(), dg_x, sizeof(amrex::Long));
amrex::Gpu::memcpy_from_device_global_to_host_async
(pv.data()+1, dg_y, sizeof(amrex::Long));
amrex::Gpu::memcpy_from_device_global_to_host_async
(pv.data()+2, dg_y, sizeof(amrex::Long)*3, sizeof(amrex::Long));
amrex::Gpu::streamSynchronize();
AMREX_ALWAYS_ASSERT(pv[0] == 2 &&
pv[1] == 200 &&
pv[2] == 201 &&
pv[3] == 202 &&
pv[4] == 203);
}

0 comments on commit da5f699

Please sign in to comment.