-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #8 from sbaldu/feature_alpaka
Feature alpaka
- Loading branch information
Showing
47 changed files
with
6,263 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
[submodule "extern/pybind11"] | ||
path = extern/pybind11 | ||
url = https://github.com/sbaldu/pybind11.git | ||
branch = master | ||
[submodule "extern/alpaka"] | ||
path = extern/alpaka | ||
url = https://github.com/cms-patatrack/alpaka.git |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,31 @@ | ||
#ifndef AlpakaCore_AllocatorConfig_h | ||
#define AlpakaCore_AllocatorConfig_h | ||
|
||
#include <cstddef>
#include <limits>
|
||
namespace cms::alpakatools {

  // Compile-time configuration of the caching allocator.
  namespace config {

    // Bin growth factor (bin_growth in cub::CachingDeviceAllocator).
    inline constexpr unsigned int binGrowth = 2;

    // Smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator).
    inline constexpr unsigned int minBin = 8;  // 2^8 = 256 bytes

    // Largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator).
    // Note that, unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
    inline constexpr unsigned int maxBin = 30;  // 2^30 bytes = 1 GiB

    // Total storage for the allocator, in bytes; 0 means no limit.
    inline constexpr std::size_t maxCachedBytes = 0;

    // Fraction of the total device memory taken for the allocator; 0 means no limit.
    inline constexpr double maxCachedFraction = 0.8;

    // If both maxCachedBytes and maxCachedFraction are non-zero, the smallest resulting value is used.

  }  // namespace config

}  // namespace cms::alpakatools
|
||
#endif // AlpakaCore_AllocatorConfig_h |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
#ifndef AlpakaCore_AllocatorPolicy_h | ||
#define AlpakaCore_AllocatorPolicy_h | ||
|
||
#include <alpaka/alpaka.hpp> | ||
|
||
namespace cms::alpakatools {

  // Memory allocation strategies:
  //   - Synchronous:  (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost
  //   - Asynchronous: (device only) cudaMallocAsync (requires CUDA >= 11.2)
  //   - Caching:      (device and host) caching allocator
  enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 };

  // Allocation strategy selected for a device type.
  // Device types without a specialization below use the synchronous allocator.
  template <typename TDev>
  inline constexpr AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous;

#if defined(ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED) || defined(ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED)
  // CPU device: caching, unless ALPAKA_DISABLE_CACHING_ALLOCATOR is defined.
  template <>
  inline constexpr AllocatorPolicy allocator_policy<alpaka::DevCpu> =
#ifndef ALPAKA_DISABLE_CACHING_ALLOCATOR
      AllocatorPolicy::Caching;
#else
      AllocatorPolicy::Synchronous;
#endif
#endif  // ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED

#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED)
  // CUDA device: caching by default; when caching is disabled, asynchronous if CUDA is
  // recent enough and ALPAKA_DISABLE_ASYNC_ALLOCATOR is not defined; synchronous otherwise.
  template <>
  inline constexpr AllocatorPolicy allocator_policy<alpaka::DevCudaRt> =
#ifndef ALPAKA_DISABLE_CACHING_ALLOCATOR
      AllocatorPolicy::Caching;
#elif !defined(ALPAKA_DISABLE_ASYNC_ALLOCATOR) && CUDA_VERSION >= 11020
      AllocatorPolicy::Asynchronous;
#else
      AllocatorPolicy::Synchronous;
#endif
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

#if defined(ALPAKA_ACC_GPU_HIP_ENABLED)
  // ROCm/HIP device: caching, unless ALPAKA_DISABLE_CACHING_ALLOCATOR is defined.
  template <>
  inline constexpr AllocatorPolicy allocator_policy<alpaka::DevHipRt> =
#ifndef ALPAKA_DISABLE_CACHING_ALLOCATOR
      AllocatorPolicy::Caching;
#else
      AllocatorPolicy::Synchronous;
#endif
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED

}  // namespace cms::alpakatools
|
||
#endif // AlpakaCore_AllocatorPolicy_h |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,145 @@ | ||
#ifndef AlpakaCore_CachedBufAlloc_h | ||
#define AlpakaCore_CachedBufAlloc_h | ||
|
||
#include <alpaka/alpaka.hpp> | ||
|
||
#include "getDeviceCachingAllocator.h" | ||
#include "getHostCachingAllocator.h" | ||
|
||
namespace cms::alpakatools { | ||
|
||
namespace traits { | ||
|
||
//! The caching memory allocator trait. | ||
template <typename TElem, typename TDim, typename TIdx, typename TDev, typename TQueue, typename TSfinae = void> | ||
struct CachedBufAlloc { | ||
static_assert(alpaka::meta::DependentFalseType<TDev>::value, "This device does not support a caching allocator"); | ||
}; | ||
|
||
//! The caching memory allocator implementation for the CPU device | ||
template <typename TElem, typename TDim, typename TIdx, typename TQueue> | ||
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, TQueue, void> { | ||
template <typename TExtent> | ||
ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, TQueue queue, TExtent const& extent) | ||
-> alpaka::BufCpu<TElem, TDim, TIdx> { | ||
// non-cached host-only memory | ||
return alpaka::allocAsyncBuf<TElem, TIdx>(queue, extent); | ||
} | ||
}; | ||
|
||
#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED | ||
|
||
//! The caching memory allocator implementation for the pinned host memory | ||
template <typename TElem, typename TDim, typename TIdx> | ||
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> { | ||
template <typename TExtent> | ||
ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, | ||
alpaka::QueueCudaRtNonBlocking queue, | ||
TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> { | ||
ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; | ||
|
||
auto& allocator = getHostCachingAllocator<alpaka::QueueCudaRtNonBlocking>(); | ||
|
||
// FIXME the BufCpu does not support a pitch ? | ||
size_t size = alpaka::getExtentProduct(extent); | ||
size_t sizeBytes = size * sizeof(TElem); | ||
void* memPtr = allocator.allocate(sizeBytes, queue); | ||
|
||
// use a custom deleter to return the buffer to the CachingAllocator | ||
auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; | ||
|
||
return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent); | ||
} | ||
}; | ||
|
||
//! The caching memory allocator implementation for the CUDA device | ||
template <typename TElem, typename TDim, typename TIdx, typename TQueue> | ||
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCudaRt, TQueue, void> { | ||
template <typename TExtent> | ||
ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent) | ||
-> alpaka::BufCudaRt<TElem, TDim, TIdx> { | ||
ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; | ||
|
||
auto& allocator = getDeviceCachingAllocator<alpaka::DevCudaRt, TQueue>(dev); | ||
|
||
size_t width = alpaka::getWidth(extent); | ||
size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem)); | ||
// TODO implement pitch for TDim > 1 | ||
size_t pitchBytes = widthBytes; | ||
size_t size = alpaka::getExtentProduct(extent); | ||
size_t sizeBytes = size * sizeof(TElem); | ||
void* memPtr = allocator.allocate(sizeBytes, queue); | ||
|
||
// use a custom deleter to return the buffer to the CachingAllocator | ||
auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; | ||
|
||
return alpaka::BufCudaRt<TElem, TDim, TIdx>( | ||
dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), pitchBytes, extent); | ||
} | ||
}; | ||
|
||
#endif // ALPAKA_ACC_GPU_CUDA_ENABLED | ||
|
||
#ifdef ALPAKA_ACC_GPU_HIP_ENABLED | ||
|
||
//! The caching memory allocator implementation for the pinned host memory | ||
template <typename TElem, typename TDim, typename TIdx> | ||
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> { | ||
template <typename TExtent> | ||
ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, | ||
alpaka::QueueHipRtNonBlocking queue, | ||
TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> { | ||
ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; | ||
|
||
auto& allocator = getHostCachingAllocator<alpaka::QueueHipRtNonBlocking>(); | ||
|
||
// FIXME the BufCpu does not support a pitch ? | ||
size_t size = alpaka::getExtentProduct(extent); | ||
size_t sizeBytes = size * sizeof(TElem); | ||
void* memPtr = allocator.allocate(sizeBytes, queue); | ||
|
||
// use a custom deleter to return the buffer to the CachingAllocator | ||
auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; | ||
|
||
return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent); | ||
} | ||
}; | ||
|
||
//! The caching memory allocator implementation for the ROCm/HIP device | ||
template <typename TElem, typename TDim, typename TIdx, typename TQueue> | ||
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> { | ||
template <typename TExtent> | ||
ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent) | ||
-> alpaka::BufHipRt<TElem, TDim, TIdx> { | ||
ALPAKA_DEBUG_MINIMAL_LOG_SCOPE; | ||
|
||
auto& allocator = getDeviceCachingAllocator<alpaka::DevHipRt, TQueue>(dev); | ||
|
||
size_t width = alpaka::getWidth(extent); | ||
size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem)); | ||
// TODO implement pitch for TDim > 1 | ||
size_t pitchBytes = widthBytes; | ||
size_t size = alpaka::getExtentProduct(extent); | ||
size_t sizeBytes = size * sizeof(TElem); | ||
void* memPtr = allocator.allocate(sizeBytes, queue); | ||
|
||
// use a custom deleter to return the buffer to the CachingAllocator | ||
auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); }; | ||
|
||
return alpaka::BufHipRt<TElem, TDim, TIdx>( | ||
dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), pitchBytes, extent); | ||
} | ||
}; | ||
|
||
#endif // ALPAKA_ACC_GPU_HIP_ENABLED | ||
|
||
} // namespace traits | ||
|
||
template <typename TElem, typename TIdx, typename TExtent, typename TQueue, typename TDev> | ||
ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) { | ||
return traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent); | ||
} | ||
|
||
} // namespace cms::alpakatools | ||
|
||
#endif // AlpakaCore_CachedBufAlloc_h |
Oops, something went wrong.