Skip to content

Commit

Permalink
Merge pull request #8 from sbaldu/feature_alpaka
Browse files Browse the repository at this point in the history
Feature alpaka
  • Loading branch information
waredjeb authored Jan 19, 2024
2 parents 203bcde + ac55fa4 commit 9c19be3
Show file tree
Hide file tree
Showing 47 changed files with 6,263 additions and 0 deletions.
7 changes: 7 additions & 0 deletions .gitmodules
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
[submodule "extern/pybind11"]
path = extern/pybind11
url = https://github.com/sbaldu/pybind11.git
branch = master
[submodule "extern/alpaka"]
path = extern/alpaka
url = https://github.com/cms-patatrack/alpaka.git
31 changes: 31 additions & 0 deletions CLUEstering/alpaka/AlpakaCore/AllocatorConfig.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
#ifndef AlpakaCore_AllocatorConfig_h
#define AlpakaCore_AllocatorConfig_h

// <cstddef> is required for std::size_t, which was previously used without being declared
#include <cstddef>
#include <limits>

namespace cms::alpakatools {

  // Compile-time tuning parameters for the caching allocator.
  // The bin layout follows cub::CachingDeviceAllocator: allocations are rounded up to
  // power-of-binGrowth sized bins between binGrowth^minBin and binGrowth^maxBin bytes.
  namespace config {

    // bin growth factor (bin_growth in cub::CachingDeviceAllocator)
    constexpr unsigned int binGrowth = 2;

    // smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CachingDeviceAllocator)
    constexpr unsigned int minBin = 8;  // 256 bytes

    // largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator).
    // Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
    constexpr unsigned int maxBin = 30;  // 1 GB

    // total storage for the allocator; 0 means no limit.
    constexpr std::size_t maxCachedBytes = 0;

    // fraction of total device memory taken for the allocator; 0 means no limit.
    constexpr double maxCachedFraction = 0.8;

    // if both maxCachedBytes and maxCachedFraction are non-zero, the smallest resulting value is used.

  }  // namespace config

}  // namespace cms::alpakatools

#endif  // AlpakaCore_AllocatorConfig_h
51 changes: 51 additions & 0 deletions CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#ifndef AlpakaCore_AllocatorPolicy_h
#define AlpakaCore_AllocatorPolicy_h

#include <alpaka/alpaka.hpp>

namespace cms::alpakatools {

// Which memory allocator to use
// - Synchronous: (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost
// - Asynchronous: (device only) cudaMallocAsync (requires CUDA >= 11.2)
// - Caching: (device and host) caching allocator
enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 };

// Policy selected for a given alpaka device type, resolved at compile time.
// The primary template is the conservative fallback for any device without a
// specialization below: plain synchronous allocation.
template <typename TDev>
constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous;

// CPU backends (serial or TBB): use the caching allocator unless it has been
// explicitly disabled at build time via ALPAKA_DISABLE_CACHING_ALLOCATOR.
#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
template <>
constexpr inline AllocatorPolicy allocator_policy<alpaka::DevCpu> =
#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
AllocatorPolicy::Caching;
#else
AllocatorPolicy::Synchronous;
#endif
#endif  // defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED

// CUDA devices: prefer the caching allocator; if it is disabled, fall back to
// cudaMallocAsync when available (CUDA >= 11.2 and not explicitly disabled),
// otherwise to synchronous cudaMalloc.
#if defined ALPAKA_ACC_GPU_CUDA_ENABLED
template <>
constexpr inline AllocatorPolicy allocator_policy<alpaka::DevCudaRt> =
#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
AllocatorPolicy::Caching;
#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR
AllocatorPolicy::Asynchronous;
#else
AllocatorPolicy::Synchronous;
#endif
#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED

// HIP/ROCm devices: caching allocator unless disabled; no asynchronous option here.
#if defined ALPAKA_ACC_GPU_HIP_ENABLED
template <>
constexpr inline AllocatorPolicy allocator_policy<alpaka::DevHipRt> =
#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
AllocatorPolicy::Caching;
#else
AllocatorPolicy::Synchronous;
#endif
#endif  // ALPAKA_ACC_GPU_HIP_ENABLED

}  // namespace cms::alpakatools

#endif  // AlpakaCore_AllocatorPolicy_h
145 changes: 145 additions & 0 deletions CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,145 @@
#ifndef AlpakaCore_CachedBufAlloc_h
#define AlpakaCore_CachedBufAlloc_h

#include <alpaka/alpaka.hpp>

#include "getDeviceCachingAllocator.h"
#include "getHostCachingAllocator.h"

namespace cms::alpakatools {

namespace traits {

//! The caching memory allocator trait.
//! The primary template is intentionally unusable: requesting a cached buffer for a
//! device/queue combination without a dedicated specialization below fails to compile
//! with an explicit diagnostic, instead of silently allocating uncached memory.
template <typename TElem, typename TDim, typename TIdx, typename TDev, typename TQueue, typename TSfinae = void>
struct CachedBufAlloc {
static_assert(alpaka::meta::DependentFalseType<TDev>::value, "This device does not support a caching allocator");
};

//! The caching memory allocator implementation for the CPU device
//! (host-only queues, i.e. no GPU backend involved).
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, TQueue, void> {
template <typename TExtent>
ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, TQueue queue, TExtent const& extent)
-> alpaka::BufCpu<TElem, TDim, TIdx> {
// non-cached host-only memory: delegate to alpaka's queue-ordered allocation,
// since there is no pinned-memory cache to draw from for pure-host queues
return alpaka::allocAsyncBuf<TElem, TIdx>(queue, extent);
}
};

#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED

//! Caching allocator specialization for pinned host memory used with CUDA non-blocking queues.
//! The buffer's storage comes from (and is returned to) the host caching allocator rather
//! than being allocated and freed through the runtime on every request.
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> {
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
                                            alpaka::QueueCudaRtNonBlocking queue,
                                            TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    auto& hostAllocator = getHostCachingAllocator<alpaka::QueueCudaRtNonBlocking>();

    // FIXME the BufCpu does not support a pitch ?
    const size_t elementCount = alpaka::getExtentProduct(extent);
    const size_t byteCount = elementCount * sizeof(TElem);
    auto* raw = reinterpret_cast<TElem*>(hostAllocator.allocate(byteCount, queue));

    // custom deleter: hand the memory back to the caching allocator instead of freeing it
    auto recycle = [cache = &hostAllocator](TElem* ptr) { cache->free(ptr); };

    return alpaka::BufCpu<TElem, TDim, TIdx>(dev, raw, std::move(recycle), extent);
  }
};

//! Caching allocator specialization for CUDA device memory.
//! Storage is obtained from the per-device caching allocator and recycled into it on
//! buffer destruction, avoiding a cudaMalloc/cudaFree round trip per buffer.
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCudaRt, TQueue, void> {
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent)
      -> alpaka::BufCudaRt<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    auto& deviceAllocator = getDeviceCachingAllocator<alpaka::DevCudaRt, TQueue>(dev);

    // TODO implement pitch for TDim > 1; until then the pitch is simply the row size in bytes
    const size_t rowElements = alpaka::getWidth(extent);
    const size_t pitchBytes = rowElements * static_cast<TIdx>(sizeof(TElem));
    const size_t elementCount = alpaka::getExtentProduct(extent);
    auto* raw = reinterpret_cast<TElem*>(deviceAllocator.allocate(elementCount * sizeof(TElem), queue));

    // custom deleter: hand the memory back to the caching allocator instead of freeing it
    auto recycle = [cache = &deviceAllocator](TElem* ptr) { cache->free(ptr); };

    return alpaka::BufCudaRt<TElem, TDim, TIdx>(dev, raw, std::move(recycle), pitchBytes, extent);
  }
};

#endif // ALPAKA_ACC_GPU_CUDA_ENABLED

#ifdef ALPAKA_ACC_GPU_HIP_ENABLED

//! Caching allocator specialization for pinned host memory used with HIP non-blocking queues.
//! Mirrors the CUDA pinned-host specialization: storage is drawn from and recycled into
//! the host caching allocator.
template <typename TElem, typename TDim, typename TIdx>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> {
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
                                            alpaka::QueueHipRtNonBlocking queue,
                                            TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    auto& hostAllocator = getHostCachingAllocator<alpaka::QueueHipRtNonBlocking>();

    // FIXME the BufCpu does not support a pitch ?
    const size_t elementCount = alpaka::getExtentProduct(extent);
    const size_t byteCount = elementCount * sizeof(TElem);
    auto* raw = reinterpret_cast<TElem*>(hostAllocator.allocate(byteCount, queue));

    // custom deleter: hand the memory back to the caching allocator instead of freeing it
    auto recycle = [cache = &hostAllocator](TElem* ptr) { cache->free(ptr); };

    return alpaka::BufCpu<TElem, TDim, TIdx>(dev, raw, std::move(recycle), extent);
  }
};

//! Caching allocator specialization for ROCm/HIP device memory.
//! Mirrors the CUDA device specialization: storage comes from the per-device caching
//! allocator and is recycled into it when the buffer is destroyed.
template <typename TElem, typename TDim, typename TIdx, typename TQueue>
struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> {
  template <typename TExtent>
  ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent)
      -> alpaka::BufHipRt<TElem, TDim, TIdx> {
    ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;

    auto& deviceAllocator = getDeviceCachingAllocator<alpaka::DevHipRt, TQueue>(dev);

    // TODO implement pitch for TDim > 1; until then the pitch is simply the row size in bytes
    const size_t rowElements = alpaka::getWidth(extent);
    const size_t pitchBytes = rowElements * static_cast<TIdx>(sizeof(TElem));
    const size_t elementCount = alpaka::getExtentProduct(extent);
    auto* raw = reinterpret_cast<TElem*>(deviceAllocator.allocate(elementCount * sizeof(TElem), queue));

    // custom deleter: hand the memory back to the caching allocator instead of freeing it
    auto recycle = [cache = &deviceAllocator](TElem* ptr) { cache->free(ptr); };

    return alpaka::BufHipRt<TElem, TDim, TIdx>(dev, raw, std::move(recycle), pitchBytes, extent);
  }
};

#endif // ALPAKA_ACC_GPU_HIP_ENABLED

} // namespace traits

// Allocate a buffer of TElem elements through the caching allocator for the given
// device/queue pair, dispatching to the matching traits::CachedBufAlloc specialization.
// The returned alpaka buffer owns its storage; on destruction the memory is returned to
// the cache (or, for plain host queues, released by alpaka) rather than leaked.
// Fails to compile (via the primary trait's static_assert) for unsupported device types.
template <typename TElem, typename TIdx, typename TExtent, typename TQueue, typename TDev>
ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) {
return traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent);
}

} // namespace cms::alpakatools

#endif // AlpakaCore_CachedBufAlloc_h
Loading

0 comments on commit 9c19be3

Please sign in to comment.