Merge pull request #8 from sbaldu/feature_alpaka

Feature alpaka
cms-patatrack · Jan 19, 2024 · 9c19be3 · 9c19be3
2 parents 203bcde + ac55fa4
commit 9c19be3
Show file tree

Hide file tree

Showing 47 changed files with 6,263 additions and 0 deletions.
diff --git a/.gitmodules b/.gitmodules
@@ -0,0 +1,7 @@
+[submodule "extern/pybind11"]
+	path = extern/pybind11
+	url = https://github.com/sbaldu/pybind11.git
+	branch = master
+[submodule "extern/alpaka"]
+	path = extern/alpaka
+	url = https://github.com/cms-patatrack/alpaka.git
diff --git a/CLUEstering/alpaka/AlpakaCore/AllocatorConfig.h b/CLUEstering/alpaka/AlpakaCore/AllocatorConfig.h
@@ -0,0 +1,31 @@
+#ifndef AlpakaCore_AllocatorConfig_h
+#define AlpakaCore_AllocatorConfig_h
+
+#include <limits>
+
+namespace cms::alpakatools {
+
+  namespace config {
+
+    // bin growth factor (bin_growth in cub::CachingDeviceAllocator)
+    constexpr unsigned int binGrowth = 2;
+
+    // smallest bin, corresponds to binGrowth^minBin bytes (min_bin in cub::CacingDeviceAllocator
+    constexpr unsigned int minBin = 8;  // 256 bytes
+
+    // largest bin, corresponds to binGrowth^maxBin bytes (max_bin in cub::CachingDeviceAllocator). Note that unlike in cub, allocations larger than binGrowth^maxBin are set to fail.
+    constexpr unsigned int maxBin = 30;  // 1 GB
+
+    // total storage for the allocator; 0 means no limit.
+    constexpr size_t maxCachedBytes = 0;
+
+    // fraction of total device memory taken for the allocator; 0 means no limit.
+    constexpr double maxCachedFraction = 0.8;
+
+    // if both maxCachedBytes and maxCachedFraction are non-zero, the smallest resulting value is used.
+
+  }  // namespace config
+
+}  // namespace cms::alpakatools
+
+#endif  // AlpakaCore_AllocatorConfig_h
diff --git a/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h b/CLUEstering/alpaka/AlpakaCore/AllocatorPolicy.h
@@ -0,0 +1,51 @@
+#ifndef AlpakaCore_AllocatorPolicy_h
+#define AlpakaCore_AllocatorPolicy_h
+
+#include <alpaka/alpaka.hpp>
+
+namespace cms::alpakatools {
+
+  // Which memory allocator to use
+  //   - Synchronous:   (device and host) cudaMalloc/hipMalloc and cudaMallocHost/hipMallocHost
+  //   - Asynchronous:  (device only)     cudaMallocAsync (requires CUDA >= 11.2)
+  //   - Caching:       (device and host) caching allocator
+  enum class AllocatorPolicy { Synchronous = 0, Asynchronous = 1, Caching = 2 };
+
+  template <typename TDev>
+  constexpr inline AllocatorPolicy allocator_policy = AllocatorPolicy::Synchronous;
+
+#if defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+  template <>
+  constexpr inline AllocatorPolicy allocator_policy<alpaka::DevCpu> =
+#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
+      AllocatorPolicy::Caching;
+#else
+      AllocatorPolicy::Synchronous;
+#endif
+#endif  // defined ALPAKA_ACC_CPU_B_SEQ_T_SEQ_ENABLED || defined ALPAKA_ACC_CPU_B_TBB_T_SEQ_ENABLED
+
+#if defined ALPAKA_ACC_GPU_CUDA_ENABLED
+  template <>
+  constexpr inline AllocatorPolicy allocator_policy<alpaka::DevCudaRt> =
+#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
+      AllocatorPolicy::Caching;
+#elif CUDA_VERSION >= 11020 && !defined ALPAKA_DISABLE_ASYNC_ALLOCATOR
+      AllocatorPolicy::Asynchronous;
+#else
+          AllocatorPolicy::Synchronous;
+#endif
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#if defined ALPAKA_ACC_GPU_HIP_ENABLED
+  template <>
+  constexpr inline AllocatorPolicy allocator_policy<alpaka::DevHipRt> =
+#if !defined ALPAKA_DISABLE_CACHING_ALLOCATOR
+      AllocatorPolicy::Caching;
+#else
+      AllocatorPolicy::Synchronous;
+#endif
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+
+}  // namespace cms::alpakatools
+
+#endif  // AlpakaCore_AllocatorPolicy_h
diff --git a/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h b/CLUEstering/alpaka/AlpakaCore/CachedBufAlloc.h
@@ -0,0 +1,145 @@
+#ifndef AlpakaCore_CachedBufAlloc_h
+#define AlpakaCore_CachedBufAlloc_h
+
+#include <alpaka/alpaka.hpp>
+
+#include "getDeviceCachingAllocator.h"
+#include "getHostCachingAllocator.h"
+
+namespace cms::alpakatools {
+
+  namespace traits {
+
+    //! The caching memory allocator trait.
+    template <typename TElem, typename TDim, typename TIdx, typename TDev, typename TQueue, typename TSfinae = void>
+    struct CachedBufAlloc {
+      static_assert(alpaka::meta::DependentFalseType<TDev>::value, "This device does not support a caching allocator");
+    };
+
+    //! The caching memory allocator implementation for the CPU device
+    template <typename TElem, typename TDim, typename TIdx, typename TQueue>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, TQueue, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev, TQueue queue, TExtent const& extent)
+          -> alpaka::BufCpu<TElem, TDim, TIdx> {
+        // non-cached host-only memory
+        return alpaka::allocAsyncBuf<TElem, TIdx>(queue, extent);
+      }
+    };
+
+#ifdef ALPAKA_ACC_GPU_CUDA_ENABLED
+
+    //! The caching memory allocator implementation for the pinned host memory
+    template <typename TElem, typename TDim, typename TIdx>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueCudaRtNonBlocking, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
+                                                alpaka::QueueCudaRtNonBlocking queue,
+                                                TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getHostCachingAllocator<alpaka::QueueCudaRtNonBlocking>();
+
+        // FIXME the BufCpu does not support a pitch ?
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
+      }
+    };
+
+    //! The caching memory allocator implementation for the CUDA device
+    template <typename TElem, typename TDim, typename TIdx, typename TQueue>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCudaRt, TQueue, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCudaRt const& dev, TQueue queue, TExtent const& extent)
+          -> alpaka::BufCudaRt<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getDeviceCachingAllocator<alpaka::DevCudaRt, TQueue>(dev);
+
+        size_t width = alpaka::getWidth(extent);
+        size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem));
+        // TODO implement pitch for TDim > 1
+        size_t pitchBytes = widthBytes;
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufCudaRt<TElem, TDim, TIdx>(
+            dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), pitchBytes, extent);
+      }
+    };
+
+#endif  // ALPAKA_ACC_GPU_CUDA_ENABLED
+
+#ifdef ALPAKA_ACC_GPU_HIP_ENABLED
+
+    //! The caching memory allocator implementation for the pinned host memory
+    template <typename TElem, typename TDim, typename TIdx>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevCpu, alpaka::QueueHipRtNonBlocking, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevCpu const& dev,
+                                                alpaka::QueueHipRtNonBlocking queue,
+                                                TExtent const& extent) -> alpaka::BufCpu<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getHostCachingAllocator<alpaka::QueueHipRtNonBlocking>();
+
+        // FIXME the BufCpu does not support a pitch ?
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufCpu<TElem, TDim, TIdx>(dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), extent);
+      }
+    };
+
+    //! The caching memory allocator implementation for the ROCm/HIP device
+    template <typename TElem, typename TDim, typename TIdx, typename TQueue>
+    struct CachedBufAlloc<TElem, TDim, TIdx, alpaka::DevHipRt, TQueue, void> {
+      template <typename TExtent>
+      ALPAKA_FN_HOST static auto allocCachedBuf(alpaka::DevHipRt const& dev, TQueue queue, TExtent const& extent)
+          -> alpaka::BufHipRt<TElem, TDim, TIdx> {
+        ALPAKA_DEBUG_MINIMAL_LOG_SCOPE;
+
+        auto& allocator = getDeviceCachingAllocator<alpaka::DevHipRt, TQueue>(dev);
+
+        size_t width = alpaka::getWidth(extent);
+        size_t widthBytes = width * static_cast<TIdx>(sizeof(TElem));
+        // TODO implement pitch for TDim > 1
+        size_t pitchBytes = widthBytes;
+        size_t size = alpaka::getExtentProduct(extent);
+        size_t sizeBytes = size * sizeof(TElem);
+        void* memPtr = allocator.allocate(sizeBytes, queue);
+
+        // use a custom deleter to return the buffer to the CachingAllocator
+        auto deleter = [alloc = &allocator](TElem* ptr) { alloc->free(ptr); };
+
+        return alpaka::BufHipRt<TElem, TDim, TIdx>(
+            dev, reinterpret_cast<TElem*>(memPtr), std::move(deleter), pitchBytes, extent);
+      }
+    };
+
+#endif  // ALPAKA_ACC_GPU_HIP_ENABLED
+
+  }  // namespace traits
+
+  template <typename TElem, typename TIdx, typename TExtent, typename TQueue, typename TDev>
+  ALPAKA_FN_HOST auto allocCachedBuf(TDev const& dev, TQueue queue, TExtent const& extent = TExtent()) {
+    return traits::CachedBufAlloc<TElem, alpaka::Dim<TExtent>, TIdx, TDev, TQueue>::allocCachedBuf(dev, queue, extent);
+  }
+
+}  // namespace cms::alpakatools
+
+#endif  // AlpakaCore_CachedBufAlloc_h