diff --git a/cub/cub/util_arch.cuh b/cub/cub/util_arch.cuh index 5f8780620fa..5891007cc4b 100644 --- a/cub/cub/util_arch.cuh +++ b/cub/cub/util_arch.cuh @@ -47,6 +47,10 @@ #include #include +#include +#include +#include + // Legacy include; this functionality used to be defined in here. #include @@ -143,27 +147,24 @@ namespace detail static constexpr ::cuda::std::size_t max_smem_per_block = 48 * 1024; } // namespace detail -template +template struct RegBoundScaling { - enum - { - ITEMS_PER_THREAD = CUB_MAX(1, NOMINAL_4B_ITEMS_PER_THREAD * 4 / CUB_MAX(4, sizeof(T))), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, - ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), - }; + static constexpr int ITEMS_PER_THREAD = + ::cuda::std::max(1, Nominal4ByteItemsPerThread * 4 / ::cuda::std::max(4, int{sizeof(T)})); + static constexpr int BLOCK_THREADS = + ::cuda::std::min(Nominal4ByteBlockThreads, + ::cuda::ceil_div(int{detail::max_smem_per_block} / (int{sizeof(T)} * ITEMS_PER_THREAD), 32) * 32); }; -template +template struct MemBoundScaling { - enum - { - ITEMS_PER_THREAD = - CUB_MAX(1, CUB_MIN(NOMINAL_4B_ITEMS_PER_THREAD * 4 / sizeof(T), NOMINAL_4B_ITEMS_PER_THREAD * 2)), - BLOCK_THREADS = CUB_MIN(NOMINAL_4B_BLOCK_THREADS, - ((cub::detail::max_smem_per_block / (sizeof(T) * ITEMS_PER_THREAD)) + 31) / 32 * 32), - }; + static constexpr int ITEMS_PER_THREAD = ::cuda::std::max( + 1, ::cuda::std::min(Nominal4ByteItemsPerThread * 4 / int{sizeof(T)}, Nominal4ByteItemsPerThread * 2)); + static constexpr int BLOCK_THREADS = + ::cuda::std::min(Nominal4ByteBlockThreads, + ::cuda::ceil_div(int{detail::max_smem_per_block} / (int{sizeof(T)} * ITEMS_PER_THREAD), 32) * 32); }; #endif // Do not document