From 1d9da6d41b8c90c0e441ded72f9f394ea43f2578 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 18 Sep 2024 16:20:30 -0700 Subject: [PATCH] Use `cuda::std::byte` as the default device storage type (#597) Address https://github.com/NVIDIA/cuCollections/pull/573#discussion_r1737574327 This PR updates the existing code to use `cuda::std::byte` as the default device data type instead of `std::byte`. This change addresses potential issues where `cuda::std::` utilities cannot be applied to `std::byte` when relaxed constexprs are disabled. --- .../distinct_count_estimator/device_ref_example.cu | 8 ++++---- .../distinct_count_estimator.inl | 4 ++-- .../distinct_count_estimator_ref.inl | 4 ++-- include/cuco/detail/hyperloglog/hyperloglog.cuh | 9 +++++---- include/cuco/detail/hyperloglog/hyperloglog_ref.cuh | 11 ++++++----- include/cuco/detail/hyperloglog/kernels.cuh | 6 +++--- .../detail/trie/dynamic_bitset/dynamic_bitset.cuh | 3 ++- include/cuco/distinct_count_estimator.cuh | 6 +++--- include/cuco/distinct_count_estimator_ref.cuh | 12 ++++++------ tests/distinct_count_estimator/device_ref_test.cu | 4 ++-- tests/static_map/capacity_test.cu | 2 +- tests/static_map/duplicate_keys_test.cu | 2 +- tests/static_map/erase_test.cu | 2 +- tests/static_map/for_each_test.cu | 2 +- tests/static_map/hash_test.cu | 2 +- tests/static_map/insert_and_find_test.cu | 2 +- tests/static_map/insert_or_apply_test.cu | 6 +++--- tests/static_map/insert_or_assign_test.cu | 2 +- tests/static_map/unique_sequence_test.cu | 2 +- tests/static_multimap/count_test.cu | 2 +- tests/static_multimap/insert_contains_test.cu | 2 +- tests/static_multimap/insert_if_test.cu | 2 +- tests/static_set/capacity_test.cu | 2 +- 23 files changed, 50 insertions(+), 47 deletions(-) diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu index ab4d1929f..d9a7078a0 100644 --- a/examples/distinct_count_estimator/device_ref_example.cu +++ b/examples/distinct_count_estimator/device_ref_example.cu @@ -15,10 +15,10 @@ */ #include +#include #include #include -#include #include /** @@ -37,7 +37,7 @@ __global__ void fused_kernel(RefType ref, InputIt first, std::size_t n) using local_ref_type = typename RefType::with_scope; // Shared memory storage for the block-local estimator - extern __shared__ std::byte local_sketch[]; + extern __shared__ cuda::std::byte local_sketch[]; // The following check is optional since the base address of dynamic shared memory is guaranteed // to meet the alignment requirements @@ -94,7 +94,7 @@ __global__ void device_estimate_kernel(cuco::sketch_size_kb sketch_size_kb, size_t n, OutputIt out) { - extern __shared__ std::byte local_sketch[]; + extern __shared__ cuda::std::byte local_sketch[]; auto const block = cooperative_groups::this_thread_block(); @@ -161,4 +161,4 @@ int main(void) } return 0; -} \ No newline at end of file +} diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl index be5c3764e..b3ee95891 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl @@ -121,8 +121,8 @@ constexpr auto distinct_count_estimator::hash_functio } template -constexpr cuda::std::span distinct_count_estimator::sketch() - const noexcept +constexpr cuda::std::span +distinct_count_estimator::sketch() const noexcept { return this->impl_->sketch(); } diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl index 6ded30148..bf222986c 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl @@ -19,7 +19,7 @@ namespace cuco { template __host__ __device__ constexpr distinct_count_estimator_ref::distinct_count_estimator_ref( - cuda::std::span sketch_span, Hash const& hash) + cuda::std::span sketch_span, Hash const& hash) : impl_{sketch_span, hash} { } @@ -114,7 +114,7 @@ __host__ __device__ constexpr auto distinct_count_estimator_ref: } template -__host__ __device__ constexpr cuda::std::span +__host__ __device__ constexpr cuda::std::span distinct_count_estimator_ref::sketch() const noexcept { return this->impl_.sketch(); diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh index 98a5e4857..3ea977af0 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh @@ -22,9 +22,9 @@ #include #include +#include #include -#include #include #include @@ -74,7 +74,8 @@ class hyperloglog { : allocator_{alloc}, sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)), custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}}, - ref_{cuda::std::span{reinterpret_cast(this->sketch_.get()), sketch_size_b}, hash} + ref_{cuda::std::span{reinterpret_cast(this->sketch_.get()), sketch_size_b}, + hash} { this->ref_.clear_async(stream); } @@ -290,7 +291,7 @@ class hyperloglog { * * @return The cuda::std::span of the sketch */ - [[nodiscard]] constexpr cuda::std::span sketch() const noexcept + [[nodiscard]] constexpr cuda::std::span sketch() const noexcept { return this->ref_.sketch(); } @@ -352,4 +353,4 @@ class hyperloglog { template friend class hyperloglog; }; -} // namespace cuco::detail \ No newline at end of file +} // namespace cuco::detail diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh index 08db69018..5a656e325 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -36,7 +37,6 @@ #include #include // there is no -#include #include namespace cuco::detail { @@ -79,7 +79,7 @@ class hyperloglog_ref { * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ - __host__ __device__ constexpr hyperloglog_ref(cuda::std::span sketch_span, + __host__ __device__ constexpr hyperloglog_ref(cuda::std::span sketch_span, Hash const& hash) : hash_{hash}, precision_{cuda::std::countr_zero( @@ -448,10 +448,11 @@ class hyperloglog_ref { * * @return The cuda::std::span of the sketch */ - [[nodiscard]] __host__ __device__ constexpr cuda::std::span sketch() const noexcept + [[nodiscard]] __host__ __device__ constexpr cuda::std::span sketch() + const noexcept { - return cuda::std::span(reinterpret_cast(this->sketch_.data()), - this->sketch_bytes()); + return cuda::std::span( + reinterpret_cast(this->sketch_.data()), this->sketch_bytes()); } /** diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh index abe02a5e3..f5dc22535 100644 --- a/include/cuco/detail/hyperloglog/kernels.cuh +++ b/include/cuco/detail/hyperloglog/kernels.cuh @@ -46,7 +46,7 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first, // Base address of dynamic shared memory is guaranteed to be aligned to at least 16 bytes which is // sufficient for this purpose - extern __shared__ std::byte local_sketch[]; + extern __shared__ cuda::std::byte local_sketch[]; auto const loop_stride = cuco::detail::grid_stride(); auto idx = cuco::detail::global_thread_id(); @@ -94,7 +94,7 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re using local_ref_type = typename RefType::with_scope; // TODO assert alignment - extern __shared__ std::byte local_sketch[]; + extern __shared__ cuda::std::byte local_sketch[]; auto const loop_stride = cuco::detail::grid_stride(); auto idx = cuco::detail::global_thread_id(); @@ -142,4 +142,4 @@ CUCO_KERNEL void estimate(std::size_t* cardinality, RefType ref) if (block.thread_rank() == 0) { *cardinality = estimate; } } } -} // namespace cuco::hyperloglog_ns::detail \ No newline at end of file +} // namespace cuco::hyperloglog_ns::detail diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh index be324ab8d..abdad2719 100644 --- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh +++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh @@ -18,6 +18,7 @@ #pragma once #include +#include #include #include #include @@ -78,7 +79,7 @@ struct rank { * @tparam Allocator Type of allocator used for device storage */ // TODO: have to use device_malloc_allocator for now otherwise the container cannot grow -template > +template > class dynamic_bitset { public: using size_type = std::size_t; ///< size type to specify bit index diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh index feb8f00a0..5d3c7b6aa 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/distinct_count_estimator.cuh @@ -22,9 +22,9 @@ #include #include +#include #include -#include #include #include @@ -43,7 +43,7 @@ namespace cuco { template , - class Allocator = cuco::cuda_allocator> + class Allocator = cuco::cuda_allocator> class distinct_count_estimator { using impl_type = detail::hyperloglog; @@ -243,7 +243,7 @@ class distinct_count_estimator { * * @return The cuda::std::span of the sketch */ - [[nodiscard]] constexpr cuda::std::span sketch() const noexcept; + [[nodiscard]] constexpr cuda::std::span sketch() const noexcept; /** * @brief Gets the number of bytes required for the sketch storage. diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh index cb566990e..799bb46c7 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/distinct_count_estimator_ref.cuh @@ -20,12 +20,11 @@ #include #include +#include #include #include -#include - namespace cuco { /** * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset. @@ -64,8 +63,8 @@ class distinct_count_estimator_ref { * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ - __host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span sketch_span, - Hash const& hash = {}); + __host__ __device__ constexpr distinct_count_estimator_ref( + cuda::std::span sketch_span, Hash const& hash = {}); /** * @brief Resets the estimator, i.e., clears the current count estimate. @@ -211,7 +210,8 @@ class distinct_count_estimator_ref { * * @return The cuda::std::span of the sketch */ - [[nodiscard]] __host__ __device__ constexpr cuda::std::span sketch() const noexcept; + [[nodiscard]] __host__ __device__ constexpr cuda::std::span sketch() + const noexcept; /** * @brief Gets the number of bytes required for the sketch storage. @@ -255,4 +255,4 @@ class distinct_count_estimator_ref { }; } // namespace cuco -#include \ No newline at end of file +#include diff --git a/tests/distinct_count_estimator/device_ref_test.cu b/tests/distinct_count_estimator/device_ref_test.cu index 2866f163b..ad40e7ab6 100644 --- a/tests/distinct_count_estimator/device_ref_test.cu +++ b/tests/distinct_count_estimator/device_ref_test.cu @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -26,7 +27,6 @@ #include #include -#include #include template @@ -35,7 +35,7 @@ __global__ void estimate_kernel(cuco::sketch_size_kb sketch_size_kb, size_t n, OutputIt out) { - extern __shared__ std::byte local_sketch[]; + extern __shared__ cuda::std::byte local_sketch[]; auto const block = cooperative_groups::this_thread_block(); diff --git a/tests/static_map/capacity_test.cu b/tests/static_map/capacity_test.cu index 3e54acac0..fb1ae7491 100644 --- a/tests/static_map/capacity_test.cu +++ b/tests/static_map/capacity_test.cu @@ -24,7 +24,7 @@ TEST_CASE("Static map capacity", "") using T = int32_t; using ProbeT = cuco::double_hashing<1, cuco::default_hash_function>; using Equal = thrust::equal_to; - using AllocatorT = cuco::cuda_allocator; + using AllocatorT = cuco::cuda_allocator; using StorageT = cuco::storage<2>; SECTION("zero capacity is allowed.") diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu index 5356fcbe0..6af8d0acf 100644 --- a/tests/static_map/duplicate_keys_test.cu +++ b/tests/static_map/duplicate_keys_test.cu @@ -70,7 +70,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>{ num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu index 5d478fa32..4d68a680d 100644 --- a/tests/static_map/erase_test.cu +++ b/tests/static_map/erase_test.cu @@ -118,7 +118,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>{ num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}, cuco::erased_key{-2}}; diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu index 1c72a2e58..9fc518ae9 100644 --- a/tests/static_map/for_each_test.cu +++ b/tests/static_map/for_each_test.cu @@ -114,7 +114,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>; auto map = map_type{num_keys, cuco::empty_key{-1}, cuco::empty_value{0}}; diff --git a/tests/static_map/hash_test.cu b/tests/static_map/hash_test.cu index c22eae998..4f00b72b6 100644 --- a/tests/static_map/hash_test.cu +++ b/tests/static_map/hash_test.cu @@ -41,7 +41,7 @@ void test_hash_function() cuda::thread_scope_device, thrust::equal_to, cuco::linear_probing<1, Hash>, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>{ num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu index 7c000fe29..698b2ef40 100644 --- a/tests/static_map/insert_and_find_test.cu +++ b/tests/static_map/insert_and_find_test.cu @@ -72,7 +72,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>{ num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu index 1b818e38c..9663c95a3 100644 --- a/tests/static_map/insert_or_apply_test.cu +++ b/tests/static_map/insert_or_apply_test.cu @@ -175,7 +175,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>; SECTION("sentinel equals init; has_init = true") @@ -213,7 +213,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, cuco::linear_probing<2, cuco::murmurhash3_32>, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>; SECTION("sentinel equals init; has_init = true") @@ -249,7 +249,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, cuco::linear_probing<1, cuco::murmurhash3_32>, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>; SECTION("duplicate keys") diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu index fe1f1a51b..a6ad767d6 100644 --- a/tests/static_map/insert_or_assign_test.cu +++ b/tests/static_map/insert_or_assign_test.cu @@ -108,7 +108,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>{ num_keys, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu index 2f217eed4..22cfd2d4a 100644 --- a/tests/static_map/unique_sequence_test.cu +++ b/tests/static_map/unique_sequence_test.cu @@ -182,7 +182,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>{ extent_type{}, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu index 8eacfa729..b892d27b7 100644 --- a/tests/static_multimap/count_test.cu +++ b/tests/static_multimap/count_test.cu @@ -96,7 +96,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>{ num_keys * multiplicity, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_multimap/insert_contains_test.cu b/tests/static_multimap/insert_contains_test.cu index a3fa36648..efe3caad3 100644 --- a/tests/static_multimap/insert_contains_test.cu +++ b/tests/static_multimap/insert_contains_test.cu @@ -116,7 +116,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>{ extent_type{num_keys}, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu index 40697f40a..300597134 100644 --- a/tests/static_multimap/insert_if_test.cu +++ b/tests/static_multimap/insert_if_test.cu @@ -101,7 +101,7 @@ TEMPLATE_TEST_CASE_SIG( cuda::thread_scope_device, thrust::equal_to, probe, - cuco::cuda_allocator, + cuco::cuda_allocator, cuco::storage<2>>{ num_keys * 2, cuco::empty_key{-1}, cuco::empty_value{-1}}; diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu index 4432e56cc..acfa8d5d8 100644 --- a/tests/static_set/capacity_test.cu +++ b/tests/static_set/capacity_test.cu @@ -23,7 +23,7 @@ TEST_CASE("Static set capacity", "") using Key = int32_t; using ProbeT = cuco::double_hashing<1, cuco::default_hash_function>; using Equal = thrust::equal_to; - using AllocatorT = cuco::cuda_allocator; + using AllocatorT = cuco::cuda_allocator; using StorageT = cuco::storage<2>; SECTION("zero capacity is allowed.")