Merge remote-tracking branch 'upstream/dev' into bloom-filter-new

NVIDIA · Sep 26, 2024 · 591df5a · 591df5a
2 parents 8fc4b00 + 1d9da6d
commit 591df5a
Show file tree

Hide file tree

Showing 23 changed files with 50 additions and 47 deletions.
diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
@@ -15,10 +15,10 @@
  */
 #include <cuco/distinct_count_estimator.cuh>
 
+#include <cuda/std/cstddef>
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 
-#include <cstddef>
 #include <iostream>
 
 /**
@@ -37,7 +37,7 @@ __global__ void fused_kernel(RefType ref, InputIt first, std::size_t n)
   using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
 
   // Shared memory storage for the block-local estimator
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   // The following check is optional since the base address of dynamic shared memory is guaranteed
   // to meet the alignment requirements
@@ -94,7 +94,7 @@ __global__ void device_estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
                                        size_t n,
                                        OutputIt out)
 {
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   auto const block = cooperative_groups::this_thread_block();
 
@@ -161,4 +161,4 @@ int main(void)
   }
 
   return 0;
-}
+}
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -121,8 +121,8 @@ constexpr auto distinct_count_estimator<T, Scope, Hash, Allocator>::hash_functio
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr cuda::std::span<std::byte> distinct_count_estimator<T, Scope, Hash, Allocator>::sketch()
-  const noexcept
+constexpr cuda::std::span<cuda::std::byte>
+distinct_count_estimator<T, Scope, Hash, Allocator>::sketch() const noexcept
 {
   return this->impl_->sketch();
 }

diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -19,7 +19,7 @@ namespace cuco {
 template <class T, cuda::thread_scope Scope, class Hash>
 __host__
   __device__ constexpr distinct_count_estimator_ref<T, Scope, Hash>::distinct_count_estimator_ref(
-    cuda::std::span<std::byte> sketch_span, Hash const& hash)
+    cuda::std::span<cuda::std::byte> sketch_span, Hash const& hash)
   : impl_{sketch_span, hash}
 {
 }
@@ -114,7 +114,7 @@ __host__ __device__ constexpr auto distinct_count_estimator_ref<T, Scope, Hash>:
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ __device__ constexpr cuda::std::span<std::byte>
+__host__ __device__ constexpr cuda::std::span<cuda::std::byte>
 distinct_count_estimator_ref<T, Scope, Hash>::sketch() const noexcept
 {
   return this->impl_.sketch();

diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -22,9 +22,9 @@
 #include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
-#include <cstddef>
 #include <iterator>
 #include <memory>
 
@@ -74,7 +74,8 @@ class hyperloglog {
     : allocator_{alloc},
       sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)),
               custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}},
-      ref_{cuda::std::span{reinterpret_cast<std::byte*>(this->sketch_.get()), sketch_size_b}, hash}
+      ref_{cuda::std::span{reinterpret_cast<cuda::std::byte*>(this->sketch_.get()), sketch_size_b},
+           hash}
   {
     this->ref_.clear_async(stream);
   }
@@ -290,7 +291,7 @@ class hyperloglog {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] constexpr cuda::std::span<std::byte> sketch() const noexcept
+  [[nodiscard]] constexpr cuda::std::span<cuda::std::byte> sketch() const noexcept
   {
     return this->ref_.sketch();
   }
@@ -352,4 +353,4 @@ class hyperloglog {
   template <class T_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
   friend class hyperloglog;
 };
-}  // namespace cuco::detail
+}  // namespace cuco::detail
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -27,6 +27,7 @@
 
 #include <cuda/atomic>
 #include <cuda/std/bit>
+#include <cuda/std/cstddef>
 #include <cuda/std/span>
 #include <cuda/std/utility>
 #include <cuda/stream_ref>
@@ -36,7 +37,6 @@
 #include <cooperative_groups/reduce.h>
 
 #include <algorithm>  // there is no <cuda/std/algorithm>
-#include <cstddef>
 #include <vector>
 
 namespace cuco::detail {
@@ -79,7 +79,7 @@ class hyperloglog_ref {
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<std::byte> sketch_span,
+  __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<cuda::std::byte> sketch_span,
                                                 Hash const& hash)
     : hash_{hash},
       precision_{cuda::std::countr_zero(
@@ -448,10 +448,11 @@ class hyperloglog_ref {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<std::byte> sketch() const noexcept
+  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<cuda::std::byte> sketch()
+    const noexcept
   {
-    return cuda::std::span<std::byte>(reinterpret_cast<std::byte*>(this->sketch_.data()),
-                                      this->sketch_bytes());
+    return cuda::std::span<cuda::std::byte>(
+      reinterpret_cast<cuda::std::byte*>(this->sketch_.data()), this->sketch_bytes());
   }
 
   /**

diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -46,7 +46,7 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
 
   // Base address of dynamic shared memory is guaranteed to be aligned to at least 16 bytes which is
   // sufficient for this purpose
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   auto const loop_stride = cuco::detail::grid_stride();
   auto idx               = cuco::detail::global_thread_id();
@@ -94,7 +94,7 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re
   using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
 
   // TODO assert alignment
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   auto const loop_stride = cuco::detail::grid_stride();
   auto idx               = cuco::detail::global_thread_id();
@@ -142,4 +142,4 @@ CUCO_KERNEL void estimate(std::size_t* cardinality, RefType ref)
     if (block.thread_rank() == 0) { *cardinality = estimate; }
   }
 }
-}  // namespace cuco::hyperloglog_ns::detail
+}  // namespace cuco::hyperloglog_ns::detail
diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
@@ -18,6 +18,7 @@
 #pragma once
 
 #include <cuda/std/array>
+#include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/device_vector.h>
@@ -78,7 +79,7 @@ struct rank {
  * @tparam Allocator Type of allocator used for device storage
  */
 // TODO: have to use device_malloc_allocator for now otherwise the container cannot grow
-template <class Allocator = thrust::device_malloc_allocator<std::byte>>
+template <class Allocator = thrust::device_malloc_allocator<cuda::std::byte>>
 class dynamic_bitset {
  public:
   using size_type = std::size_t;  ///< size type to specify bit index

diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
@@ -22,9 +22,9 @@
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
-#include <cstddef>
 #include <iterator>
 #include <memory>
 
@@ -43,7 +43,7 @@ namespace cuco {
 template <class T,
           cuda::thread_scope Scope = cuda::thread_scope_device,
           class Hash               = cuco::xxhash_64<T>,
-          class Allocator          = cuco::cuda_allocator<std::byte>>
+          class Allocator          = cuco::cuda_allocator<cuda::std::byte>>
 class distinct_count_estimator {
   using impl_type = detail::hyperloglog<T, Scope, Hash, Allocator>;
 
@@ -243,7 +243,7 @@ class distinct_count_estimator {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] constexpr cuda::std::span<std::byte> sketch() const noexcept;
+  [[nodiscard]] constexpr cuda::std::span<cuda::std::byte> sketch() const noexcept;
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.

diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
@@ -20,12 +20,11 @@
 #include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
 #include <cooperative_groups.h>
 
-#include <cstddef>
-
 namespace cuco {
 /**
  * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
@@ -64,8 +63,8 @@ class distinct_count_estimator_ref {
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  __host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span<std::byte> sketch_span,
-                                                             Hash const& hash = {});
+  __host__ __device__ constexpr distinct_count_estimator_ref(
+    cuda::std::span<cuda::std::byte> sketch_span, Hash const& hash = {});
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -211,7 +210,8 @@ class distinct_count_estimator_ref {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<std::byte> sketch() const noexcept;
+  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<cuda::std::byte> sketch()
+    const noexcept;
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.
@@ -255,4 +255,4 @@ class distinct_count_estimator_ref {
 };
 }  // namespace cuco
 
-#include <cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl>
+#include <cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl>
diff --git a/tests/distinct_count_estimator/device_ref_test.cu b/tests/distinct_count_estimator/device_ref_test.cu
@@ -19,14 +19,14 @@
 #include <cuco/distinct_count_estimator.cuh>
 #include <cuco/hash_functions.cuh>
 
+#include <cuda/std/cstddef>
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/generators/catch_generators.hpp>
 
 #include <cmath>
-#include <cstddef>
 #include <cstdint>
 
 template <typename Ref, typename InputIt, typename OutputIt>
@@ -35,7 +35,7 @@ __global__ void estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
                                 size_t n,
                                 OutputIt out)
 {
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   auto const block = cooperative_groups::this_thread_block();
 

diff --git a/tests/static_map/capacity_test.cu b/tests/static_map/capacity_test.cu
@@ -24,7 +24,7 @@ TEST_CASE("Static map capacity", "")
   using T          = int32_t;
   using ProbeT     = cuco::double_hashing<1, cuco::default_hash_function<Key>>;
   using Equal      = thrust::equal_to<Key>;
-  using AllocatorT = cuco::cuda_allocator<std::byte>;
+  using AllocatorT = cuco::cuda_allocator<cuda::std::byte>;
   using StorageT   = cuco::storage<2>;
 
   SECTION("zero capacity is allowed.")

diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu
@@ -70,7 +70,7 @@ TEMPLATE_TEST_CASE_SIG(
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               probe,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     num_keys * 2, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 

diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu
@@ -118,7 +118,7 @@ TEMPLATE_TEST_CASE_SIG(
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               probe,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}, cuco::erased_key<Key>{-2}};
 

diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu
@@ -114,7 +114,7 @@ TEMPLATE_TEST_CASE_SIG(
                                     cuda::thread_scope_device,
                                     thrust::equal_to<Key>,
                                     probe,
-                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::cuda_allocator<cuda::std::byte>,
                                     cuco::storage<2>>;
 
   auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};

diff --git a/tests/static_map/hash_test.cu b/tests/static_map/hash_test.cu
@@ -41,7 +41,7 @@ void test_hash_function()
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               cuco::linear_probing<1, Hash>,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 

diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu
@@ -72,7 +72,7 @@ TEMPLATE_TEST_CASE_SIG(
                                 cuda::thread_scope_device,
                                 thrust::equal_to<Key>,
                                 probe,
-                                cuco::cuda_allocator<std::byte>,
+                                cuco::cuda_allocator<cuda::std::byte>,
                                 cuco::storage<2>>{
       num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 

diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
@@ -175,7 +175,7 @@ TEMPLATE_TEST_CASE_SIG(
                                     cuda::thread_scope_device,
                                     thrust::equal_to<Key>,
                                     probe,
-                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::cuda_allocator<cuda::std::byte>,
                                     cuco::storage<2>>;
 
   SECTION("sentinel equals init; has_init = true")
@@ -213,7 +213,7 @@ TEMPLATE_TEST_CASE_SIG(
                                     cuda::thread_scope_device,
                                     thrust::equal_to<Key>,
                                     cuco::linear_probing<2, cuco::murmurhash3_32<Key>>,
-                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::cuda_allocator<cuda::std::byte>,
                                     cuco::storage<2>>;
 
   SECTION("sentinel equals init; has_init = true")
@@ -249,7 +249,7 @@ TEMPLATE_TEST_CASE_SIG(
                                     cuda::thread_scope_device,
                                     thrust::equal_to<Key>,
                                     cuco::linear_probing<1, cuco::murmurhash3_32<Key>>,
-                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::cuda_allocator<cuda::std::byte>,
                                     cuco::storage<2>>;
 
   SECTION("duplicate keys")

diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu
@@ -108,7 +108,7 @@ TEMPLATE_TEST_CASE_SIG(
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               probe,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 

diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu
@@ -182,7 +182,7 @@ TEMPLATE_TEST_CASE_SIG(
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               probe,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     extent_type{}, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 

diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu
@@ -96,7 +96,7 @@ TEMPLATE_TEST_CASE_SIG(
                                                  cuda::thread_scope_device,
                                                  thrust::equal_to<T>,
                                                  probe,
-                                                 cuco::cuda_allocator<std::byte>,
+                                                 cuco::cuda_allocator<cuda::std::byte>,
                                                  cuco::storage<2>>{
     num_keys * multiplicity, cuco::empty_key<T>{-1}, cuco::empty_value<T>{-1}};
 

diff --git a/tests/static_multimap/insert_contains_test.cu b/tests/static_multimap/insert_contains_test.cu
@@ -116,7 +116,7 @@ TEMPLATE_TEST_CASE_SIG(
                                                  cuda::thread_scope_device,
                                                  thrust::equal_to<Key>,
                                                  probe,
-                                                 cuco::cuda_allocator<std::byte>,
+                                                 cuco::cuda_allocator<cuda::std::byte>,
                                                  cuco::storage<2>>{
     extent_type{num_keys}, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};