From 1d9da6d41b8c90c0e441ded72f9f394ea43f2578 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 18 Sep 2024 16:20:30 -0700
Subject: [PATCH] Use `cuda::std::byte` as the default device storage type
 (#597)

Address
https://github.com/NVIDIA/cuCollections/pull/573#discussion_r1737574327

This PR updates the existing code to use `cuda::std::byte` as the
default device data type instead of `std::byte`. This change addresses
potential issues where `cuda::std::` utilities cannot be applied to
`std::byte` when relaxed constexprs are disabled.
---
 .../distinct_count_estimator/device_ref_example.cu   |  8 ++++----
 .../distinct_count_estimator.inl                     |  4 ++--
 .../distinct_count_estimator_ref.inl                 |  4 ++--
 include/cuco/detail/hyperloglog/hyperloglog.cuh      |  9 +++++----
 include/cuco/detail/hyperloglog/hyperloglog_ref.cuh  | 11 ++++++-----
 include/cuco/detail/hyperloglog/kernels.cuh          |  6 +++---
 .../detail/trie/dynamic_bitset/dynamic_bitset.cuh    |  3 ++-
 include/cuco/distinct_count_estimator.cuh            |  6 +++---
 include/cuco/distinct_count_estimator_ref.cuh        | 12 ++++++------
 tests/distinct_count_estimator/device_ref_test.cu    |  4 ++--
 tests/static_map/capacity_test.cu                    |  2 +-
 tests/static_map/duplicate_keys_test.cu              |  2 +-
 tests/static_map/erase_test.cu                       |  2 +-
 tests/static_map/for_each_test.cu                    |  2 +-
 tests/static_map/hash_test.cu                        |  2 +-
 tests/static_map/insert_and_find_test.cu             |  2 +-
 tests/static_map/insert_or_apply_test.cu             |  6 +++---
 tests/static_map/insert_or_assign_test.cu            |  2 +-
 tests/static_map/unique_sequence_test.cu             |  2 +-
 tests/static_multimap/count_test.cu                  |  2 +-
 tests/static_multimap/insert_contains_test.cu        |  2 +-
 tests/static_multimap/insert_if_test.cu              |  2 +-
 tests/static_set/capacity_test.cu                    |  2 +-
 23 files changed, 50 insertions(+), 47 deletions(-)
diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/distinct_count_estimator/device_ref_example.cu
index ab4d1929f..d9a7078a0 100644
--- a/examples/distinct_count_estimator/device_ref_example.cu
+++ b/examples/distinct_count_estimator/device_ref_example.cu
@@ -15,10 +15,10 @@
  */
 #include <cuco/distinct_count_estimator.cuh>
 
+#include <cuda/std/cstddef>
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 
-#include <cstddef>
 #include <iostream>
 
 /**
@@ -37,7 +37,7 @@ __global__ void fused_kernel(RefType ref, InputIt first, std::size_t n)
   using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
 
   // Shared memory storage for the block-local estimator
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   // The following check is optional since the base address of dynamic shared memory is guaranteed
   // to meet the alignment requirements
@@ -94,7 +94,7 @@ __global__ void device_estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
                                        size_t n,
                                        OutputIt out)
 {
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   auto const block = cooperative_groups::this_thread_block();
 
@@ -161,4 +161,4 @@ int main(void)
   }
 
   return 0;
-}
\ No newline at end of file
+}
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
index be5c3764e..b3ee95891 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
@@ -121,8 +121,8 @@ constexpr auto distinct_count_estimator<T, Scope, Hash, Allocator>::hash_functio
 }
 
 template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr cuda::std::span<std::byte> distinct_count_estimator<T, Scope, Hash, Allocator>::sketch()
-  const noexcept
+constexpr cuda::std::span<cuda::std::byte>
+distinct_count_estimator<T, Scope, Hash, Allocator>::sketch() const noexcept
 {
   return this->impl_->sketch();
 }
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
index 6ded30148..bf222986c 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
@@ -19,7 +19,7 @@ namespace cuco {
 template <class T, cuda::thread_scope Scope, class Hash>
 __host__
   __device__ constexpr distinct_count_estimator_ref<T, Scope, Hash>::distinct_count_estimator_ref(
-    cuda::std::span<std::byte> sketch_span, Hash const& hash)
+    cuda::std::span<cuda::std::byte> sketch_span, Hash const& hash)
   : impl_{sketch_span, hash}
 {
 }
@@ -114,7 +114,7 @@ __host__ __device__ constexpr auto distinct_count_estimator_ref<T, Scope, Hash>:
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ __device__ constexpr cuda::std::span<std::byte>
+__host__ __device__ constexpr cuda::std::span<cuda::std::byte>
 distinct_count_estimator_ref<T, Scope, Hash>::sketch() const noexcept
 {
   return this->impl_.sketch();
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
index 98a5e4857..3ea977af0 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog.cuh
@@ -22,9 +22,9 @@
 #include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
-#include <cstddef>
 #include <iterator>
 #include <memory>
 
@@ -74,7 +74,8 @@ class hyperloglog {
     : allocator_{alloc},
       sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)),
               custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}},
-      ref_{cuda::std::span{reinterpret_cast<std::byte*>(this->sketch_.get()), sketch_size_b}, hash}
+      ref_{cuda::std::span{reinterpret_cast<cuda::std::byte*>(this->sketch_.get()), sketch_size_b},
+           hash}
   {
     this->ref_.clear_async(stream);
   }
@@ -290,7 +291,7 @@ class hyperloglog {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] constexpr cuda::std::span<std::byte> sketch() const noexcept
+  [[nodiscard]] constexpr cuda::std::span<cuda::std::byte> sketch() const noexcept
   {
     return this->ref_.sketch();
   }
@@ -352,4 +353,4 @@ class hyperloglog {
   template <class T_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
   friend class hyperloglog;
 };
-}  // namespace cuco::detail
\ No newline at end of file
+}  // namespace cuco::detail
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 08db69018..5a656e325 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -27,6 +27,7 @@
 
 #include <cuda/atomic>
 #include <cuda/std/bit>
+#include <cuda/std/cstddef>
 #include <cuda/std/span>
 #include <cuda/std/utility>
 #include <cuda/stream_ref>
@@ -36,7 +37,6 @@
 #include <cooperative_groups/reduce.h>
 
 #include <algorithm>  // there is no <cuda/std/algorithm>
-#include <cstddef>
 #include <vector>
 
 namespace cuco::detail {
@@ -79,7 +79,7 @@ class hyperloglog_ref {
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<std::byte> sketch_span,
+  __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<cuda::std::byte> sketch_span,
                                                 Hash const& hash)
     : hash_{hash},
       precision_{cuda::std::countr_zero(
@@ -448,10 +448,11 @@ class hyperloglog_ref {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<std::byte> sketch() const noexcept
+  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<cuda::std::byte> sketch()
+    const noexcept
   {
-    return cuda::std::span<std::byte>(reinterpret_cast<std::byte*>(this->sketch_.data()),
-                                      this->sketch_bytes());
+    return cuda::std::span<cuda::std::byte>(
+      reinterpret_cast<cuda::std::byte*>(this->sketch_.data()), this->sketch_bytes());
   }
 
   /**
diff --git a/include/cuco/detail/hyperloglog/kernels.cuh b/include/cuco/detail/hyperloglog/kernels.cuh
index abe02a5e3..f5dc22535 100644
--- a/include/cuco/detail/hyperloglog/kernels.cuh
+++ b/include/cuco/detail/hyperloglog/kernels.cuh
@@ -46,7 +46,7 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,
 
   // Base address of dynamic shared memory is guaranteed to be aligned to at least 16 bytes which is
   // sufficient for this purpose
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   auto const loop_stride = cuco::detail::grid_stride();
   auto idx               = cuco::detail::global_thread_id();
@@ -94,7 +94,7 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re
   using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;
 
   // TODO assert alignment
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   auto const loop_stride = cuco::detail::grid_stride();
   auto idx               = cuco::detail::global_thread_id();
@@ -142,4 +142,4 @@ CUCO_KERNEL void estimate(std::size_t* cardinality, RefType ref)
     if (block.thread_rank() == 0) { *cardinality = estimate; }
   }
 }
-}  // namespace cuco::hyperloglog_ns::detail
\ No newline at end of file
+}  // namespace cuco::hyperloglog_ns::detail
diff --git a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
index be324ab8d..abdad2719 100644
--- a/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
+++ b/include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
@@ -18,6 +18,7 @@
 #pragma once
 
 #include <cuda/std/array>
+#include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 #include <thrust/device_malloc_allocator.h>
 #include <thrust/device_vector.h>
@@ -78,7 +79,7 @@ struct rank {
  * @tparam Allocator Type of allocator used for device storage
  */
 // TODO: have to use device_malloc_allocator for now otherwise the container cannot grow
-template <class Allocator = thrust::device_malloc_allocator<std::byte>>
+template <class Allocator = thrust::device_malloc_allocator<cuda::std::byte>>
 class dynamic_bitset {
  public:
   using size_type = std::size_t;  ///< size type to specify bit index
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/distinct_count_estimator.cuh
index feb8f00a0..5d3c7b6aa 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/distinct_count_estimator.cuh
@@ -22,9 +22,9 @@
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
-#include <cstddef>
 #include <iterator>
 #include <memory>
 
@@ -43,7 +43,7 @@ namespace cuco {
 template <class T,
           cuda::thread_scope Scope = cuda::thread_scope_device,
           class Hash               = cuco::xxhash_64<T>,
-          class Allocator          = cuco::cuda_allocator<std::byte>>
+          class Allocator          = cuco::cuda_allocator<cuda::std::byte>>
 class distinct_count_estimator {
   using impl_type = detail::hyperloglog<T, Scope, Hash, Allocator>;
 
@@ -243,7 +243,7 @@ class distinct_count_estimator {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] constexpr cuda::std::span<std::byte> sketch() const noexcept;
+  [[nodiscard]] constexpr cuda::std::span<cuda::std::byte> sketch() const noexcept;
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/distinct_count_estimator_ref.cuh
index cb566990e..799bb46c7 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/distinct_count_estimator_ref.cuh
@@ -20,12 +20,11 @@
 #include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
 
+#include <cuda/std/cstddef>
 #include <cuda/stream_ref>
 
 #include <cooperative_groups.h>
 
-#include <cstddef>
-
 namespace cuco {
 /**
  * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
@@ -64,8 +63,8 @@ class distinct_count_estimator_ref {
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  __host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span<std::byte> sketch_span,
-                                                             Hash const& hash = {});
+  __host__ __device__ constexpr distinct_count_estimator_ref(
+    cuda::std::span<cuda::std::byte> sketch_span, Hash const& hash = {});
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -211,7 +210,8 @@ class distinct_count_estimator_ref {
    *
    * @return The cuda::std::span of the sketch
    */
-  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<std::byte> sketch() const noexcept;
+  [[nodiscard]] __host__ __device__ constexpr cuda::std::span<cuda::std::byte> sketch()
+    const noexcept;
 
   /**
    * @brief Gets the number of bytes required for the sketch storage.
@@ -255,4 +255,4 @@ class distinct_count_estimator_ref {
 };
 }  // namespace cuco
 
-#include <cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl>
\ No newline at end of file
+#include <cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl>
diff --git a/tests/distinct_count_estimator/device_ref_test.cu b/tests/distinct_count_estimator/device_ref_test.cu
index 2866f163b..ad40e7ab6 100644
--- a/tests/distinct_count_estimator/device_ref_test.cu
+++ b/tests/distinct_count_estimator/device_ref_test.cu
@@ -19,6 +19,7 @@
 #include <cuco/distinct_count_estimator.cuh>
 #include <cuco/hash_functions.cuh>
 
+#include <cuda/std/cstddef>
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
 
@@ -26,7 +27,6 @@
 #include <catch2/generators/catch_generators.hpp>
 
 #include <cmath>
-#include <cstddef>
 #include <cstdint>
 
 template <typename Ref, typename InputIt, typename OutputIt>
@@ -35,7 +35,7 @@ __global__ void estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
                                 size_t n,
                                 OutputIt out)
 {
-  extern __shared__ std::byte local_sketch[];
+  extern __shared__ cuda::std::byte local_sketch[];
 
   auto const block = cooperative_groups::this_thread_block();
 
diff --git a/tests/static_map/capacity_test.cu b/tests/static_map/capacity_test.cu
index 3e54acac0..fb1ae7491 100644
--- a/tests/static_map/capacity_test.cu
+++ b/tests/static_map/capacity_test.cu
@@ -24,7 +24,7 @@ TEST_CASE("Static map capacity", "")
   using T          = int32_t;
   using ProbeT     = cuco::double_hashing<1, cuco::default_hash_function<Key>>;
   using Equal      = thrust::equal_to<Key>;
-  using AllocatorT = cuco::cuda_allocator<std::byte>;
+  using AllocatorT = cuco::cuda_allocator<cuda::std::byte>;
   using StorageT   = cuco::storage<2>;
 
   SECTION("zero capacity is allowed.")
diff --git a/tests/static_map/duplicate_keys_test.cu b/tests/static_map/duplicate_keys_test.cu
index 5356fcbe0..6af8d0acf 100644
--- a/tests/static_map/duplicate_keys_test.cu
+++ b/tests/static_map/duplicate_keys_test.cu
@@ -70,7 +70,7 @@ TEMPLATE_TEST_CASE_SIG(
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               probe,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     num_keys * 2, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
diff --git a/tests/static_map/erase_test.cu b/tests/static_map/erase_test.cu
index 5d478fa32..4d68a680d 100644
--- a/tests/static_map/erase_test.cu
+++ b/tests/static_map/erase_test.cu
@@ -118,7 +118,7 @@ TEMPLATE_TEST_CASE_SIG(
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               probe,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}, cuco::erased_key<Key>{-2}};
 
diff --git a/tests/static_map/for_each_test.cu b/tests/static_map/for_each_test.cu
index 1c72a2e58..9fc518ae9 100644
--- a/tests/static_map/for_each_test.cu
+++ b/tests/static_map/for_each_test.cu
@@ -114,7 +114,7 @@ TEMPLATE_TEST_CASE_SIG(
                                     cuda::thread_scope_device,
                                     thrust::equal_to<Key>,
                                     probe,
-                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::cuda_allocator<cuda::std::byte>,
                                     cuco::storage<2>>;
 
   auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
diff --git a/tests/static_map/hash_test.cu b/tests/static_map/hash_test.cu
index c22eae998..4f00b72b6 100644
--- a/tests/static_map/hash_test.cu
+++ b/tests/static_map/hash_test.cu
@@ -41,7 +41,7 @@ void test_hash_function()
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               cuco::linear_probing<1, Hash>,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
diff --git a/tests/static_map/insert_and_find_test.cu b/tests/static_map/insert_and_find_test.cu
index 7c000fe29..698b2ef40 100644
--- a/tests/static_map/insert_and_find_test.cu
+++ b/tests/static_map/insert_and_find_test.cu
@@ -72,7 +72,7 @@ TEMPLATE_TEST_CASE_SIG(
                                 cuda::thread_scope_device,
                                 thrust::equal_to<Key>,
                                 probe,
-                                cuco::cuda_allocator<std::byte>,
+                                cuco::cuda_allocator<cuda::std::byte>,
                                 cuco::storage<2>>{
       num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
diff --git a/tests/static_map/insert_or_apply_test.cu b/tests/static_map/insert_or_apply_test.cu
index 1b818e38c..9663c95a3 100644
--- a/tests/static_map/insert_or_apply_test.cu
+++ b/tests/static_map/insert_or_apply_test.cu
@@ -175,7 +175,7 @@ TEMPLATE_TEST_CASE_SIG(
                                     cuda::thread_scope_device,
                                     thrust::equal_to<Key>,
                                     probe,
-                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::cuda_allocator<cuda::std::byte>,
                                     cuco::storage<2>>;
 
   SECTION("sentinel equals init; has_init = true")
@@ -213,7 +213,7 @@ TEMPLATE_TEST_CASE_SIG(
                                     cuda::thread_scope_device,
                                     thrust::equal_to<Key>,
                                     cuco::linear_probing<2, cuco::murmurhash3_32<Key>>,
-                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::cuda_allocator<cuda::std::byte>,
                                     cuco::storage<2>>;
 
   SECTION("sentinel equals init; has_init = true")
@@ -249,7 +249,7 @@ TEMPLATE_TEST_CASE_SIG(
                                     cuda::thread_scope_device,
                                     thrust::equal_to<Key>,
                                     cuco::linear_probing<1, cuco::murmurhash3_32<Key>>,
-                                    cuco::cuda_allocator<std::byte>,
+                                    cuco::cuda_allocator<cuda::std::byte>,
                                     cuco::storage<2>>;
 
   SECTION("duplicate keys")
diff --git a/tests/static_map/insert_or_assign_test.cu b/tests/static_map/insert_or_assign_test.cu
index fe1f1a51b..a6ad767d6 100644
--- a/tests/static_map/insert_or_assign_test.cu
+++ b/tests/static_map/insert_or_assign_test.cu
@@ -108,7 +108,7 @@ TEMPLATE_TEST_CASE_SIG(
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               probe,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
diff --git a/tests/static_map/unique_sequence_test.cu b/tests/static_map/unique_sequence_test.cu
index 2f217eed4..22cfd2d4a 100644
--- a/tests/static_map/unique_sequence_test.cu
+++ b/tests/static_map/unique_sequence_test.cu
@@ -182,7 +182,7 @@ TEMPLATE_TEST_CASE_SIG(
                               cuda::thread_scope_device,
                               thrust::equal_to<Key>,
                               probe,
-                              cuco::cuda_allocator<std::byte>,
+                              cuco::cuda_allocator<cuda::std::byte>,
                               cuco::storage<2>>{
     extent_type{}, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
diff --git a/tests/static_multimap/count_test.cu b/tests/static_multimap/count_test.cu
index 8eacfa729..b892d27b7 100644
--- a/tests/static_multimap/count_test.cu
+++ b/tests/static_multimap/count_test.cu
@@ -96,7 +96,7 @@ TEMPLATE_TEST_CASE_SIG(
                                                  cuda::thread_scope_device,
                                                  thrust::equal_to<T>,
                                                  probe,
-                                                 cuco::cuda_allocator<std::byte>,
+                                                 cuco::cuda_allocator<cuda::std::byte>,
                                                  cuco::storage<2>>{
     num_keys * multiplicity, cuco::empty_key<T>{-1}, cuco::empty_value<T>{-1}};
 
diff --git a/tests/static_multimap/insert_contains_test.cu b/tests/static_multimap/insert_contains_test.cu
index a3fa36648..efe3caad3 100644
--- a/tests/static_multimap/insert_contains_test.cu
+++ b/tests/static_multimap/insert_contains_test.cu
@@ -116,7 +116,7 @@ TEMPLATE_TEST_CASE_SIG(
                                                  cuda::thread_scope_device,
                                                  thrust::equal_to<Key>,
                                                  probe,
-                                                 cuco::cuda_allocator<std::byte>,
+                                                 cuco::cuda_allocator<cuda::std::byte>,
                                                  cuco::storage<2>>{
     extent_type{num_keys}, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
diff --git a/tests/static_multimap/insert_if_test.cu b/tests/static_multimap/insert_if_test.cu
index 40697f40a..300597134 100644
--- a/tests/static_multimap/insert_if_test.cu
+++ b/tests/static_multimap/insert_if_test.cu
@@ -101,7 +101,7 @@ TEMPLATE_TEST_CASE_SIG(
                                                  cuda::thread_scope_device,
                                                  thrust::equal_to<Key>,
                                                  probe,
-                                                 cuco::cuda_allocator<std::byte>,
+                                                 cuco::cuda_allocator<cuda::std::byte>,
                                                  cuco::storage<2>>{
     num_keys * 2, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};
 
diff --git a/tests/static_set/capacity_test.cu b/tests/static_set/capacity_test.cu
index 4432e56cc..acfa8d5d8 100644
--- a/tests/static_set/capacity_test.cu
+++ b/tests/static_set/capacity_test.cu
@@ -23,7 +23,7 @@ TEST_CASE("Static set capacity", "")
   using Key        = int32_t;
   using ProbeT     = cuco::double_hashing<1, cuco::default_hash_function<Key>>;
   using Equal      = thrust::equal_to<Key>;
-  using AllocatorT = cuco::cuda_allocator<std::byte>;
+  using AllocatorT = cuco::cuda_allocator<cuda::std::byte>;
   using StorageT   = cuco::storage<2>;
 
   SECTION("zero capacity is allowed.")