From 4454de4b878f31d41c5b7578fe6ca24bba5ea3f4 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Thu, 29 Aug 2024 14:37:02 -0700
Subject: [PATCH] Disable `--expt-relaxed-constexpr` with CCCL enhancements
 (#595)

With CCCL now more mature, building cuCollections with
`--expt-relaxed-constexpr` is no longer necessary. This PR updates the
implementations so that they compile with the flag disabled.
---
 benchmarks/CMakeLists.txt                     |  2 +-
 .../hash_function/hash_function_bench.cu      | 21 ++++++++--------
 examples/CMakeLists.txt                       |  4 +--
 examples/static_set/device_subsets_example.cu |  2 +-
 include/cuco/detail/__config                  |  4 ---
 .../detail/hash_functions/murmurhash3.cuh     | 24 ++++++++++--------
 include/cuco/detail/hash_functions/utils.cuh  |  8 +++---
 include/cuco/detail/hash_functions/xxhash.cuh | 19 +++++++-------
 .../detail/hyperloglog/hyperloglog_ref.cuh    |  6 ++---
 .../open_addressing_ref_impl.cuh              |  9 ++++---
 include/cuco/detail/probe_sequence_impl.cuh   | 12 ++++-----
 include/cuco/static_multimap.cuh              |  9 ++++---
 include/cuco/utility/key_generator.cuh        | 24 ++++++++++--------
 tests/CMakeLists.txt                          |  2 +-
 tests/static_map/custom_type_test.cu          |  4 +--
 tests/static_multimap/custom_type_test.cu     |  8 +++---
 tests/utility/hash_test.cu                    | 25 ++++++++++---------
 17 files changed, 96 insertions(+), 87 deletions(-)
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index d03c51765..1eb09c0b4 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -35,7 +35,7 @@ function(ConfigureBench BENCH_NAME)
                                         RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmarks")
     target_include_directories(${BENCH_NAME} PRIVATE
                                              "${CMAKE_CURRENT_SOURCE_DIR}")
-    target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda --expt-relaxed-constexpr -lineinfo)
+    target_compile_options(${BENCH_NAME} PRIVATE --expt-extended-lambda -lineinfo)
     target_link_libraries(${BENCH_NAME} PRIVATE
                                         nvbench::main
                                         pthread
diff --git a/benchmarks/hash_function/hash_function_bench.cu b/benchmarks/hash_function/hash_function_bench.cu
index e02ba8f0a..d48b28cd4 100644
--- a/benchmarks/hash_function/hash_function_bench.cu
+++ b/benchmarks/hash_function/hash_function_bench.cu
@@ -21,9 +21,9 @@
 
 #include <nvbench/nvbench.cuh>
 
+#include <cuda/std/cstddef>
 #include <thrust/device_vector.h>
 
-#include <cstddef>
 #include <cstdint>
 #include <type_traits>
 
@@ -139,8 +139,8 @@ __global__ void string_hash_bench_kernel(
 template <typename Hash>
 void string_hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
 {
-  static_assert(std::is_same_v<typename Hash::argument_type, std::byte>,
-                "Argument type must be std::byte");
+  static_assert(std::is_same_v<typename Hash::argument_type, cuda::std::byte>,
+                "Argument type must be cuda::std::byte");
 
   bool const materialize_result = false;
   constexpr auto block_size     = 128;
@@ -164,7 +164,7 @@ void string_hash_eval(nvbench::state& state, nvbench::type_list<Hash>)
                                                                                      : 1);
 
   state.add_element_count(num_keys);
-  // state.add_global_memory_reads<std::byte>(storage.size() * n_repeats);
+  // state.add_global_memory_reads<cuda::std::byte>(storage.size() * n_repeats);
 
   state.exec([&](nvbench::launch& launch) {
     string_hash_bench_kernel<block_size><<<grid_size, block_size, 0, launch.get_stream()>>>(
@@ -196,12 +196,13 @@ NVBENCH_BENCH_TYPES(
   .set_max_noise(cuco::benchmark::defaults::MAX_NOISE)
   .add_int64_axis("NumInputs", {cuco::benchmark::defaults::N * 10});
 
-NVBENCH_BENCH_TYPES(string_hash_eval,
-                    NVBENCH_TYPE_AXES(nvbench::type_list<cuco::murmurhash3_32<std::byte>,
-                                                         cuco::xxhash_32<std::byte>,
-                                                         cuco::xxhash_64<std::byte>,
-                                                         cuco::murmurhash3_x86_128<std::byte>,
-                                                         cuco::murmurhash3_x64_128<std::byte>>))
+NVBENCH_BENCH_TYPES(
+  string_hash_eval,
+  NVBENCH_TYPE_AXES(nvbench::type_list<cuco::murmurhash3_32<cuda::std::byte>,
+                                       cuco::xxhash_32<cuda::std::byte>,
+                                       cuco::xxhash_64<cuda::std::byte>,
+                                       cuco::murmurhash3_x86_128<cuda::std::byte>,
+                                       cuco::murmurhash3_x64_128<cuda::std::byte>>))
   .set_name("string_hash_function_eval")
   .set_type_axes_names({"Hash"})
   .set_max_noise(cuco::benchmark::defaults::MAX_NOISE)
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index b5fafd152..3dad563e0 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -1,5 +1,5 @@
 ﻿#=============================================================================
-# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+# Copyright (c) 2018-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -25,7 +25,7 @@ function(ConfigureExample EXAMPLE_NAME EXAMPLE_SRC)
     target_include_directories(${EXAMPLE_NAME} PRIVATE
                                              "${CMAKE_CURRENT_SOURCE_DIR}")
     target_compile_options(${EXAMPLE_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra
-      --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage)
+      --expt-extended-lambda -Xcompiler -Wno-subobject-linkage)
     target_link_libraries(${EXAMPLE_NAME} PRIVATE cuco CUDA::cudart)
 endfunction(ConfigureExample)
 
diff --git a/examples/static_set/device_subsets_example.cu b/examples/static_set/device_subsets_example.cu
index e7276bf16..4e479f27b 100644
--- a/examples/static_set/device_subsets_example.cu
+++ b/examples/static_set/device_subsets_example.cu
@@ -64,7 +64,7 @@ using ref_type         = cuco::static_set_ref<key_type,
                                               storage_ref_type>;  ///< Set ref type
 
 /// Sample data to insert and query
-__device__ constexpr std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
+__device__ constexpr cuda::std::array<key_type, N> data = {1, 3, 5, 7, 9, 11, 13, 15, 17, 19};
 /// Empty slots are represented by reserved "sentinel" values. These values should be selected such
 /// that they never occur in your input data.
 key_type constexpr empty_key_sentinel = -1;
diff --git a/include/cuco/detail/__config b/include/cuco/detail/__config
index 24c7758ab..fd999aa52 100644
--- a/include/cuco/detail/__config
+++ b/include/cuco/detail/__config
@@ -25,10 +25,6 @@
 #error "NVCC version 11.5 or later is required"
 #endif
 
-#if !defined(__CUDACC_RELAXED_CONSTEXPR__)
-#error "Support for relaxed constexpr is required (nvcc flag --expt-relaxed-constexpr)"
-#endif
-
 #if !defined(__CUDACC_EXTENDED_LAMBDA__)
 #error "Support for extended device lambdas is required (nvcc flag --expt-extended-lambda)"
 #endif
diff --git a/include/cuco/detail/hash_functions/murmurhash3.cuh b/include/cuco/detail/hash_functions/murmurhash3.cuh
index 01aeeeead..f99c04c75 100644
--- a/include/cuco/detail/hash_functions/murmurhash3.cuh
+++ b/include/cuco/detail/hash_functions/murmurhash3.cuh
@@ -20,9 +20,9 @@
 #include <cuco/extent.cuh>
 
 #include <cuda/std/array>
+#include <cuda/std/cstddef>
 #include <cuda/std/type_traits>
 
-#include <cstddef>
 #include <cstdint>
 
 namespace cuco::detail {
@@ -146,7 +146,7 @@ struct MurmurHash3_32 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+    return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                         cuco::extent<std::size_t, sizeof(Key)>{});
   }
 
@@ -160,7 +160,7 @@ struct MurmurHash3_32 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     auto const nblocks = size / 4;
@@ -183,10 +183,14 @@ struct MurmurHash3_32 {
     // tail
     std::uint32_t k1 = 0;
     switch (size & 3) {
-      case 3: k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 2]) << 16; [[fallthrough]];
-      case 2: k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 1]) << 8; [[fallthrough]];
+      case 3:
+        k1 ^= cuda::std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 2]) << 16;
+        [[fallthrough]];
+      case 2:
+        k1 ^= cuda::std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 1]) << 8;
+        [[fallthrough]];
       case 1:
-        k1 ^= std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 0]);
+        k1 ^= cuda::std::to_integer<std::uint32_t>(bytes[nblocks * 4 + 0]);
         k1 *= c1;
         k1 = rotl32(k1, 15);
         k1 *= c2;
@@ -247,7 +251,7 @@ struct MurmurHash3_x64_128 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+    return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                         cuco::extent<std::size_t, sizeof(Key)>{});
   }
 
@@ -261,7 +265,7 @@ struct MurmurHash3_x64_128 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     constexpr std::uint32_t block_size = 16;
@@ -390,7 +394,7 @@ struct MurmurHash3_x86_128 {
    */
   constexpr result_type __host__ __device__ operator()(Key const& key) const noexcept
   {
-    return compute_hash(reinterpret_cast<std::byte const*>(&key),
+    return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                         cuco::extent<std::size_t, sizeof(Key)>{});
   }
 
@@ -404,7 +408,7 @@ struct MurmurHash3_x86_128 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     constexpr std::uint32_t block_size = 16;
diff --git a/include/cuco/detail/hash_functions/utils.cuh b/include/cuco/detail/hash_functions/utils.cuh
index d2a7421dd..a6bb2a4a7 100644
--- a/include/cuco/detail/hash_functions/utils.cuh
+++ b/include/cuco/detail/hash_functions/utils.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,12 +16,14 @@
 
 #pragma once
 
+#include <cuda/std/cstddef>
+
 namespace cuco::detail {
 
 template <typename T, typename U, typename Extent>
 constexpr __host__ __device__ T load_chunk(U const* const data, Extent index) noexcept
 {
-  auto const bytes = reinterpret_cast<std::byte const*>(data);
+  auto const bytes = reinterpret_cast<cuda::std::byte const*>(data);
   T chunk;
   memcpy(&chunk, bytes + index * sizeof(T), sizeof(T));
   return chunk;
@@ -37,4 +39,4 @@ constexpr __host__ __device__ std::uint64_t rotl64(std::uint64_t x, std::int8_t
   return (x << r) | (x >> (64 - r));
 }
 
-};  // namespace cuco::detail
\ No newline at end of file
+};  // namespace cuco::detail
diff --git a/include/cuco/detail/hash_functions/xxhash.cuh b/include/cuco/detail/hash_functions/xxhash.cuh
index 709045060..4ef75c782 100644
--- a/include/cuco/detail/hash_functions/xxhash.cuh
+++ b/include/cuco/detail/hash_functions/xxhash.cuh
@@ -19,7 +19,8 @@
 #include <cuco/detail/hash_functions/utils.cuh>
 #include <cuco/extent.cuh>
 
-#include <cstddef>
+#include <cuda/std/cstddef>
+
 #include <cstdint>
 
 namespace cuco::detail {
@@ -91,10 +92,10 @@ struct XXHash_32 {
   {
     if constexpr (sizeof(Key) <= 16) {
       Key const key_copy = key;
-      return compute_hash(reinterpret_cast<std::byte const*>(&key_copy),
+      return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key_copy),
                           cuco::extent<std::size_t, sizeof(Key)>{});
     } else {
-      return compute_hash(reinterpret_cast<std::byte const*>(&key),
+      return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                           cuco::extent<std::size_t, sizeof(Key)>{});
     }
   }
@@ -109,7 +110,7 @@ struct XXHash_32 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     std::size_t offset = 0;
@@ -159,7 +160,7 @@ struct XXHash_32 {
     // the following loop is only needed if the size of the key is not a multiple of the block size
     if (size % 4) {
       while (offset < size) {
-        h32 += (std::to_integer<std::uint32_t>(bytes[offset]) & 255) * prime5;
+        h32 += (cuda::std::to_integer<std::uint32_t>(bytes[offset]) & 255) * prime5;
         h32 = rotl32(h32, 11) * prime1;
         ++offset;
       }
@@ -254,10 +255,10 @@ struct XXHash_64 {
   {
     if constexpr (sizeof(Key) <= 16) {
       Key const key_copy = key;
-      return compute_hash(reinterpret_cast<std::byte const*>(&key_copy),
+      return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key_copy),
                           cuco::extent<std::size_t, sizeof(Key)>{});
     } else {
-      return compute_hash(reinterpret_cast<std::byte const*>(&key),
+      return compute_hash(reinterpret_cast<cuda::std::byte const*>(&key),
                           cuco::extent<std::size_t, sizeof(Key)>{});
     }
   }
@@ -272,7 +273,7 @@ struct XXHash_64 {
    * @return The resulting hash value
    */
   template <typename Extent>
-  constexpr result_type __host__ __device__ compute_hash(std::byte const* bytes,
+  constexpr result_type __host__ __device__ compute_hash(cuda::std::byte const* bytes,
                                                          Extent size) const noexcept
   {
     std::size_t offset = 0;
@@ -357,7 +358,7 @@ struct XXHash_64 {
     // block size
     if (size % 4) {
       while (offset < size) {
-        h64 ^= (std::to_integer<std::uint32_t>(bytes[offset]) & 0xff) * prime5;
+        h64 ^= (cuda::std::to_integer<std::uint32_t>(bytes[offset]) & 0xff) * prime5;
         h64 = rotl64(h64, 11) * prime1;
         ++offset;
       }
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
index 5597e1d6f..08db69018 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
@@ -475,8 +475,8 @@ class hyperloglog_ref {
     cuco::sketch_size_kb sketch_size_kb) noexcept
   {
     // minimum precision is 4 or 64 bytes
-    return std::max(static_cast<std::size_t>(sizeof(register_type) * 1ull << 4),
-                    cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024)));
+    return cuda::std::max(static_cast<std::size_t>(sizeof(register_type) * 1ull << 4),
+                          cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024)));
   }
 
   /**
@@ -493,7 +493,7 @@ class hyperloglog_ref {
     // https://github.com/apache/spark/blob/6a27789ad7d59cd133653a49be0bb49729542abe/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/util/HyperLogLogPlusPlusHelper.scala#L43
 
     //  minimum precision is 4 or 64 bytes
-    auto const precision = std::max(
+    auto const precision = cuda::std::max(
       static_cast<int32_t>(4),
       static_cast<int32_t>(
         cuda::std::ceil(2.0 * cuda::std::log(1.106 / standard_deviation) / cuda::std::log(2.0))));
diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
index f4c20f829..c78705804 100644
--- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
+++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh
@@ -213,8 +213,8 @@ class open_addressing_ref_impl {
    *
    * @return The key equality predicate
    */
-  [[nodiscard]] __device__ constexpr detail::equal_wrapper<key_type, key_equal> const& predicate()
-    const noexcept
+  [[nodiscard]] __host__ __device__ constexpr detail::equal_wrapper<key_type, key_equal> const&
+  predicate() const noexcept
   {
     return this->predicate_;
   }
@@ -255,7 +255,7 @@ class open_addressing_ref_impl {
    *
    * @return The non-owning storage ref of the container
    */
-  [[nodiscard]] __device__ constexpr storage_ref_type const& storage_ref() const noexcept
+  [[nodiscard]] __host__ __device__ constexpr storage_ref_type const& storage_ref() const noexcept
   {
     return storage_ref_;
   }
@@ -1142,7 +1142,8 @@ class open_addressing_ref_impl {
    * @return The key
    */
   template <typename Value>
-  [[nodiscard]] __device__ constexpr auto const& extract_key(Value const& value) const noexcept
+  [[nodiscard]] __host__ __device__ constexpr auto const& extract_key(
+    Value const& value) const noexcept
   {
     if constexpr (this->has_payload) {
       return thrust::raw_reference_cast(value).first;
diff --git a/include/cuco/detail/probe_sequence_impl.cuh b/include/cuco/detail/probe_sequence_impl.cuh
index 46d18e419..51b1bfd68 100644
--- a/include/cuco/detail/probe_sequence_impl.cuh
+++ b/include/cuco/detail/probe_sequence_impl.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -45,7 +45,7 @@ class probe_sequence_base {
    *
    * @return The number of elements loaded with each vector load
    */
-  static constexpr uint32_t vector_width() noexcept { return 2u; }
+  static __host__ __device__ constexpr uint32_t vector_width() noexcept { return 2u; }
 };
 
 /**
@@ -210,7 +210,7 @@ class linear_probing_impl
   __device__ __forceinline__ iterator
   initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
   {
-    return const_cast<iterator>(std::as_const(*this).initial_slot(g, k));
+    return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
   }
 
   /**
@@ -257,7 +257,7 @@ class linear_probing_impl
    */
   __device__ __forceinline__ iterator next_slot(iterator s) noexcept
   {
-    return const_cast<iterator>(std::as_const(*this).next_slot(s));
+    return const_cast<iterator>(cuda::std::as_const(*this).next_slot(s));
   }
 
   /**
@@ -364,7 +364,7 @@ class double_hashing_impl
   __device__ __forceinline__ iterator
   initial_slot(cooperative_groups::thread_block_tile<cg_size> const& g, ProbeKey const& k) noexcept
   {
-    return const_cast<iterator>(std::as_const(*this).initial_slot(g, k));
+    return const_cast<iterator>(cuda::std::as_const(*this).initial_slot(g, k));
   }
 
   /**
@@ -409,7 +409,7 @@ class double_hashing_impl
    */
   __device__ __forceinline__ iterator next_slot(iterator s) noexcept
   {
-    return const_cast<iterator>(std::as_const(*this).next_slot(s));
+    return const_cast<iterator>(cuda::std::as_const(*this).next_slot(s));
   }
 
   /**
diff --git a/include/cuco/static_multimap.cuh b/include/cuco/static_multimap.cuh
index ebf17edba..27c682dd6 100644
--- a/include/cuco/static_multimap.cuh
+++ b/include/cuco/static_multimap.cuh
@@ -1052,7 +1052,7 @@ class static_multimap {
    *
    * @return Boolean indicating if vector-load is used.
    */
-  static constexpr bool uses_vector_load() noexcept
+  static __host__ __device__ constexpr bool uses_vector_load() noexcept
   {
     return cuco::detail::is_packable<value_type>();
   }
@@ -1060,12 +1060,15 @@ class static_multimap {
   /**
    * @brief Returns the number of pairs loaded with each vector-load
    */
-  static constexpr uint32_t vector_width() noexcept { return ProbeSequence::vector_width(); }
+  static __host__ __device__ constexpr uint32_t vector_width() noexcept
+  {
+    return ProbeSequence::vector_width();
+  }
 
   /**
    * @brief Returns the warp size.
    */
-  static constexpr uint32_t warp_size() noexcept { return 32u; }
+  static __host__ __device__ constexpr uint32_t warp_size() noexcept { return 32u; }
 
   /**
    * @brief Custom deleter for unique pointer of slots.
diff --git a/include/cuco/utility/key_generator.cuh b/include/cuco/utility/key_generator.cuh
index ebd3d9feb..d58c8cf08 100644
--- a/include/cuco/utility/key_generator.cuh
+++ b/include/cuco/utility/key_generator.cuh
@@ -21,6 +21,7 @@
 #include <cuco/detail/utility/strong_type.cuh>
 
 #include <cuda/functional>
+#include <cuda/std/limits>
 #include <cuda/std/span>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
@@ -35,9 +36,8 @@
 #include <thrust/transform.h>
 #include <thrust/type_traits/is_execution_policy.h>
 
-#include <time.h>
-
 #include <cstdint>
+#include <ctime>
 #include <iterator>
 #include <tuple>
 #include <type_traits>
@@ -191,7 +191,7 @@ struct dropout_fn {
   {
     RNG rng;
     thrust::uniform_int_distribution<T> non_match_dist{static_cast<T>(num_),
-                                                       std::numeric_limits<T>::max()};
+                                                       cuda::std::numeric_limits<T>::max()};
     rng.seed(seed);
     return non_match_dist(rng);
   }
@@ -246,7 +246,7 @@ class key_generator {
    *
    * @param seed Seed for the random number generator
    */
-  key_generator(uint32_t seed = static_cast<uint32_t>(time(nullptr))) : rng_(seed) {}
+  key_generator(uint32_t seed = static_cast<uint32_t>(std::time(nullptr))) : rng_(seed) {}
 
   /**
    * @brief Generates a sequence of random keys in the interval [0, N).
@@ -437,7 +437,8 @@ class key_generator {
  * vector holding the actual data
  */
 template <typename RNG = thrust::default_random_engine>
-std::pair<thrust::device_vector<cuda::std::span<std::byte>>, thrust::device_vector<std::byte>>
+std::pair<thrust::device_vector<cuda::std::span<cuda::std::byte>>,
+          thrust::device_vector<cuda::std::byte>>
 generate_random_byte_sequences(std::size_t n_sequences,
                                std::size_t min_sequence_length,
                                std::size_t max_sequence_length,
@@ -475,20 +476,21 @@ generate_random_byte_sequences(std::size_t n_sequences,
   // the total number of bytes required to store the sequences
   auto const n_bytes = thrust::reduce(exec_pol, lengths.begin(), lengths.end());
   // the byte vector holding the actual sequence data
-  thrust::device_vector<std::byte> bytes(n_bytes);
+  thrust::device_vector<cuda::std::byte> bytes(n_bytes);
 
   auto offsets_and_lengths =
     thrust::make_zip_iterator(thrust::make_tuple(offsets.begin(), lengths.begin()));
-  thrust::device_vector<cuda::std::span<std::byte>> sequences(n_sequences);
+  thrust::device_vector<cuda::std::span<cuda::std::byte>> sequences(n_sequences);
   // create the span object for each sequence
   thrust::transform(
     exec_pol,
     offsets_and_lengths,
     offsets_and_lengths + n_sequences,
     sequences.begin(),
-    cuda::proclaim_return_type<cuda::std::span<std::byte>>(
+    cuda::proclaim_return_type<cuda::std::span<cuda::std::byte>>(
       [bytes_ptr = thrust::raw_pointer_cast(bytes.data())] __device__(auto const& seq) {
-        return cuda::std::span<std::byte>{bytes_ptr + thrust::get<0>(seq), thrust::get<1>(seq)};
+        return cuda::std::span<cuda::std::byte>{bytes_ptr + thrust::get<0>(seq),
+                                                thrust::get<1>(seq)};
       }));
 
   // fill the byte buffer with random data
@@ -496,11 +498,11 @@ generate_random_byte_sequences(std::size_t n_sequences,
                     thrust::counting_iterator<std::size_t>(0),
                     thrust::counting_iterator<std::size_t>(bytes.size()),
                     bytes.begin(),
-                    cuda::proclaim_return_type<std::byte>([seed] __device__(std::size_t idx) {
+                    cuda::proclaim_return_type<cuda::std::byte>([seed] __device__(std::size_t idx) {
                       RNG rng;
                       thrust::uniform_int_distribution<int> byte_distribution{0, 255};
                       rng.seed(seed + idx);
-                      return static_cast<std::byte>(byte_distribution(rng));
+                      return static_cast<cuda::std::byte>(byte_distribution(rng));
                     }));
 
   return {std::move(sequences), std::move(bytes)};
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 80b7c2870..737ddf32e 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -39,7 +39,7 @@ function(ConfigureTest TEST_NAME)
     set_target_properties(${TEST_NAME} PROPERTIES
                                        RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/tests")
     target_compile_options(${TEST_NAME} PRIVATE --compiler-options=-Wall --compiler-options=-Wextra
-      --expt-extended-lambda --expt-relaxed-constexpr -Xcompiler -Wno-subobject-linkage)
+      --expt-extended-lambda -Xcompiler -Wno-subobject-linkage)
     catch_discover_tests(${TEST_NAME} EXTRA_ARGS --allow-running-no-tests)
 endfunction(ConfigureTest)
 
diff --git a/tests/static_map/custom_type_test.cu b/tests/static_map/custom_type_test.cu
index 2dcd542e2..d2d5138aa 100644
--- a/tests/static_map/custom_type_test.cu
+++ b/tests/static_map/custom_type_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -154,7 +154,7 @@ TEMPLATE_TEST_CASE_SIG("User defined key and value type",
                               insert_values.end(),
                               found_values.begin(),
                               cuda::proclaim_return_type<bool>([] __device__(Value lhs, Value rhs) {
-                                return std::tie(lhs.f, lhs.s) == std::tie(rhs.f, rhs.s);
+                                return cuda::std::tie(lhs.f, lhs.s) == cuda::std::tie(rhs.f, rhs.s);
                               })));
   }
 
diff --git a/tests/static_multimap/custom_type_test.cu b/tests/static_multimap/custom_type_test.cu
index 8ddca9f5e..74210f5bd 100644
--- a/tests/static_multimap/custom_type_test.cu
+++ b/tests/static_multimap/custom_type_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 
 #include <cuco/static_multimap.cuh>
 
+#include <cuda/std/tuple>
 #include <thrust/device_vector.h>
 #include <thrust/execution_policy.h>
 #include <thrust/functional.h>
@@ -25,12 +26,9 @@
 #include <thrust/iterator/zip_iterator.h>
 #include <thrust/sort.h>
 #include <thrust/transform.h>
-#include <thrust/tuple.h>
 
 #include <catch2/catch_template_test_macros.hpp>
 
-#include <tuple>
-
 // User-defined key type
 struct key_pair {
   int32_t a;
@@ -48,7 +46,7 @@ struct hash_key_pair {
 struct key_pair_equals {
   __device__ bool operator()(const key_pair& lhs, const key_pair& rhs)
   {
-    return std::tie(lhs.a, lhs.b) == std::tie(rhs.a, rhs.b);
+    return cuda::std::tie(lhs.a, lhs.b) == cuda::std::tie(rhs.a, rhs.b);
   }
 };
 
diff --git a/tests/utility/hash_test.cu b/tests/utility/hash_test.cu
index 90f9c8a5b..5e0edda74 100644
--- a/tests/utility/hash_test.cu
+++ b/tests/utility/hash_test.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,13 +19,13 @@
 #include <cuco/detail/__config>
 #include <cuco/hash_functions.cuh>
 
+#include <cuda/std/cstddef>
+#include <cuda/std/limits>
 #include <thrust/device_vector.h>
 
 #include <catch2/catch_template_test_macros.hpp>
 #include <catch2/catch_test_macros.hpp>
 
-#include <cstddef>
-
 template <int32_t Words>
 struct large_key {
   constexpr __host__ __device__ large_key(int32_t value) noexcept
@@ -56,15 +56,15 @@ __global__ void check_identity_hash_result_kernel(OutputIter result)
 
   result[i++] = check_hash_result<cuco::identity_hash<signed char>>(0, 0);
   result[i++] = check_hash_result<cuco::identity_hash<signed char>>(
-    std::numeric_limits<signed char>::max(), std::numeric_limits<signed char>::max());
+    cuda::std::numeric_limits<signed char>::max(), cuda::std::numeric_limits<signed char>::max());
 
   result[i++] = check_hash_result<cuco::identity_hash<int32_t>>(0, 0);
   result[i++] = check_hash_result<cuco::identity_hash<int32_t>>(
-    std::numeric_limits<int32_t>::max(), std::numeric_limits<int32_t>::max());
+    cuda::std::numeric_limits<int32_t>::max(), cuda::std::numeric_limits<int32_t>::max());
 
   result[i++] = check_hash_result<cuco::identity_hash<int64_t>>(0, 0);
   result[i++] = check_hash_result<cuco::identity_hash<int64_t>>(
-    std::numeric_limits<int64_t>::max(), std::numeric_limits<int64_t>::max());
+    cuda::std::numeric_limits<int64_t>::max(), cuda::std::numeric_limits<int64_t>::max());
 }
 
 TEST_CASE("Test cuco::identity_hash", "")
@@ -73,15 +73,16 @@ TEST_CASE("Test cuco::identity_hash", "")
   {
     CHECK(check_hash_result<cuco::identity_hash<signed char>>(0, 0));
     CHECK(check_hash_result<cuco::identity_hash<signed char>>(
-      std::numeric_limits<signed char>::max(), std::numeric_limits<signed char>::max()));
+      cuda::std::numeric_limits<signed char>::max(),
+      cuda::std::numeric_limits<signed char>::max()));
 
     CHECK(check_hash_result<cuco::identity_hash<int32_t>>(0, 0));
-    CHECK(check_hash_result<cuco::identity_hash<int32_t>>(std::numeric_limits<int32_t>::max(),
-                                                          std::numeric_limits<int32_t>::max()));
+    CHECK(check_hash_result<cuco::identity_hash<int32_t>>(
+      cuda::std::numeric_limits<int32_t>::max(), cuda::std::numeric_limits<int32_t>::max()));
 
     CHECK(check_hash_result<cuco::identity_hash<int64_t>>(0, 0));
-    CHECK(check_hash_result<cuco::identity_hash<int64_t>>(std::numeric_limits<int64_t>::max(),
-                                                          std::numeric_limits<int64_t>::max()));
+    CHECK(check_hash_result<cuco::identity_hash<int64_t>>(
+      cuda::std::numeric_limits<int64_t>::max(), cuda::std::numeric_limits<int64_t>::max()));
   }
   SECTION("Check if device-generated hash values match the identity function.")
   {
@@ -238,7 +239,7 @@ TEMPLATE_TEST_CASE_SIG("Static vs. dynamic key hash test",
   SECTION("Identical keys with static and dynamic key size should have the same hash value.")
   {
     CHECK(hash(key) ==
-          hash.compute_hash(reinterpret_cast<std::byte const*>(&key), sizeof(key_type)));
+          hash.compute_hash(reinterpret_cast<cuda::std::byte const*>(&key), sizeof(key_type)));
   }
 }