Skip to content

Commit

Permalink
Use cuda::std::byte as the default device storage type (#597)
Browse files Browse the repository at this point in the history
Address
#573 (comment)

This PR updates the existing code to use `cuda::std::byte` as the
default device data type instead of `std::byte`. This change addresses
potential issues where `cuda::std::` utilities cannot be applied to
`std::byte` when relaxed constexprs are disabled.
  • Loading branch information
PointKernel authored Sep 18, 2024
1 parent 5602381 commit 1d9da6d
Show file tree
Hide file tree
Showing 23 changed files with 50 additions and 47 deletions.
8 changes: 4 additions & 4 deletions examples/distinct_count_estimator/device_ref_example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,10 @@
*/
#include <cuco/distinct_count_estimator.cuh>

#include <cuda/std/cstddef>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <cstddef>
#include <iostream>

/**
Expand All @@ -37,7 +37,7 @@ __global__ void fused_kernel(RefType ref, InputIt first, std::size_t n)
using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;

// Shared memory storage for the block-local estimator
extern __shared__ std::byte local_sketch[];
extern __shared__ cuda::std::byte local_sketch[];

// The following check is optional since the base address of dynamic shared memory is guaranteed
// to meet the alignment requirements
Expand Down Expand Up @@ -94,7 +94,7 @@ __global__ void device_estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
size_t n,
OutputIt out)
{
extern __shared__ std::byte local_sketch[];
extern __shared__ cuda::std::byte local_sketch[];

auto const block = cooperative_groups::this_thread_block();

Expand Down Expand Up @@ -161,4 +161,4 @@ int main(void)
}

return 0;
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,8 @@ constexpr auto distinct_count_estimator<T, Scope, Hash, Allocator>::hash_functio
}

template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
constexpr cuda::std::span<std::byte> distinct_count_estimator<T, Scope, Hash, Allocator>::sketch()
const noexcept
constexpr cuda::std::span<cuda::std::byte>
distinct_count_estimator<T, Scope, Hash, Allocator>::sketch() const noexcept
{
return this->impl_->sketch();
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ namespace cuco {
template <class T, cuda::thread_scope Scope, class Hash>
__host__
__device__ constexpr distinct_count_estimator_ref<T, Scope, Hash>::distinct_count_estimator_ref(
cuda::std::span<std::byte> sketch_span, Hash const& hash)
cuda::std::span<cuda::std::byte> sketch_span, Hash const& hash)
: impl_{sketch_span, hash}
{
}
Expand Down Expand Up @@ -114,7 +114,7 @@ __host__ __device__ constexpr auto distinct_count_estimator_ref<T, Scope, Hash>:
}

template <class T, cuda::thread_scope Scope, class Hash>
__host__ __device__ constexpr cuda::std::span<std::byte>
__host__ __device__ constexpr cuda::std::span<cuda::std::byte>
distinct_count_estimator_ref<T, Scope, Hash>::sketch() const noexcept
{
return this->impl_.sketch();
Expand Down
9 changes: 5 additions & 4 deletions include/cuco/detail/hyperloglog/hyperloglog.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
#include <cuco/types.cuh>
#include <cuco/utility/cuda_thread_scope.cuh>

#include <cuda/std/cstddef>
#include <cuda/stream_ref>

#include <cstddef>
#include <iterator>
#include <memory>

Expand Down Expand Up @@ -74,7 +74,8 @@ class hyperloglog {
: allocator_{alloc},
sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)),
custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}},
ref_{cuda::std::span{reinterpret_cast<std::byte*>(this->sketch_.get()), sketch_size_b}, hash}
ref_{cuda::std::span{reinterpret_cast<cuda::std::byte*>(this->sketch_.get()), sketch_size_b},
hash}
{
this->ref_.clear_async(stream);
}
Expand Down Expand Up @@ -290,7 +291,7 @@ class hyperloglog {
*
* @return The cuda::std::span of the sketch
*/
[[nodiscard]] constexpr cuda::std::span<std::byte> sketch() const noexcept
[[nodiscard]] constexpr cuda::std::span<cuda::std::byte> sketch() const noexcept
{
return this->ref_.sketch();
}
Expand Down Expand Up @@ -352,4 +353,4 @@ class hyperloglog {
template <class T_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
friend class hyperloglog;
};
} // namespace cuco::detail
} // namespace cuco::detail
11 changes: 6 additions & 5 deletions include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@

#include <cuda/atomic>
#include <cuda/std/bit>
#include <cuda/std/cstddef>
#include <cuda/std/span>
#include <cuda/std/utility>
#include <cuda/stream_ref>
Expand All @@ -36,7 +37,6 @@
#include <cooperative_groups/reduce.h>

#include <algorithm> // there is no <cuda/std/algorithm>
#include <cstddef>
#include <vector>

namespace cuco::detail {
Expand Down Expand Up @@ -79,7 +79,7 @@ class hyperloglog_ref {
* @param sketch_span Reference to sketch storage
* @param hash The hash function used to hash items
*/
__host__ __device__ constexpr hyperloglog_ref(cuda::std::span<std::byte> sketch_span,
__host__ __device__ constexpr hyperloglog_ref(cuda::std::span<cuda::std::byte> sketch_span,
Hash const& hash)
: hash_{hash},
precision_{cuda::std::countr_zero(
Expand Down Expand Up @@ -448,10 +448,11 @@ class hyperloglog_ref {
*
* @return The cuda::std::span of the sketch
*/
[[nodiscard]] __host__ __device__ constexpr cuda::std::span<std::byte> sketch() const noexcept
[[nodiscard]] __host__ __device__ constexpr cuda::std::span<cuda::std::byte> sketch()
const noexcept
{
return cuda::std::span<std::byte>(reinterpret_cast<std::byte*>(this->sketch_.data()),
this->sketch_bytes());
return cuda::std::span<cuda::std::byte>(
reinterpret_cast<cuda::std::byte*>(this->sketch_.data()), this->sketch_bytes());
}

/**
Expand Down
6 changes: 3 additions & 3 deletions include/cuco/detail/hyperloglog/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ CUCO_KERNEL void add_shmem_vectorized(typename RefType::value_type const* first,

// Base address of dynamic shared memory is guaranteed to be aligned to at least 16 bytes which is
// sufficient for this purpose
extern __shared__ std::byte local_sketch[];
extern __shared__ cuda::std::byte local_sketch[];

auto const loop_stride = cuco::detail::grid_stride();
auto idx = cuco::detail::global_thread_id();
Expand Down Expand Up @@ -94,7 +94,7 @@ CUCO_KERNEL void add_shmem(InputIt first, cuco::detail::index_type n, RefType re
using local_ref_type = typename RefType::with_scope<cuda::thread_scope_block>;

// TODO assert alignment
extern __shared__ std::byte local_sketch[];
extern __shared__ cuda::std::byte local_sketch[];

auto const loop_stride = cuco::detail::grid_stride();
auto idx = cuco::detail::global_thread_id();
Expand Down Expand Up @@ -142,4 +142,4 @@ CUCO_KERNEL void estimate(std::size_t* cardinality, RefType ref)
if (block.thread_rank() == 0) { *cardinality = estimate; }
}
}
} // namespace cuco::hyperloglog_ns::detail
} // namespace cuco::hyperloglog_ns::detail
3 changes: 2 additions & 1 deletion include/cuco/detail/trie/dynamic_bitset/dynamic_bitset.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#pragma once

#include <cuda/std/array>
#include <cuda/std/cstddef>
#include <cuda/stream_ref>
#include <thrust/device_malloc_allocator.h>
#include <thrust/device_vector.h>
Expand Down Expand Up @@ -78,7 +79,7 @@ struct rank {
* @tparam Allocator Type of allocator used for device storage
*/
// TODO: have to use device_malloc_allocator for now otherwise the container cannot grow
template <class Allocator = thrust::device_malloc_allocator<std::byte>>
template <class Allocator = thrust::device_malloc_allocator<cuda::std::byte>>
class dynamic_bitset {
public:
using size_type = std::size_t; ///< size type to specify bit index
Expand Down
6 changes: 3 additions & 3 deletions include/cuco/distinct_count_estimator.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,9 @@
#include <cuco/utility/allocator.hpp>
#include <cuco/utility/cuda_thread_scope.cuh>

#include <cuda/std/cstddef>
#include <cuda/stream_ref>

#include <cstddef>
#include <iterator>
#include <memory>

Expand All @@ -43,7 +43,7 @@ namespace cuco {
template <class T,
cuda::thread_scope Scope = cuda::thread_scope_device,
class Hash = cuco::xxhash_64<T>,
class Allocator = cuco::cuda_allocator<std::byte>>
class Allocator = cuco::cuda_allocator<cuda::std::byte>>
class distinct_count_estimator {
using impl_type = detail::hyperloglog<T, Scope, Hash, Allocator>;

Expand Down Expand Up @@ -243,7 +243,7 @@ class distinct_count_estimator {
*
* @return The cuda::std::span of the sketch
*/
[[nodiscard]] constexpr cuda::std::span<std::byte> sketch() const noexcept;
[[nodiscard]] constexpr cuda::std::span<cuda::std::byte> sketch() const noexcept;

/**
* @brief Gets the number of bytes required for the sketch storage.
Expand Down
12 changes: 6 additions & 6 deletions include/cuco/distinct_count_estimator_ref.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -20,12 +20,11 @@
#include <cuco/types.cuh>
#include <cuco/utility/cuda_thread_scope.cuh>

#include <cuda/std/cstddef>
#include <cuda/stream_ref>

#include <cooperative_groups.h>

#include <cstddef>

namespace cuco {
/**
* @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
Expand Down Expand Up @@ -64,8 +63,8 @@ class distinct_count_estimator_ref {
* @param sketch_span Reference to sketch storage
* @param hash The hash function used to hash items
*/
__host__ __device__ constexpr distinct_count_estimator_ref(cuda::std::span<std::byte> sketch_span,
Hash const& hash = {});
__host__ __device__ constexpr distinct_count_estimator_ref(
cuda::std::span<cuda::std::byte> sketch_span, Hash const& hash = {});

/**
* @brief Resets the estimator, i.e., clears the current count estimate.
Expand Down Expand Up @@ -211,7 +210,8 @@ class distinct_count_estimator_ref {
*
* @return The cuda::std::span of the sketch
*/
[[nodiscard]] __host__ __device__ constexpr cuda::std::span<std::byte> sketch() const noexcept;
[[nodiscard]] __host__ __device__ constexpr cuda::std::span<cuda::std::byte> sketch()
const noexcept;

/**
* @brief Gets the number of bytes required for the sketch storage.
Expand Down Expand Up @@ -255,4 +255,4 @@ class distinct_count_estimator_ref {
};
} // namespace cuco

#include <cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl>
#include <cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl>
4 changes: 2 additions & 2 deletions tests/distinct_count_estimator/device_ref_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
#include <cuco/distinct_count_estimator.cuh>
#include <cuco/hash_functions.cuh>

#include <cuda/std/cstddef>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>

#include <catch2/catch_template_test_macros.hpp>
#include <catch2/generators/catch_generators.hpp>

#include <cmath>
#include <cstddef>
#include <cstdint>

template <typename Ref, typename InputIt, typename OutputIt>
Expand All @@ -35,7 +35,7 @@ __global__ void estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
size_t n,
OutputIt out)
{
extern __shared__ std::byte local_sketch[];
extern __shared__ cuda::std::byte local_sketch[];

auto const block = cooperative_groups::this_thread_block();

Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/capacity_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ TEST_CASE("Static map capacity", "")
using T = int32_t;
using ProbeT = cuco::double_hashing<1, cuco::default_hash_function<Key>>;
using Equal = thrust::equal_to<Key>;
using AllocatorT = cuco::cuda_allocator<std::byte>;
using AllocatorT = cuco::cuda_allocator<cuda::std::byte>;
using StorageT = cuco::storage<2>;

SECTION("zero capacity is allowed.")
Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/duplicate_keys_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
probe,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>{
num_keys * 2, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};

Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/erase_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -118,7 +118,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
probe,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>{
num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}, cuco::erased_key<Key>{-2}};

Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/for_each_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -114,7 +114,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
probe,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>;

auto map = map_type{num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{0}};
Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/hash_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@ void test_hash_function()
cuda::thread_scope_device,
thrust::equal_to<Key>,
cuco::linear_probing<1, Hash>,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>{
num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};

Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/insert_and_find_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
probe,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>{
num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};

Expand Down
6 changes: 3 additions & 3 deletions tests/static_map/insert_or_apply_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -175,7 +175,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
probe,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>;

SECTION("sentinel equals init; has_init = true")
Expand Down Expand Up @@ -213,7 +213,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
cuco::linear_probing<2, cuco::murmurhash3_32<Key>>,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>;

SECTION("sentinel equals init; has_init = true")
Expand Down Expand Up @@ -249,7 +249,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
cuco::linear_probing<1, cuco::murmurhash3_32<Key>>,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>;

SECTION("duplicate keys")
Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/insert_or_assign_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
probe,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>{
num_keys, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};

Expand Down
2 changes: 1 addition & 1 deletion tests/static_map/unique_sequence_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -182,7 +182,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
probe,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>{
extent_type{}, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};

Expand Down
2 changes: 1 addition & 1 deletion tests/static_multimap/count_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<T>,
probe,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>{
num_keys * multiplicity, cuco::empty_key<T>{-1}, cuco::empty_value<T>{-1}};

Expand Down
2 changes: 1 addition & 1 deletion tests/static_multimap/insert_contains_test.cu
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ TEMPLATE_TEST_CASE_SIG(
cuda::thread_scope_device,
thrust::equal_to<Key>,
probe,
cuco::cuda_allocator<std::byte>,
cuco::cuda_allocator<cuda::std::byte>,
cuco::storage<2>>{
extent_type{num_keys}, cuco::empty_key<Key>{-1}, cuco::empty_value<Value>{-1}};

Expand Down
Loading

0 comments on commit 1d9da6d

Please sign in to comment.