Skip to content

Commit

Permalink
Add adaptive CG add/contains
Browse files Browse the repository at this point in the history
  • Loading branch information
sleeepyjack committed Sep 26, 2024
1 parent d2f04ff commit 21c374a
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 57 deletions.
6 changes: 2 additions & 4 deletions benchmarks/bloom_filter/bloom_filter_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ void bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Hash, Block
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);
state.add_global_memory_writes<typename filter_type::word_type>(num_keys *
filter_type::words_per_block);

filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};

Expand All @@ -111,6 +109,8 @@ void bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Hash, Block
template <typename Key, typename Hash, typename Block, typename Dist>
void bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Hash, Block, Dist>)
{
// cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves performance if
// the filter block fits into a 32B sector
using policy_type = cuco::default_filter_policy<rebind_hasher_t<Hash, Key>, Block>;
using filter_type =
cuco::bloom_filter<Key,
Expand Down Expand Up @@ -139,8 +139,6 @@ void bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Hash,
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);
state.add_global_memory_reads<typename filter_type::word_type>(num_keys *
filter_type::words_per_block);

filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};

Expand Down
19 changes: 16 additions & 3 deletions include/cuco/bloom_filter_ref.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,22 @@ class bloom_filter_ref {
template <class ProbeKey>
[[nodiscard]] __device__ bool contains(ProbeKey const& key) const;

// TODO
// template <class CG, class ProbeKey>
// [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;
/**
* @brief Device function that tests if a key's fingerprint is present in the filter.
*
* @note Best performance is achieved if the size of the CG is equal to `(words_per_block *
* sizeof(word_type)) / 32`.
*
* @tparam CG Cooperative Group type
* @tparam ProbeKey Input type that is implicitly convertible to `key_type`
*
* @param group The Cooperative Group this operation is executed with
* @param key The key to be tested
*
* @return `true` iff the key's fingerprint was present in the filter
*/
template <class CG, class ProbeKey>
[[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;

// TODO
// template <class CG, class InputIt, class OutputIt>
Expand Down
108 changes: 79 additions & 29 deletions include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,22 @@

#include <cuco/detail/bloom_filter/kernels.cuh>
#include <cuco/detail/error.hpp>
#include <cuco/detail/utility/cuda.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/detail/utils.hpp>
#include <cuco/utility/cuda_thread_scope.cuh>

// TODO #include <cuda/std/algorithm> once available
#include <cub/device/device_for.cuh>
#include <cuda/atomic>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h> // TODO #include <cuda/std/algorithm> once available
#include <cuda/std/array>
#include <cuda/std/bit>
#include <cuda/std/tuple>
#include <cuda/stream_ref>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>

#include <cooperative_groups.h>

#include <cstdint>
#include <type_traits>

Expand Down Expand Up @@ -106,26 +105,33 @@ class bloom_filter_impl {
}
}

/**
 * @brief Cooperatively adds a key's fingerprint to the filter.
 *
 * Each participating thread ORs a strided subset of the block's words with an atomic
 * `fetch_or`, so a group of `add_optimal_cg_size()` threads updates one word per thread.
 *
 * @tparam CG Cooperative Group type
 * @tparam ProbeKey Input type that is implicitly convertible to `key_type`
 *
 * @param group The Cooperative Group this operation is executed with
 * @param key The key whose fingerprint is added
 */
template <class CG, class ProbeKey>
__device__ void add(CG const& group, ProbeKey const& key)
{
  constexpr auto num_threads         = tile_size_v<CG>;
  constexpr auto optimal_num_threads = add_optimal_cg_size();

  // If a single thread is optimal (or the group is trivial), use the scalar add
  if constexpr (num_threads == 1 or optimal_num_threads == 1) {
    this->add(key);
  } else {
    auto const rank = group.thread_rank();

    auto const hash_value = policy_.hash(key);
    auto const idx        = policy_.block_index(hash_value, num_blocks_);

    // Strided loop over the block's words: thread `rank` handles words rank,
    // rank + num_threads, ... Note the loop index `i` (not `rank`) must select the
    // word; using `rank` would re-write the same word and skip the others whenever
    // num_threads < optimal_num_threads.
#pragma unroll
    for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) {
      auto const word = policy_.word_pattern(hash_value, i);

      auto atom_word =
        cuda::atomic_ref<word_type, thread_scope>{*(words_ + (idx * words_per_block + i))};
      atom_word.fetch_or(word, cuda::memory_order_relaxed);
    }
  }
}

// TODO
// template <class CG, class InputIt>
// __device__ void add(CG const& group, InputIt first, InputIt last);

template <class InputIt>
__host__ void add(InputIt first, InputIt last, cuda::stream_ref stream)
{
Expand Down Expand Up @@ -169,11 +175,12 @@ class bloom_filter_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto constexpr cg_size = add_optimal_cg_size();
auto constexpr block_size = cuco::detail::default_block_size();
auto const grid_size = cuco::detail::grid_size(
num_keys, words_per_block, cuco::detail::default_stride(), block_size);
auto const grid_size =
cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size);

detail::add_if_n<block_size>
detail::add_if_n<cg_size, block_size>
<<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, stencil, pred, *this);
}

Expand All @@ -194,9 +201,36 @@ class bloom_filter_impl {
return true;
}

// TODO
// template <class CG, class ProbeKey>
// [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;
/**
 * @brief Cooperatively tests whether a key's fingerprint is present in the filter.
 *
 * Each participating thread vector-loads a strided chunk of the block's words and
 * checks its expected bit patterns; the per-thread results are combined with
 * `group.all`, so every thread of the group returns the same answer.
 *
 * @tparam CG Cooperative Group type
 * @tparam ProbeKey Input type that is implicitly convertible to `key_type`
 *
 * @param group The Cooperative Group this operation is executed with
 * @param key The key to be tested
 *
 * @return `true` iff the key's fingerprint was present in the filter
 */
template <class CG, class ProbeKey>
[[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const
{
  constexpr auto num_threads         = tile_size_v<CG>;
  constexpr auto optimal_num_threads = contains_optimal_cg_size();
  constexpr auto words_per_thread    = words_per_block / optimal_num_threads;

  // Degenerate group sizes are served best by the scalar path
  if constexpr (num_threads == 1 or optimal_num_threads == 1) {
    return this->contains(key);
  } else {
    auto const rank       = group.thread_rank();
    auto const hash_value = policy_.hash(key);
    bool thread_match     = true;

#pragma unroll
    for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) {
      auto const word_offset = i * words_per_thread;
      auto const loaded      = this->vec_load_words<words_per_thread>(
        policy_.block_index(hash_value, num_blocks_) * words_per_block + word_offset);
#pragma unroll words_per_thread
      for (uint32_t j = 0; j < words_per_thread; ++j) {
        auto const pattern = policy_.word_pattern(hash_value, word_offset + j);
        thread_match &= ((loaded[j] & pattern) == pattern);
      }
    }

    // Present only if every thread found all of its expected bits set
    return group.all(thread_match);
  }
}

// TODO
// template <class CG, class InputIt, class OutputIt>
Expand Down Expand Up @@ -246,11 +280,12 @@ class bloom_filter_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto constexpr cg_size = contains_optimal_cg_size();
auto constexpr block_size = cuco::detail::default_block_size();
auto const grid_size =
cuco::detail::grid_size(num_keys, 1, cuco::detail::default_stride(), block_size);
cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size);

detail::contains_if_n<block_size><<<grid_size, block_size, 0, stream.get()>>>(
detail::contains_if_n<cg_size, block_size><<<grid_size, block_size, 0, stream.get()>>>(
first, num_keys, stencil, pred, output_begin, *this);
}

Expand All @@ -276,17 +311,32 @@ class bloom_filter_impl {
__device__ cuda::std::array<word_type, NumWords> vec_load_words(size_type index) const
{
return *reinterpret_cast<cuda::std::array<word_type, NumWords>*>(__builtin_assume_aligned(
words_ + index, min(sizeof(word_type) * NumWords, required_alignment())));
words_ + index, cuda::std::min(sizeof(word_type) * NumWords, required_alignment())));
}

/**
 * @brief Maximum number of bytes serviced per thread by a vectorized load.
 *
 * Targets one 32B sector (2x LDG128) per thread, clamped to at most one filter block
 * and at least one word.
 */
__host__ __device__ static constexpr size_t max_vec_bytes() noexcept
{
  constexpr auto word_bytes  = sizeof(word_type);
  constexpr auto block_bytes = word_bytes * words_per_block;
  // size_t{32} keeps both arguments of cuda::std::max the same type; the previous
  // `32ul` literal is `unsigned long`, which does not match `size_t` on LLP64
  // platforms (e.g. 64-bit Windows) and would fail template argument deduction.
  return cuda::std::min(cuda::std::max(word_bytes, size_t{32}),
                        block_bytes);  // aiming for 2xLDG128 -> 1 sector per thread
}

/// @brief Cooperative Group size that maximizes throughput of the cooperative `add`.
[[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size()
{
  // One thread per filter word, so the per-word atomic ORs coalesce into a
  // minimal number of memory transactions.
  return static_cast<int32_t>(words_per_block);
}

/// @brief Cooperative Group size that maximizes throughput of the cooperative `contains`.
[[nodiscard]] __host__ __device__ static constexpr int32_t contains_optimal_cg_size()
{
  // One maximally-wide vector load per thread covers an entire filter block.
  constexpr auto block_bytes = sizeof(word_type) * words_per_block;
  return static_cast<int32_t>(block_bytes / max_vec_bytes());
}

/// @brief Minimum alignment (bytes) of the word storage required by the vector loads.
__host__ __device__ static constexpr size_t required_alignment() noexcept
{
  // Align to the widest vector load we issue, never exceeding one filter block.
  constexpr auto block_bytes = sizeof(word_type) * words_per_block;
  return cuda::std::min(block_bytes, max_vec_bytes());
}

word_type* words_;
Expand Down
8 changes: 8 additions & 0 deletions include/cuco/detail/bloom_filter/bloom_filter_ref.inl
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@ template <class ProbeKey>
return impl_.contains(key);
}

// Cooperative-group variant of `contains`; thin forwarder to the implementation.
// The full contract is documented on the declaration in bloom_filter_ref.cuh.
template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
template <class CG, class ProbeKey>
[[nodiscard]] __device__ bool bloom_filter_ref<Key, Extent, Scope, Policy>::contains(
CG const& group, ProbeKey const& key) const
{
return impl_.contains(group, key);
}

template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
template <class InputIt, class OutputIt>
__host__ void bloom_filter_ref<Key, Extent, Scope, Policy>::contains(InputIt first,
Expand Down
53 changes: 32 additions & 21 deletions include/cuco/detail/bloom_filter/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,32 +26,33 @@ namespace cuco::detail {

CUCO_SUPPRESS_KERNEL_WARNINGS

/**
 * @brief Kernel that adds keys passing the stencil predicate, one Cooperative Group
 * of `CGSize` threads per key.
 *
 * Grid-stride loop over groups; `ref.add(tile, ...)` internally falls back to the
 * scalar path when a single thread is optimal.
 *
 * @tparam CGSize Cooperative Group size used per key
 * @tparam BlockSize Thread block size of this kernel
 */
template <int32_t CGSize,
          int32_t BlockSize,
          class InputIt,
          class StencilIt,
          class Predicate,
          class Ref>
CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
  InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
{
  namespace cg = cooperative_groups;

  // One CG of CGSize threads per input element; stride over groups, not threads
  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
  auto idx               = cuco::detail::global_thread_id() / CGSize;

  [[maybe_unused]] auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());

  while (idx < n) {
    if (pred(*(stencil + idx))) {
      typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
      ref.add(tile, insert_element);
    }
    idx += loop_stride;
  }
}

/**
 * @brief Kernel that tests keys passing the stencil predicate, one Cooperative Group
 * of `CGSize` threads per key; non-matching stencil entries yield `false`.
 *
 * For `CGSize == 1` a plain per-thread loop is used; otherwise the group cooperates on
 * each query and only rank 0 writes the result. Note there is exactly one
 * `idx += loop_stride` per loop body and one tile construction — duplicating either
 * (as in the pre-cleanup text) would skip elements or shadow the tile.
 *
 * @tparam CGSize Cooperative Group size used per key
 * @tparam BlockSize Thread block size of this kernel
 */
template <int32_t CGSize,
          int32_t BlockSize,
          class InputIt,
          class StencilIt,
          class Predicate,
          class OutputIt,
          class Ref>
CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
                                                            cuco::detail::index_type n,
                                                            StencilIt stencil,
                                                            Predicate pred,
                                                            OutputIt out,
                                                            Ref ref)
{
  namespace cg = cooperative_groups;

  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
  auto idx               = cuco::detail::global_thread_id() / CGSize;

  if constexpr (CGSize == 1) {
    while (idx < n) {
      typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
      *(out + idx) = pred(*(stencil + idx)) ? ref.contains(key) : false;
      idx += loop_stride;
    }
  } else {
    auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
    while (idx < n) {
      typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
      auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
      // All threads computed the same result via group.all; one write suffices
      if (tile.thread_rank() == 0) { *(out + idx) = found; }
      idx += loop_stride;
    }
  }
}

Expand Down
26 changes: 26 additions & 0 deletions include/cuco/detail/utility/cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@

#include <cuco/detail/utility/cuda.hpp>

#include <cooperative_groups.h>

#include <cstdint>

#if defined(CUCO_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION)
#define CUCO_SUPPRESS_KERNEL_WARNINGS
#elif defined(__NVCC__) && (defined(__GNUC__) || defined(__clang__))
Expand Down Expand Up @@ -59,5 +63,27 @@ __device__ static index_type grid_stride() noexcept
return index_type{gridDim.x} * index_type{blockDim.x};
}

/**
 * @brief Constexpr helper to extract the size of a Cooperative Group.
 *
 * @tparam Tile The Cooperative Group type
 */
template <typename Tile>
struct tile_size;

/**
 * @brief Specialization of `cuco::detail::tile_size` for `cooperative_groups::thread_block_tile`.
 *
 * @tparam CGSize The Cooperative Group size
 * @tparam ParentCG The Cooperative Group the tile has been created from
 */
template <uint32_t CGSize, class ParentCG>
struct tile_size<cooperative_groups::thread_block_tile<CGSize, ParentCG>> {
static constexpr uint32_t value = CGSize; ///< Size of the `thread_block_tile`
};

/**
 * @brief Convenience variable template for `tile_size<Tile>::value`.
 *
 * @tparam Tile The Cooperative Group type
 */
template <typename Tile>
constexpr uint32_t tile_size_v = tile_size<Tile>::value;

} // namespace detail
} // namespace cuco

0 comments on commit 21c374a

Please sign in to comment.