Skip to content

Commit

Permalink
Add adaptive CG add/contains
Browse files Browse the repository at this point in the history
  • Loading branch information
sleeepyjack committed Sep 26, 2024
1 parent d2f04ff commit 21c374a
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 57 deletions.
6 changes: 2 additions & 4 deletions benchmarks/bloom_filter/bloom_filter_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -93,8 +93,6 @@ void bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Hash, Block
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);
state.add_global_memory_writes<typename filter_type::word_type>(num_keys *
filter_type::words_per_block);

filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};

Expand All @@ -111,6 +109,8 @@ void bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Hash, Block
template <typename Key, typename Hash, typename Block, typename Dist>
void bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Hash, Block, Dist>)
{
// cudaDeviceSetLimit(cudaLimitMaxL2FetchGranularity, 32); // slightly improves performance if
// the filter block fits into a 32B sector
using policy_type = cuco::default_filter_policy<rebind_hasher_t<Hash, Key>, Block>;
using filter_type =
cuco::bloom_filter<Key,
Expand Down Expand Up @@ -139,8 +139,6 @@ void bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Hash,
gen.generate(dist_from_state<Dist>(state), keys.begin(), keys.end());

state.add_element_count(num_keys);
state.add_global_memory_reads<typename filter_type::word_type>(num_keys *
filter_type::words_per_block);

filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};

Expand Down
19 changes: 16 additions & 3 deletions include/cuco/bloom_filter_ref.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -216,9 +216,22 @@ class bloom_filter_ref {
template <class ProbeKey>
[[nodiscard]] __device__ bool contains(ProbeKey const& key) const;

// TODO
// template <class CG, class ProbeKey>
// [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;
/**
* @brief Device function that tests if a key's fingerprint is present in the filter.
*
* @note Best performance is achieved if the size of the CG is equal to `(words_per_block *
* sizeof(word_type)) / 32`.
*
* @tparam CG Cooperative Group type
* @tparam ProbeKey Input type that is implicitly convertible to `key_type`
*
* @param group The Cooperative Group this operation is executed with
* @param key The key to be tested
*
* @return `true` iff the key's fingerprint was present in the filter
*/
template <class CG, class ProbeKey>
[[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;

// TODO
// template <class CG, class InputIt, class OutputIt>
Expand Down
108 changes: 79 additions & 29 deletions include/cuco/detail/bloom_filter/bloom_filter_impl.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -18,23 +18,22 @@

#include <cuco/detail/bloom_filter/kernels.cuh>
#include <cuco/detail/error.hpp>
#include <cuco/detail/utility/cuda.cuh>
#include <cuco/detail/utility/cuda.hpp>
#include <cuco/detail/utils.hpp>
#include <cuco/utility/cuda_thread_scope.cuh>

// TODO #include <cuda/std/algorithm> once available
#include <cub/device/device_for.cuh>
#include <cuda/atomic>
#include <cuda/std/__algorithm/max.h>
#include <cuda/std/__algorithm/min.h> // TODO #include <cuda/std/algorithm> once available
#include <cuda/std/array>
#include <cuda/std/bit>
#include <cuda/std/tuple>
#include <cuda/stream_ref>
#include <thrust/functional.h>
#include <thrust/iterator/constant_iterator.h>

#include <cooperative_groups.h>

#include <cstdint>
#include <type_traits>

Expand Down Expand Up @@ -106,26 +105,33 @@ class bloom_filter_impl {
}
}

/**
 * @brief Cooperatively adds a key's fingerprint to the filter.
 *
 * Each participating thread ORs a strided subset of the block's words with an atomic
 * `fetch_or`, so a group of `add_optimal_cg_size()` threads updates one word per thread.
 *
 * @tparam CG Cooperative Group type
 * @tparam ProbeKey Input type that is implicitly convertible to `key_type`
 *
 * @param group The Cooperative Group this operation is executed with
 * @param key The key whose fingerprint is added
 */
template <class CG, class ProbeKey>
__device__ void add(CG const& group, ProbeKey const& key)
{
  constexpr auto num_threads         = tile_size_v<CG>;
  constexpr auto optimal_num_threads = add_optimal_cg_size();

  // If a single thread is optimal (or the group is trivial), use the scalar add
  if constexpr (num_threads == 1 or optimal_num_threads == 1) {
    this->add(key);
  } else {
    auto const rank = group.thread_rank();

    auto const hash_value = policy_.hash(key);
    auto const idx        = policy_.block_index(hash_value, num_blocks_);

    // Strided loop over the block's words: thread `rank` handles words rank,
    // rank + num_threads, ... Note the loop index `i` (not `rank`) must select the
    // word; using `rank` would re-write the same word and skip the others whenever
    // num_threads < optimal_num_threads.
#pragma unroll
    for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) {
      auto const word = policy_.word_pattern(hash_value, i);

      auto atom_word =
        cuda::atomic_ref<word_type, thread_scope>{*(words_ + (idx * words_per_block + i))};
      atom_word.fetch_or(word, cuda::memory_order_relaxed);
    }
  }
}

// TODO
// template <class CG, class InputIt>
// __device__ void add(CG const& group, InputIt first, InputIt last);

template <class InputIt>
__host__ void add(InputIt first, InputIt last, cuda::stream_ref stream)
{
Expand Down Expand Up @@ -169,11 +175,12 @@ class bloom_filter_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto constexpr cg_size = add_optimal_cg_size();
auto constexpr block_size = cuco::detail::default_block_size();
auto const grid_size = cuco::detail::grid_size(
num_keys, words_per_block, cuco::detail::default_stride(), block_size);
auto const grid_size =
cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size);

detail::add_if_n<block_size>
detail::add_if_n<cg_size, block_size>
<<<grid_size, block_size, 0, stream.get()>>>(first, num_keys, stencil, pred, *this);
}

Expand All @@ -194,9 +201,36 @@ class bloom_filter_impl {
return true;
}

// TODO
// template <class CG, class ProbeKey>
// [[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const;
/**
 * @brief Cooperatively tests whether a key's fingerprint is present in the filter.
 *
 * Each participating thread vector-loads a strided chunk of the block's words and
 * checks its expected bit patterns; the per-thread results are combined with
 * `group.all`, so every thread of the group returns the same answer.
 *
 * @tparam CG Cooperative Group type
 * @tparam ProbeKey Input type that is implicitly convertible to `key_type`
 *
 * @param group The Cooperative Group this operation is executed with
 * @param key The key to be tested
 *
 * @return `true` iff the key's fingerprint was present in the filter
 */
template <class CG, class ProbeKey>
[[nodiscard]] __device__ bool contains(CG const& group, ProbeKey const& key) const
{
  constexpr auto num_threads         = tile_size_v<CG>;
  constexpr auto optimal_num_threads = contains_optimal_cg_size();
  constexpr auto words_per_thread    = words_per_block / optimal_num_threads;

  // Degenerate group sizes are served best by the scalar path
  if constexpr (num_threads == 1 or optimal_num_threads == 1) {
    return this->contains(key);
  } else {
    auto const rank       = group.thread_rank();
    auto const hash_value = policy_.hash(key);
    bool thread_match     = true;

#pragma unroll
    for (uint32_t i = rank; i < optimal_num_threads; i += num_threads) {
      auto const word_offset = i * words_per_thread;
      auto const loaded      = this->vec_load_words<words_per_thread>(
        policy_.block_index(hash_value, num_blocks_) * words_per_block + word_offset);
#pragma unroll words_per_thread
      for (uint32_t j = 0; j < words_per_thread; ++j) {
        auto const pattern = policy_.word_pattern(hash_value, word_offset + j);
        thread_match &= ((loaded[j] & pattern) == pattern);
      }
    }

    // Present only if every thread found all of its expected bits set
    return group.all(thread_match);
  }
}

// TODO
// template <class CG, class InputIt, class OutputIt>
Expand Down Expand Up @@ -246,11 +280,12 @@ class bloom_filter_impl {
auto const num_keys = cuco::detail::distance(first, last);
if (num_keys == 0) { return; }

auto constexpr cg_size = contains_optimal_cg_size();
auto constexpr block_size = cuco::detail::default_block_size();
auto const grid_size =
cuco::detail::grid_size(num_keys, 1, cuco::detail::default_stride(), block_size);
cuco::detail::grid_size(num_keys, cg_size, cuco::detail::default_stride(), block_size);

detail::contains_if_n<block_size><<<grid_size, block_size, 0, stream.get()>>>(
detail::contains_if_n<cg_size, block_size><<<grid_size, block_size, 0, stream.get()>>>(
first, num_keys, stencil, pred, output_begin, *this);
}

Expand All @@ -276,17 +311,32 @@ class bloom_filter_impl {
__device__ cuda::std::array<word_type, NumWords> vec_load_words(size_type index) const
{
return *reinterpret_cast<cuda::std::array<word_type, NumWords>*>(__builtin_assume_aligned(
words_ + index, min(sizeof(word_type) * NumWords, required_alignment())));
words_ + index, cuda::std::min(sizeof(word_type) * NumWords, required_alignment())));
}

/**
 * @brief Maximum number of bytes serviced per thread by a vectorized load.
 *
 * Targets one 32B sector (2x LDG128) per thread, clamped to at most one filter block
 * and at least one word.
 */
__host__ __device__ static constexpr size_t max_vec_bytes() noexcept
{
  constexpr auto word_bytes  = sizeof(word_type);
  constexpr auto block_bytes = word_bytes * words_per_block;
  // size_t{32} keeps both arguments of cuda::std::max the same type; the previous
  // `32ul` literal is `unsigned long`, which does not match `size_t` on LLP64
  // platforms (e.g. 64-bit Windows) and would fail template argument deduction.
  return cuda::std::min(cuda::std::max(word_bytes, size_t{32}),
                        block_bytes);  // aiming for 2xLDG128 -> 1 sector per thread
}

/// @brief Cooperative Group size that maximizes throughput of the cooperative `add`.
[[nodiscard]] __host__ __device__ static constexpr int32_t add_optimal_cg_size()
{
  // One thread per filter word, so the per-word atomic ORs coalesce into a
  // minimal number of memory transactions.
  return static_cast<int32_t>(words_per_block);
}

/// @brief Cooperative Group size that maximizes throughput of the cooperative `contains`.
[[nodiscard]] __host__ __device__ static constexpr int32_t contains_optimal_cg_size()
{
  // One maximally-wide vector load per thread covers an entire filter block.
  constexpr auto block_bytes = sizeof(word_type) * words_per_block;
  return static_cast<int32_t>(block_bytes / max_vec_bytes());
}

/// @brief Minimum alignment (bytes) of the word storage required by the vector loads.
__host__ __device__ static constexpr size_t required_alignment() noexcept
{
  // Align to the widest vector load we issue, never exceeding one filter block.
  constexpr auto block_bytes = sizeof(word_type) * words_per_block;
  return cuda::std::min(block_bytes, max_vec_bytes());
}

word_type* words_;
Expand Down
8 changes: 8 additions & 0 deletions include/cuco/detail/bloom_filter/bloom_filter_ref.inl
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,14 @@ template <class ProbeKey>
return impl_.contains(key);
}

// Cooperative-group variant of `contains`; thin forwarder to the implementation.
// The full contract is documented on the declaration in bloom_filter_ref.cuh.
template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
template <class CG, class ProbeKey>
[[nodiscard]] __device__ bool bloom_filter_ref<Key, Extent, Scope, Policy>::contains(
CG const& group, ProbeKey const& key) const
{
return impl_.contains(group, key);
}

template <class Key, class Extent, cuda::thread_scope Scope, class Policy>
template <class InputIt, class OutputIt>
__host__ void bloom_filter_ref<Key, Extent, Scope, Policy>::contains(InputIt first,
Expand Down
53 changes: 32 additions & 21 deletions include/cuco/detail/bloom_filter/kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -26,32 +26,33 @@ namespace cuco::detail {

CUCO_SUPPRESS_KERNEL_WARNINGS

/**
 * @brief Kernel that adds keys passing the stencil predicate, one Cooperative Group
 * of `CGSize` threads per key.
 *
 * Grid-stride loop over groups; `ref.add(tile, ...)` internally falls back to the
 * scalar path when a single thread is optimal.
 *
 * @tparam CGSize Cooperative Group size used per key
 * @tparam BlockSize Thread block size of this kernel
 */
template <int32_t CGSize,
          int32_t BlockSize,
          class InputIt,
          class StencilIt,
          class Predicate,
          class Ref>
CUCO_KERNEL __launch_bounds__(BlockSize) void add_if_n(
  InputIt first, cuco::detail::index_type n, StencilIt stencil, Predicate pred, Ref ref)
{
  namespace cg = cooperative_groups;

  // One CG of CGSize threads per input element; stride over groups, not threads
  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
  auto idx               = cuco::detail::global_thread_id() / CGSize;

  [[maybe_unused]] auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());

  while (idx < n) {
    if (pred(*(stencil + idx))) {
      typename std::iterator_traits<InputIt>::value_type const& insert_element{*(first + idx)};
      ref.add(tile, insert_element);
    }
    idx += loop_stride;
  }
}

/**
 * @brief Kernel that tests keys passing the stencil predicate, one Cooperative Group
 * of `CGSize` threads per key; non-matching stencil entries yield `false`.
 *
 * For `CGSize == 1` a plain per-thread loop is used; otherwise the group cooperates on
 * each query and only rank 0 writes the result. Note there is exactly one
 * `idx += loop_stride` per loop body and one tile construction — duplicating either
 * (as in the pre-cleanup text) would skip elements or shadow the tile.
 *
 * @tparam CGSize Cooperative Group size used per key
 * @tparam BlockSize Thread block size of this kernel
 */
template <int32_t CGSize,
          int32_t BlockSize,
          class InputIt,
          class StencilIt,
          class Predicate,
          class OutputIt,
          class Ref>
CUCO_KERNEL __launch_bounds__(BlockSize) void contains_if_n(InputIt first,
                                                            cuco::detail::index_type n,
                                                            StencilIt stencil,
                                                            Predicate pred,
                                                            OutputIt out,
                                                            Ref ref)
{
  namespace cg = cooperative_groups;

  auto const loop_stride = cuco::detail::grid_stride() / CGSize;
  auto idx               = cuco::detail::global_thread_id() / CGSize;

  if constexpr (CGSize == 1) {
    while (idx < n) {
      typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
      *(out + idx) = pred(*(stencil + idx)) ? ref.contains(key) : false;
      idx += loop_stride;
    }
  } else {
    auto const tile = cg::tiled_partition<CGSize>(cg::this_thread_block());
    while (idx < n) {
      typename std::iterator_traits<InputIt>::value_type const& key = *(first + idx);
      auto const found = pred(*(stencil + idx)) ? ref.contains(tile, key) : false;
      // All threads computed the same result via group.all; one write suffices
      if (tile.thread_rank() == 0) { *(out + idx) = found; }
      idx += loop_stride;
    }
  }
}

Expand Down
26 changes: 26 additions & 0 deletions include/cuco/detail/utility/cuda.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@

#include <cuco/detail/utility/cuda.hpp>

#include <cooperative_groups.h>

#include <cstdint>

#if defined(CUCO_DISABLE_KERNEL_VISIBILITY_WARNING_SUPPRESSION)
#define CUCO_SUPPRESS_KERNEL_WARNINGS
#elif defined(__NVCC__) && (defined(__GNUC__) || defined(__clang__))
Expand Down Expand Up @@ -59,5 +63,27 @@ __device__ static index_type grid_stride() noexcept
return index_type{gridDim.x} * index_type{blockDim.x};
}

/**
 * @brief Constexpr helper to extract the size of a Cooperative Group.
 *
 * @tparam Tile The Cooperative Group type
 */
template <typename Tile>
struct tile_size;

/**
 * @brief Specialization of `cuco::detail::tile_size` for `cooperative_groups::thread_block_tile`.
 *
 * @tparam CGSize The Cooperative Group size
 * @tparam ParentCG The Cooperative Group the tile has been created from
 */
template <uint32_t CGSize, class ParentCG>
struct tile_size<cooperative_groups::thread_block_tile<CGSize, ParentCG>> {
static constexpr uint32_t value = CGSize; ///< Size of the `thread_block_tile`
};

/**
 * @brief Convenience variable template for `tile_size<Tile>::value`.
 *
 * @tparam Tile The Cooperative Group type
 */
template <typename Tile>
constexpr uint32_t tile_size_v = tile_size<Tile>::value;

} // namespace detail
} // namespace cuco

0 comments on commit 21c374a

Please sign in to comment.