Skip to content

Commit

Permalink
Implement policy concept and address review comments
Browse files Browse the repository at this point in the history
  • Loading branch information
sleeepyjack committed Sep 18, 2024
1 parent af97e2c commit 5b3fe40
Show file tree
Hide file tree
Showing 14 changed files with 582 additions and 482 deletions.
21 changes: 8 additions & 13 deletions benchmarks/benchmark_defaults.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,16 +34,14 @@ using HASH_RANGE = nvbench::type_list<cuco::identity_hash<char>,
// cuco::murmurhash3_x86_128<char>,
// cuco::murmurhash3_x64_128<char>>; // TODO handle tuple-like hash value

auto constexpr N = 100'000'000;
auto constexpr OCCUPANCY = 0.5;
auto constexpr MULTIPLICITY = 1;
auto constexpr MATCHING_RATE = 1.0;
auto constexpr MAX_NOISE = 3;
auto constexpr SKEW = 0.5;
auto constexpr BATCH_SIZE = 1'000'000;
auto constexpr INITIAL_SIZE = 50'000'000;
auto constexpr FILTER_SIZE_MB = 2'000;
auto constexpr PATTERN_BITS = 6;
auto constexpr N = 100'000'000;
auto constexpr OCCUPANCY = 0.5;
auto constexpr MULTIPLICITY = 1;
auto constexpr MATCHING_RATE = 1.0;
auto constexpr MAX_NOISE = 3;
auto constexpr SKEW = 0.5;
auto constexpr BATCH_SIZE = 1'000'000;
auto constexpr INITIAL_SIZE = 50'000'000;

auto const N_RANGE = nvbench::range(10'000'000, 100'000'000, 20'000'000);
auto const N_RANGE_CACHE =
Expand All @@ -52,8 +50,5 @@ auto const OCCUPANCY_RANGE = nvbench::range(0.1, 0.9, 0.1);
auto const MULTIPLICITY_RANGE = std::vector<nvbench::int64_t>{1, 2, 4, 8, 16};
auto const MATCHING_RATE_RANGE = nvbench::range(0.1, 1., 0.1);
auto const SKEW_RANGE = nvbench::range(0.1, 1., 0.1);
auto const FILTER_SIZE_MB_RANGE_CACHE =
std::vector<nvbench::int64_t>{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};
auto const PATTERN_BITS_RANGE = std::vector<nvbench::int64_t>{1, 2, 4, 6, 8};

} // namespace cuco::benchmark::defaults
106 changes: 57 additions & 49 deletions benchmarks/bloom_filter/bloom_filter_bench.cu
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
* limitations under the License.
*/

#include "defaults.hpp"

#include <benchmark_defaults.hpp>
#include <benchmark_utils.hpp>

Expand All @@ -22,6 +24,7 @@

#include <nvbench/nvbench.cuh>

#include <cuda/std/limits>
#include <thrust/count.h>
#include <thrust/device_vector.h>
#include <thrust/sequence.h>
Expand Down Expand Up @@ -64,19 +67,23 @@ void add_fpr_summary(nvbench::state& state, FilterType& filter)
template <typename Key, typename Hash, typename Block, typename Dist>
void bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Hash, Block, Dist>)
{
using filter_type = cuco::bloom_filter<Key,
Block,
cuco::extent<size_t>,
cuda::thread_scope_device,
rebind_hasher_t<Hash, Key>>;
using filter_type =
cuco::bloom_filter<Key,
cuco::extent<size_t>,
cuda::thread_scope_device,
cuco::bloom_filter_policy<rebind_hasher_t<Hash, Key>, Block>>;

auto const num_keys = state.get_int64("NumInputs");
auto const filter_size_mb = state.get_int64("FilterSizeMB");
auto const pattern_bits = state.get_int64("PatternBits");

if (pattern_bits < filter_type::words_per_block and pattern_bits != defaults::BF_PATTERN_BITS) {
state.skip("pattern_bits must be at least words_per_block");
}

std::size_t const num_sub_filters =
(filter_size_mb * 1024 * 1024) /
(sizeof(typename filter_type::word_type) * filter_type::block_words);
(sizeof(typename filter_type::word_type) * filter_type::words_per_block);

thrust::device_vector<Key> keys(num_keys);

Expand All @@ -85,9 +92,9 @@ void bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Hash, Block

state.add_element_count(num_keys);
state.add_global_memory_writes<typename filter_type::word_type>(num_keys *
filter_type::block_words);
filter_type::words_per_block);

filter_type filter{num_sub_filters, static_cast<uint32_t>(pattern_bits)};
filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};

add_fpr_summary(state, filter);

Expand All @@ -102,19 +109,23 @@ void bloom_filter_add(nvbench::state& state, nvbench::type_list<Key, Hash, Block
template <typename Key, typename Hash, typename Block, typename Dist>
void bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Hash, Block, Dist>)
{
using filter_type = cuco::bloom_filter<Key,
Block,
cuco::extent<size_t>,
cuda::thread_scope_device,
rebind_hasher_t<Hash, Key>>;
using filter_type =
cuco::bloom_filter<Key,
cuco::extent<size_t>,
cuda::thread_scope_device,
cuco::bloom_filter_policy<rebind_hasher_t<Hash, Key>, Block>>;

auto const num_keys = state.get_int64("NumInputs");
auto const filter_size_mb = state.get_int64("FilterSizeMB");
auto const pattern_bits = state.get_int64("PatternBits");

if (pattern_bits < filter_type::words_per_block and pattern_bits != defaults::BF_PATTERN_BITS) {
state.skip("pattern_bits must be at least words_per_block");
}

std::size_t const num_sub_filters =
(filter_size_mb * 1024 * 1024) /
(sizeof(typename filter_type::word_type) * filter_type::block_words);
(sizeof(typename filter_type::word_type) * filter_type::words_per_block);

thrust::device_vector<Key> keys(num_keys);
thrust::device_vector<bool> result(num_keys, false);
Expand All @@ -124,9 +135,9 @@ void bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Hash,

state.add_element_count(num_keys);
state.add_global_memory_reads<typename filter_type::word_type>(num_keys *
filter_type::block_words);
filter_type::words_per_block);

filter_type filter{num_sub_filters, static_cast<uint32_t>(pattern_bits)};
filter_type filter{num_sub_filters, {}, {static_cast<uint32_t>(pattern_bits)}};

add_fpr_summary(state, filter);

Expand All @@ -137,37 +148,34 @@ void bloom_filter_contains(nvbench::state& state, nvbench::type_list<Key, Hash,
});
}

static constexpr auto BF_N = defaults::N * 2;
using DEFAULT_BLOCK = cuda::std::array<nvbench::uint64_t, 4>;

NVBENCH_BENCH_TYPES(bloom_filter_add,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<cuco::default_hash_function<char>>,
nvbench::type_list<DEFAULT_BLOCK>,
nvbench::type_list<defaults::BF_HASH>,
nvbench::type_list<defaults::BF_BLOCK>,
nvbench::type_list<distribution::unique>))
.set_name("bloom_filter_add_unique_size")
.set_type_axes_names({"Key", "Hash", "Block", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {BF_N})
.add_int64_axis("FilterSizeMB", defaults::FILTER_SIZE_MB_RANGE_CACHE)
.add_int64_axis("PatternBits", {defaults::PATTERN_BITS});
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE)
.add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});

NVBENCH_BENCH_TYPES(bloom_filter_add,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
defaults::HASH_RANGE,
nvbench::type_list<DEFAULT_BLOCK>,
nvbench::type_list<defaults::BF_BLOCK>,

nvbench::type_list<distribution::unique>))
.set_name("bloom_filter_add_unique_hash")
.set_type_axes_names({"Key", "Hash", "Block", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {BF_N})
.add_int64_axis("FilterSizeMB", {defaults::FILTER_SIZE_MB})
.add_int64_axis("PatternBits", {defaults::PATTERN_BITS});
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB})
.add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});

NVBENCH_BENCH_TYPES(bloom_filter_add,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<cuco::default_hash_function<char>>,
nvbench::type_list<defaults::BF_HASH>,
nvbench::type_list<cuda::std::array<nvbench::uint32_t, 1>,
cuda::std::array<nvbench::uint32_t, 2>,
cuda::std::array<nvbench::uint32_t, 4>,
Expand All @@ -180,39 +188,39 @@ NVBENCH_BENCH_TYPES(bloom_filter_add,
.set_name("bloom_filter_add_unique_block_dim")
.set_type_axes_names({"Key", "Hash", "Block", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {BF_N})
.add_int64_axis("FilterSizeMB", {defaults::FILTER_SIZE_MB})
.add_int64_axis("PatternBits", {defaults::PATTERN_BITS});
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB})
.add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});

NVBENCH_BENCH_TYPES(bloom_filter_contains,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<cuco::default_hash_function<char>>,
nvbench::type_list<DEFAULT_BLOCK>,
nvbench::type_list<defaults::BF_HASH>,
nvbench::type_list<defaults::BF_BLOCK>,

nvbench::type_list<distribution::unique>))
.set_name("bloom_filter_contains_unique_size")
.set_type_axes_names({"Key", "Hash", "Block", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {BF_N})
.add_int64_axis("FilterSizeMB", defaults::FILTER_SIZE_MB_RANGE_CACHE)
.add_int64_axis("PatternBits", {defaults::PATTERN_BITS});
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE)
.add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});

NVBENCH_BENCH_TYPES(bloom_filter_contains,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
defaults::HASH_RANGE,
nvbench::type_list<DEFAULT_BLOCK>,
nvbench::type_list<defaults::BF_BLOCK>,

nvbench::type_list<distribution::unique>))
.set_name("bloom_filter_contains_unique_hash")
.set_type_axes_names({"Key", "Hash", "Block", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {BF_N})
.add_int64_axis("FilterSizeMB", {defaults::FILTER_SIZE_MB})
.add_int64_axis("PatternBits", {defaults::PATTERN_BITS});
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB})
.add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});

NVBENCH_BENCH_TYPES(bloom_filter_contains,
NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE,
nvbench::type_list<cuco::default_hash_function<char>>,
nvbench::type_list<defaults::BF_HASH>,
nvbench::type_list<cuda::std::array<nvbench::uint32_t, 1>,
cuda::std::array<nvbench::uint32_t, 2>,
cuda::std::array<nvbench::uint32_t, 4>,
Expand All @@ -225,9 +233,9 @@ NVBENCH_BENCH_TYPES(bloom_filter_contains,
.set_name("bloom_filter_contains_unique_block_dim")
.set_type_axes_names({"Key", "Hash", "Block", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {BF_N})
.add_int64_axis("FilterSizeMB", {defaults::FILTER_SIZE_MB})
.add_int64_axis("PatternBits", {defaults::PATTERN_BITS});
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", {defaults::BF_SIZE_MB})
.add_int64_axis("PatternBits", {defaults::BF_PATTERN_BITS});

/*
// benchmark outer product of configuration space
Expand All @@ -247,8 +255,8 @@ NVBENCH_BENCH_TYPES(
.set_name("bloom_filter_add_unique_product")
.set_type_axes_names({"Key", "Hash", "Block", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {BF_N})
.add_int64_axis("FilterSizeMB", defaults::FILTER_SIZE_MB_RANGE_CACHE)
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE)
.add_int64_axis("PatternBits", {1, 2, 4, 6, 8, 10});
NVBENCH_BENCH_TYPES(
Expand All @@ -267,7 +275,7 @@ NVBENCH_BENCH_TYPES(
.set_name("bloom_filter_contains_unique_product")
.set_type_axes_names({"Key", "Hash", "Block", "Distribution"})
.set_max_noise(defaults::MAX_NOISE)
.add_int64_axis("NumInputs", {BF_N})
.add_int64_axis("FilterSizeMB", defaults::FILTER_SIZE_MB_RANGE_CACHE)
.add_int64_axis("NumInputs", {defaults::BF_N})
.add_int64_axis("FilterSizeMB", defaults::BF_SIZE_MB_RANGE_CACHE)
.add_int64_axis("PatternBits", {1, 2, 4, 6, 8, 10});
*/
39 changes: 39 additions & 0 deletions benchmarks/bloom_filter/defaults.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cuco/bloom_filter_policy.cuh>

#include <nvbench/nvbench.cuh>

#include <cstdint>
#include <vector>

namespace cuco::benchmark::defaults {

static constexpr auto BF_N = 400'000'000;
static constexpr auto BF_SIZE_MB = 2'000;
using BF_POLICY = cuco::default_filter_policy<char>;
using BF_HASH = typename BF_POLICY::hasher;
using BF_BLOCK = cuda::std::array<typename BF_POLICY::word_type, BF_POLICY::words_per_block>;
// This is a dummy value which will be dynamically replaced with the filter's actual default
auto constexpr BF_PATTERN_BITS = 0;
auto const BF_SIZE_MB_RANGE_CACHE =
std::vector<nvbench::int64_t>{1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};
auto const BF_PATTERN_BITS_RANGE = std::vector<nvbench::int64_t>{1, 2, 4, 6, 8, 16};

} // namespace cuco::benchmark::defaults
4 changes: 2 additions & 2 deletions examples/bloom_filter/host_bulk_example.cu
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,8 @@ int main(void)
int constexpr num_tp = num_keys * 0.5;
int constexpr num_tn = num_keys - num_tp;

// Spawn a filter with 1000 sub-filters and 6-bit patterns for each key.
cuco::bloom_filter<int> filter{1000, 6};
// Spawn a filter with 200 sub-filters.
cuco::bloom_filter<int> filter{200};

thrust::device_vector<int> keys(num_keys);
thrust::sequence(keys.begin(), keys.end(), 1);
Expand Down
Loading

0 comments on commit 5b3fe40

Please sign in to comment.