From 68167400d2c22eab4ac80045eb3bd4121af140de Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20J=C3=BCnger?= Date: Sat, 19 Oct 2024 01:09:39 +0200 Subject: [PATCH] Implement OA `retrieve(_outer)` and its `multiset` API (#537) --- README.md | 7 + benchmarks/CMakeLists.txt | 1 + benchmarks/static_multiset/retrieve_bench.cu | 87 +++++ examples/CMakeLists.txt | 1 + examples/static_multiset/host_bulk_example.cu | 82 +++++ include/cuco/detail/extent/extent.inl | 2 +- .../cuco/detail/open_addressing/kernels.cuh | 71 ++++ .../open_addressing/open_addressing_impl.cuh | 148 ++++++++- .../open_addressing_ref_impl.cuh | 304 ++++++++++++++++++ include/cuco/detail/prime.hpp | 2 +- .../static_multiset/static_multiset.inl | 40 +++ .../static_multiset/static_multiset_ref.inl | 117 +++++++ include/cuco/detail/utility/cuda.hpp | 2 +- .../detail/utility/{math.hpp => math.cuh} | 8 +- include/cuco/detail/utils.hpp | 14 +- include/cuco/operator.hpp | 6 + include/cuco/static_multiset.cuh | 69 ++++ tests/CMakeLists.txt | 4 +- tests/static_multiset/large_input_test.cu | 77 +++++ tests/static_multiset/retrieve_test.cu | 163 ++++++++++ 20 files changed, 1189 insertions(+), 16 deletions(-) create mode 100644 benchmarks/static_multiset/retrieve_bench.cu create mode 100644 examples/static_multiset/host_bulk_example.cu rename include/cuco/detail/utility/{math.hpp => math.cuh} (84%) create mode 100644 tests/static_multiset/large_input_test.cu create mode 100644 tests/static_multiset/retrieve_test.cu diff --git a/README.md b/README.md index c0f674bae..a11f5afdb 100644 --- a/README.md +++ b/README.md @@ -227,6 +227,13 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection #### Examples: - [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multimap/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJylVgtv2zYQ_isHDUXtVJYfaFDEjQN4bYoZK5whTlsUcaHQFG0TkUmNpOx6hv_77ijJlpsM67AWiCHe-7vvjtwFVlgrtbJB_34XyCTod8MgZWqRs4UI-gHPExaEgdW54fTdPpsqOIN3OtsauVg6aPAm9Dq9bgv_vA5h_Hn0fjSEdze3f9zcDu9GN-OIDLzRR8mFsiKBXCXCgFsKGGaM408pCeGzMJQN9KIONEhhGpSyadB8671sdQ4rtgWlHeRWoBtpYS5TAeI7F5kDqYDrVZZKpriAjXRLH6r049OBr6UTPXMM9RlaZPg1r2sCc4fU6d_Suazfbm82m4j5tCNtFu20ULbtj6N31-PJdQtTP5h9UinCC0b8mUuDhc-2wDLMjLMZ5puyDWgDbGEEypymzDdGOqkWIVg9dxtmhPeTSOuMnOXuBLwqT6y_roDwMYXADScwmkwD-HU4GU1C7-fL6O63m0938GV4ezsc342uJ3Bzi80avx9Rq_DrAwzHX-H30fh9CAKhw1Die2aoCkxVEqwiKTCcCHGSxlwXadlMcDmXHCoawUKvhVFYFmTCrGRBOEwy8X5SuZKOOX_2pDgfqj1VU_WLVDzNEwGXPOe6bcmEx6s8dXLFsojny6tTNbc0uXXtRKzRVbwW3GkTkdITFemEYShtc50rgj-uTp7Xt9hPgeR6XuoMUxbBWEU_ZuQrtf5QKocElKqx1jJpTtUO60Q6E0aPYhu7bSaQcgOkhHt7FK1ZmotCWIlIeLAQq8xtY_q0ggoRqWfuAFpd76VmX6gWBwflUpFUObbDUevBuqTft_IvNIQxqpx3XnY6nVKt3cZdgJom5w5bClU_irnrdjoh6oJNNUqLGlpd6rz_sb7TPhOqoe2zKb1WOdkIxtoVTOM0dxKVkfF8qVEFHpXekNcNTXqaIihWGIc5-rjo1IalQ6InzkWqWQJzRmSgeT_vvIh8ucgprPKUVJcVsGENuSvsW-bbBQjHGfTC0vgA_u5pG_anSt7b7rkW7PclsAWZ-v0T-l4WTjImzfO5XQHJbGPcrPXHCIYAMqhoS3V7tQiuBS4yggmW1I2NxuIcrjYblcbX0SIKYbdDPLGGXTfs4k8URYAnvXMiQnEM_qMU7usFHMahUWBWna_Yo4ifDNwlsvqq0WmGP69clEIEbTQru-JsJhY4YdXZ_TeI4xLOuMFyWrhN2OF2drlR8G_Q7iS8gMYY2tBrhiD3b2FfA3nkeYcTgBz0wWk8tWct0oWUaEkV7GycZlcmK1SC-R89TurdMnomPJsJdyC4iy68vnh5cXGx_2fCVKVceevY6XguMZDnx9GmYkajrnRM8OTU5xlCp84wakwxzJrz3BjyZSlxnzJea_eYdbE3mrRZ6NLFmwoFHqBqh5fu7vDsIdZ4mZkHsPl8Lr-jaoK3phMUhbkfQlEkhg8C1fLspfXQ9c58k_0mA3SX5S4mmuAO8zcGZV2E-em6m_9zOpMYr1Is1zZq-dSQ_ICRChI9bfbe702LgRAFZq3mktF9f9icdeI9HCI9VAwljGjSH-pl3ctvD5BoYdVLhzc9PiPCmi1KIysQwAQGg2evjIcDzGVBgnQ9vjhXRoq1-I8Qh0eQKrWCrD7KwWnZyFrU1lPDI7DEKacdS0Hlq5nwu9_nVS08sEudp_hEQ-6N4RX4QS-qs35o8b3ZOCHR4MdkBnAwPBClXC50XeKU4kuanqb4WDXHB3eg1px3e-d5F8U6c8VrPGjhvTvgr15130CLGb4c2FX8pgOtFl7KDv84rFkkrZStZv6JnspZzSfnPMXDdfGexgO8odVjsA8rOVL1RI5MDvbf_P-_AYKkJA4=)) +### `static_multiset` + +`cuco::static_multiset` is a fixed-size container that supports storing equivalent keys. It uses double hashing by default and supports switching to linear probing. See the Doxygen documentation in `static_multiset.cuh` for more detailed information. + +#### Examples: +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/static_multiset/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/clientstate/eJyVVw1vGkcQ_SuTqypDcnxZjSIRuyq1HRU1wpFxEkUhwsveACvf3dLdPQi1_N87s3sHhz_a1JZsuJ19--bNzFu4iyxaq3Ruo_7Xu0glUb8XR6nIF4VYYNSPZJGIKI6sLozk952XkxxewplebY1aLB00ZBOOu8e_xDD6NDwfDuDs8urD5dXgeng5anOsj3-vJOYWEyjyBA24JcJgJST9K1di-ISGicBxuwsNDphE5dokar71KFtdQCa2kGsHhUWCURbmKkXA7xJXDlQOUmerVIlcImyUW_qjShxPB76UIHrmBMUL2rGid_N6JAi3o84_S-dW_U5ns9m0hafd1mbRSUOw7bwfnl2Mxhctor7b9jFPSVkw-FehDCU-24JYETMpZsQ3FRvQBsTCIK05zcw3RjmVL2Kweu42wqDHSZR1Rs0KdyBexZPyrweQfCIn4QZjGI4nEfw-GA_Hscf5PLz-4_LjNXweXF0NRtfDizFcXlGxRudDLhW9eweD0Rf4czg6jwFJOjoKv68MZ0FUFcuKSdBwjHhAY64DLbtCqeZKQtVBsNBrNDmlBSs0mQq9RiQTj5OqTDnh_LNHyfmjOpN8kv-kcpkWCcKJLKTuWN4ip1mROmXRtWWx_PUwzC1NYV0nwTVBTdconTZtDnoUMi9yyeeL9On1VC-oZM8sWiouUqe1H56vNNUERXawySdrfWTnZWiS33zzLil6OivS2yl-FyQzUkZheWYUzuEcMxLIGeGQZLIsa9muD5SgyjMMFZ4hYfBhaPcjeE3xvOofA7UXdUu6pcZb69CTc6Mzj-o3U4l8UMFTy_VNNFdRUxnLgqXqFuFGUaWMu6EW8afcGKRuxDXe-GaEHTVifIvbPR8qrModTaLKG2utkuYkv6PndB4fQ6FTt10hnBJB95Y1A-h04CJbuS3YVLuQgkFuUMxdmDB-bdb0ehLxQ5VjSmKsRVqgbbMC1KvhHdilLlLaRCJiSg1Cm2whl-U5bikca0FeQ7kY0FIWhoeULIj_rwoHiXCC09mTlVwmnhlA5jnlhYoHZdLq7RMZFdmMcYMq7ADEJGiJPBtU2qTft-pvnLoabl5kU7_hFF53j7rd7h7xjIyPTAAkO5QilWbClpZAsT9DqgUVUvAo8A5R0Jl7YF6dhlXC7rZfv32SxB79NCxKVGljx6pTx2nWuXH_FpKrBtwN3ppJ4hQFgd5UqDdlaUMTtHogrO9IL2dQqtLTK892QAQPp-Ckqsevu-67qw6Iyy27-tw9rtT9fY04TTFpyqTDqO8qdteNoRfDcQztdgzqvtzwGY8Mux7z97bOFQWkO4O3gduQIZHBw6Ycv5z9_6gm4PGRd3RyDbcbGAh-0-8fGFotTw6sV-HYX5i7bRX5hgec4YJmrhkHeMwTft2tVWvoSVOp0jTkSiOogzMIu-T6ceDOfkOOz2IHKjvY4FsiQ6i5Al2E5AN88wWp6gpUkxpGMAhIg8gwFYX_S2fXlUXuQoN5cMMaWeaUCSeXpQ9Z7nz-qECFenBu-5kZYVxMprpwNJFTXqNx2V9XvPwj_MZUZDZ7dl9yNtr_I71Qnroyeoa28QSXw974DxivxPM4JdOr0vR9yxyId2g18PWA3ZQyjmH6DU45rFbD6hL5l4Y9ANqHHNKunjefcbPqnIe1esQSWk8fWEqg5tBoPAN2-lQ_NP39-vyWapabTbgL2gS3pTA4OaHrbVxI6lX7At4ReLIb1faECEU-23vAlK67p7e_Eyp9AWOdofOl2pD30QdQnS_qCCE5IlmYHPiquaevAvwBmz61mP03hihfS9k7fl30aFmvXPg6EbXozFP56lXvDbSEkctTm03fdKHVogvH0R9HumLSSkU2898xUjWrYUopU3q4Dt8K6AHdH_ltdB9X6-TjB-vUyNH9N__7D2OnfWU=)) + ### `dynamic_map` `cuco::dynamic_map` links together multiple `cuco::static_map`s to provide a hash table that can grow as key-value pairs are inserted. It currently only provides host-bulk APIs. See the Doxygen documentation in `dynamic_map.cuh` for more detailed information. diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 27b858898..c4d4dbc37 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -69,6 +69,7 @@ ConfigureBench(STATIC_MAP_BENCH # - static_multiset benchmarks -------------------------------------------------------------------- ConfigureBench(STATIC_MULTISET_BENCH static_multiset/contains_bench.cu + static_multiset/retrieve_bench.cu static_multiset/count_bench.cu static_multiset/find_bench.cu static_multiset/insert_bench.cu) diff --git a/benchmarks/static_multiset/retrieve_bench.cu b/benchmarks/static_multiset/retrieve_bench.cu new file mode 100644 index 000000000..efd694946 --- /dev/null +++ b/benchmarks/static_multiset/retrieve_bench.cu @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include + +#include + +#include +#include + +using namespace cuco::benchmark; +using namespace cuco::utility; + +/** + * @brief A benchmark evaluating `cuco::static_multiset::retrieve` performance + */ +template +void static_multiset_retrieve(nvbench::state& state, nvbench::type_list) +{ + auto const num_keys = state.get_int64_or_default("NumInputs", defaults::N); + auto const occupancy = state.get_float64_or_default("Occupancy", defaults::OCCUPANCY); + auto const matching_rate = state.get_float64_or_default("MatchingRate", defaults::MATCHING_RATE); + + std::size_t const size = num_keys / occupancy; + + thrust::device_vector keys(num_keys); + + key_generator gen; + gen.generate(dist_from_state(state), keys.begin(), keys.end()); + + gen.dropout(keys.begin(), keys.end(), matching_rate); + + state.add_element_count(num_keys); + + cuco::static_multiset set{size, cuco::empty_key{-1}}; + set.insert(keys.begin(), keys.end()); + + auto const output_size = set.count(keys.begin(), keys.end()); + thrust::device_vector output_match(output_size); + auto output_probe_begin = thrust::discard_iterator{}; + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + set.retrieve( + keys.begin(), keys.end(), output_probe_begin, output_match.begin(), {launch.get_stream()}); + }); +} + +NVBENCH_BENCH_TYPES(static_multiset_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multiset_retrieve_uniform_occupancy") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("Occupancy", defaults::OCCUPANCY_RANGE); + +NVBENCH_BENCH_TYPES(static_multiset_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multiset_retrieve_uniform_matching_rate") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_float64_axis("MatchingRate", defaults::MATCHING_RATE_RANGE); + +NVBENCH_BENCH_TYPES(static_multiset_retrieve, + NVBENCH_TYPE_AXES(defaults::KEY_TYPE_RANGE, + nvbench::type_list)) + .set_name("static_multiset_retrieve_uniform_multiplicity") + .set_type_axes_names({"Key", "Distribution"}) + .set_max_noise(defaults::MAX_NOISE) + .add_int64_axis("Multiplicity", defaults::MULTIPLICITY_RANGE); diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 4058600fa..8e7be947a 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -38,6 +38,7 @@ ConfigureExample(STATIC_SET_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/stat ConfigureExample(STATIC_SET_DEVICE_SUBSETS_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/device_subsets_example.cu") ConfigureExample(STATIC_SET_SHARED_MEMORY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/shared_memory_example.cu") ConfigureExample(STATIC_SET_MAPPING_TABLE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_set/mapping_table_example.cu") +ConfigureExample(STATIC_MULTISET_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multiset/host_bulk_example.cu") ConfigureExample(STATIC_MAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/host_bulk_example.cu") ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/device_ref_example.cu") ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") diff --git a/examples/static_multiset/host_bulk_example.cu b/examples/static_multiset/host_bulk_example.cu new file mode 100644 index 000000000..8ce96b99a --- /dev/null +++ b/examples/static_multiset/host_bulk_example.cu @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include +#include +#include + +#include +#include + +/** + * @file host_bulk_example.cu + * @brief Demonstrates usage of the static_multiset "bulk" host APIs. + * + * The bulk APIs are only invocable from the host and are used for doing operations like `insert` or + * `retrieve` on a multiset of keys. + * + */ +int main(void) +{ + using key_type = int; + + // Empty slots are represented by reserved "sentinel" values. These values should be selected such + // that they never occur in your input data. + key_type constexpr empty_key_sentinel = -1; + + // Number of keys to be inserted + std::size_t constexpr num_keys = 50'000; + + // Compute capacity based on a 50% load factor + auto constexpr load_factor = 0.5; + std::size_t const capacity = std::ceil(num_keys / load_factor); + + // Constructs a set with at least `capacity` slots using -1 as the empty keys sentinel. + cuco::static_multiset multiset{capacity, cuco::empty_key{empty_key_sentinel}}; + + // Create a sequence of keys {0, 1, 2, .., i} + // We're going to insert each key twice so we only need 'num_keys / 2' distinct keys. + thrust::device_vector keys(num_keys / 2); + thrust::sequence(keys.begin(), keys.end(), 0); + + // Inserts all keys into the hash set + multiset.insert(keys.begin(), keys.end()); + // Insert the same set of keys again, so each distinct key should occur twice in the multiset + multiset.insert(keys.begin(), keys.end()); + + // Counts the occurrences of matching keys contained in the multiset. + std::size_t const counted_output_size = multiset.count(keys.begin(), keys.end()); + + // Storage for result + thrust::device_vector output_probes(counted_output_size); + thrust::device_vector output_matches(counted_output_size); + + // Retrieve all matching keys + auto const [output_probes_end, _] = + multiset.retrieve(keys.begin(), keys.end(), output_probes.begin(), output_matches.begin()); + std::size_t const retrieved_output_size = output_probes_end - output_probes.begin(); + + if ((retrieved_output_size == counted_output_size) and (retrieved_output_size == num_keys)) { + std::cout << "Success! Found all keys.\n"; + } else { + std::cout << "Fail! Something went wrong.\n"; + } + + return 0; +} \ No newline at end of file diff --git a/include/cuco/detail/extent/extent.inl b/include/cuco/detail/extent/extent.inl index c04f6c329..13f056a57 100644 --- a/include/cuco/detail/extent/extent.inl +++ b/include/cuco/detail/extent/extent.inl @@ -18,7 +18,7 @@ #include #include // TODO move to detail/extent/ -#include +#include #include #include diff --git a/include/cuco/detail/open_addressing/kernels.cuh b/include/cuco/detail/open_addressing/kernels.cuh index 24fce230c..4726683a3 100644 --- a/include/cuco/detail/open_addressing/kernels.cuh +++ b/include/cuco/detail/open_addressing/kernels.cuh @@ -399,6 +399,77 @@ CUCO_KERNEL __launch_bounds__(BlockSize) void find(InputIt first, } } +/** + * @brief Retrieves the equivalent container elements of all keys in the range `[input_probe, + * input_probe + n)`. + * + * If key `k = *(input_probe + i)` has one or more matches in the container, copies `k` to + * `output_probe` and associated slot contents to `output_match`, respectively. The output order is + * unspecified. + * + * @tparam IsOuter Flag indicating whether it's an outer count or not + * @tparam block_size The size of the thread block + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Integral atomic type that follows the same semantics as + * `cuda::(std::)atomic(_ref)` + * @tparam Ref Type of non-owning device ref allowing access to storage + * + * @param input_probe Beginning of the sequence of input keys + * @param n Number of the keys to query + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Pointer to an atomic object of integral type that is used to count the + * number of output elements + * @param ref Non-owning container device ref used to access the slot storage + */ +template +CUCO_KERNEL __launch_bounds__(BlockSize) void retrieve(InputProbeIt input_probe, + cuco::detail::index_type n, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter* atomic_counter, + Ref ref) +{ + namespace cg = cooperative_groups; + + auto const block = cg::this_thread_block(); + auto constexpr tiles_in_block = BlockSize / Ref::cg_size; + // make sure all but the last block are always occupied + auto const items_per_block = detail::int_div_ceil(n, tiles_in_block * gridDim.x) * tiles_in_block; + + auto const block_begin_offset = block.group_index().x * items_per_block; + auto const block_end_offset = min(n, block_begin_offset + items_per_block); + + if (block_begin_offset < block_end_offset) { + if constexpr (IsOuter) { + ref.retrieve_outer(block, + input_probe + block_begin_offset, + input_probe + block_end_offset, + output_probe, + output_match, + *atomic_counter); + } else { + ref.retrieve(block, + input_probe + block_begin_offset, + input_probe + block_end_offset, + output_probe, + output_match, + *atomic_counter); + } + } +} + /** * @brief Inserts all elements in the range `[first, last)`. * diff --git a/include/cuco/detail/open_addressing/open_addressing_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_impl.cuh index a8eff9036..3d7abc9a5 100644 --- a/include/cuco/detail/open_addressing/open_addressing_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_impl.cuh @@ -574,6 +574,91 @@ class open_addressing_impl { first, num_keys, output_begin, container_ref); } + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * This function synchronizes the given CUDA stream. + * + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the input sequence of keys + * @param last End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param container_ref Non-owning device reference to the container + * @param stream CUDA stream this operation is executed in + * + * @return Iterator pair indicating the the end of the output sequences + */ + template + std::pair retrieve(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + Ref container_ref, + cuda::stream_ref stream) const + { + auto constexpr is_outer = false; + return this->retrieve_impl( + first, last, output_probe, output_match, container_ref, stream); + } + + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count_outer()` to determine the size of the output range. + * + * If a key `k` has no matches in the container, then `{key, empty_slot_sentinel}` will be added + * to the output sequence. + * + * This function synchronizes the given CUDA stream. + * + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the input sequence of keys + * @param last End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param container_ref Non-owning device reference to the container + * @param stream CUDA stream this operation is executed in + * + * @return Iterator pair indicating the the end of the output sequences + */ + template + std::pair retrieve_outer(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + Ref container_ref, + cuda::stream_ref stream) const + { + auto constexpr is_outer = true; + return this->retrieve_impl( + first, last, output_probe, output_match, container_ref, stream); + } + /** * @brief Counts the occurrences of keys in `[first, last)` contained in the container * @@ -996,6 +1081,67 @@ class open_addressing_impl { return counter.load_to_host(stream); } + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()/count_outer()` to determine the size of the output range. + * + * If `IsOuter == true` and a key `k` has no matches in the container, then `{key, + * empty_slot_sentinel}` will be added to the output sequence. + * + * This function synchronizes the given CUDA stream. + * + * @tparam IsOuter Flag indicating if an inner or outer retrieve operation should be performed + * @tparam InputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam Ref Type of non-owning device container ref allowing access to storage + * + * @param first Beginning of the input sequence of keys + * @param last End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param container_ref Non-owning device reference to the container + * @param stream CUDA stream this operation is executed in + * + * @return Iterator pair indicating the the end of the output sequences + */ + template + std::pair retrieve_impl(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + Ref container_ref, + cuda::stream_ref stream) const + { + auto const n = detail::distance(first, last); + if (n == 0) { return {output_probe, output_match}; } + + using counter_type = detail::counter_storage; + auto counter = counter_type{this->allocator()}; + counter.reset(stream.get()); + + int32_t constexpr block_size = cuco::detail::default_block_size(); + + auto constexpr grid_stride = 1; + auto const grid_size = cuco::detail::grid_size(n, cg_size, grid_stride, block_size); + + detail::retrieve<<>>( + first, n, output_probe, output_match, counter.data(), container_ref); + + auto const num_retrieved = counter.load_to_host(stream.get()); + + return {output_probe + num_retrieved, output_match + num_retrieved}; + } + /** * @brief Extracts the key from a given slot. * @@ -1022,4 +1168,4 @@ class open_addressing_impl { }; } // namespace detail -} // namespace cuco +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh index c78705804..56a58a9e4 100644 --- a/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh +++ b/include/cuco/detail/open_addressing/open_addressing_ref_impl.cuh @@ -18,6 +18,7 @@ #include #include +#include #include #include #include @@ -977,6 +978,309 @@ class open_addressing_ref_impl { } } + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Integral atomic counter type that follows the same semantics as + * `cuda::(std::)atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe_begin Beginning of the input sequence of keys + * @param input_probe_end End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Pointer to an atomic object of integral type that is used to count the + * number of output elements + */ + template + __device__ void retrieve(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto constexpr is_outer = false; + auto const n = cuco::detail::distance(input_probe_begin, input_probe_end); // TODO include + this->retrieve_impl( + block, input_probe_begin, n, output_probe, output_match, atomic_counter); + } + + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * If a key `k` has no matches in the container, then `{key, empty_slot_sentinel}` will be added + * to the output sequence. + * + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Integral atomic counter type that follows the same semantics as + * `cuda::(std::)atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe_begin Beginning of the input sequence of keys + * @param input_probe_end End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Pointer to an atomic object of integral type that is used to count the + * number of output elements + */ + template + __device__ void retrieve_outer(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto constexpr is_outer = true; + auto const n = cuco::detail::distance(input_probe_begin, input_probe_end); // TODO include + this->retrieve_impl( + block, input_probe_begin, n, output_probe, output_match, atomic_counter); + } + + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * If `IsOuter == true` and a key `k` has no matches in the container, then `{key, + * empty_slot_sentinel}` will be added to the output sequence. + * + * @tparam IsOuter Flag indicating if an inner or outer retrieve operation should be performed + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Integral atomic type that follows the same semantics as + * `cuda::(std::)atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe_begin Beginning of the input sequence of keys + * @param input_probe_end End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Pointer to an atomic object of integral type that is used to count the + * number of output elements + */ + template + __device__ void retrieve_impl(cooperative_groups::thread_block const& block, + InputProbeIt input_probe, + cuco::detail::index_type n, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + namespace cg = cooperative_groups; + + if (n == 0) { return; } + + using probe_type = typename std::iterator_traits::value_type; + + // tuning parameter + auto constexpr buffer_multiplier = 1; + static_assert(buffer_multiplier > 0); + + auto constexpr probing_tile_size = cg_size; + auto constexpr flushing_tile_size = cuco::detail::warp_size(); + static_assert(flushing_tile_size >= probing_tile_size); + + auto constexpr num_flushing_tiles = BlockSize / flushing_tile_size; + auto constexpr max_matches_per_step = flushing_tile_size * window_size; + auto constexpr buffer_size = buffer_multiplier * max_matches_per_step; + + auto const flushing_tile = cg::tiled_partition(block); + auto const probing_tile = cg::tiled_partition(block); + + auto const flushing_tile_id = flushing_tile.meta_group_rank(); + auto idx = probing_tile.meta_group_rank(); + auto const stride = probing_tile.meta_group_size(); + + // TODO align to 16B? + __shared__ probe_type probe_buffers[num_flushing_tiles][buffer_size]; + __shared__ value_type match_buffers[num_flushing_tiles][buffer_size]; + size_type num_matches = 0; + + auto flush_buffers = [&](cg::coalesced_group const& tile) { + auto const rank = tile.thread_rank(); + +#if defined(CUCO_HAS_CG_INVOKE_ONE) + auto const offset = cg::invoke_one_broadcast(tile, [&]() { + return atomic_counter.fetch_add(num_matches, cuda::std::memory_order_relaxed); + }); +#else + size_type offset; + if (rank == 0) { + offset = atomic_counter.fetch_add(num_matches, cuda::std::memory_order_relaxed); + } + offset = tile.shfl(offset, 0); +#endif + + // flush_buffers + for (size_type i = rank; i < num_matches; i += tile.size()) { + *(output_probe + offset + i) = probe_buffers[flushing_tile_id][i]; + *(output_match + offset + i) = match_buffers[flushing_tile_id][i]; + } + }; + + while (flushing_tile.any(idx < n)) { + bool active_flag = idx < n; + auto const active_flushing_tile = + cg::binary_partition(flushing_tile, active_flag); + + if (active_flag) { + // perform probing + // make sure the flushing_tile is converged at this point to get a coalesced load + auto const& probe = *(input_probe + idx); + auto probing_iter = + this->probing_scheme_(probing_tile, probe, this->storage_ref_.window_extent()); + bool empty_found = false; + bool match_found = false; + [[maybe_unused]] bool found_any_match = false; // only needed if `IsOuter == true` + + while (true) { + // TODO atomic_ref::load if insert operator is present + auto const window_slots = this->storage_ref_[*probing_iter]; + + for (int32_t i = 0; i < window_size; ++i) { + if (not empty_found) { + // inspect slot content + switch (this->predicate_.operator()( + probe, this->extract_key(window_slots[i]))) { + case detail::equal_result::EMPTY: { + empty_found = true; + break; + } + case detail::equal_result::EQUAL: { + match_found = true; + break; + } + default: { + break; + } + } + } + + if (active_flushing_tile.any(match_found)) { + auto const matching_tile = cg::binary_partition(active_flushing_tile, match_found); + // stage matches in shmem buffer + if (match_found) { + probe_buffers[flushing_tile_id][num_matches + matching_tile.thread_rank()] = probe; + match_buffers[flushing_tile_id][num_matches + matching_tile.thread_rank()] = + window_slots[i]; + } + + // add number of new matches to the buffer counter + num_matches += (match_found) ? matching_tile.size() + : active_flushing_tile.size() - matching_tile.size(); + } + + if constexpr (IsOuter) { + if (not found_any_match /*yet*/ and probing_tile.any(match_found) /*now*/) { + found_any_match = true; + } + } + + // reset flag for next iteration + match_found = false; + } + empty_found = probing_tile.any(empty_found); + + // check if all probing tiles have finished their work + bool const finished = active_flushing_tile.all(empty_found); + + if constexpr (IsOuter) { + if (finished) { + bool const writes_sentinel = + ((probing_tile.thread_rank() == 0) and not found_any_match); + + auto const sentinel_writers = + cg::binary_partition(active_flushing_tile, writes_sentinel); + if (writes_sentinel) { + auto const rank = sentinel_writers.thread_rank(); + probe_buffers[flushing_tile_id][num_matches + rank] = probe; + match_buffers[flushing_tile_id][num_matches + rank] = this->empty_slot_sentinel(); + } + // add number of new matches to the buffer counter + num_matches += (writes_sentinel) + ? sentinel_writers.size() + : active_flushing_tile.size() - sentinel_writers.size(); + } + } + + // if the buffer has not enough empty slots for the next iteration + if (num_matches > (buffer_size - max_matches_per_step)) { + flush_buffers(active_flushing_tile); + + // reset buffer counter + num_matches = 0; + } + + // the entire flushing tile has finished its work + if (finished) { break; } + + // onto the next probing window + ++probing_iter; + } + + // entire flusing_tile has finished; flush remaining elements + if (num_matches != 0 and active_flushing_tile.all((idx + stride) >= n)) { + flush_buffers(active_flushing_tile); + } + } + + // onto the next key + idx += stride; + } + } + /** * @brief For a given key, applies the function object `callback_op` to the copy of all * corresponding matches found in the container. diff --git a/include/cuco/detail/prime.hpp b/include/cuco/detail/prime.hpp index c788fa245..4ef0a35e3 100644 --- a/include/cuco/detail/prime.hpp +++ b/include/cuco/detail/prime.hpp @@ -16,7 +16,7 @@ #pragma once -#include +#include #include #include diff --git a/include/cuco/detail/static_multiset/static_multiset.inl b/include/cuco/detail/static_multiset/static_multiset.inl index 9ab01f8cb..b88f0f6ff 100644 --- a/include/cuco/detail/static_multiset/static_multiset.inl +++ b/include/cuco/detail/static_multiset/static_multiset.inl @@ -277,6 +277,46 @@ void static_multisetfind_async(first, last, output_begin, ref(op::find), stream); } +template +template +std::pair +static_multiset::retrieve( + InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda::stream_ref stream) const +{ + return this->impl_->retrieve( + first, last, output_probe, output_match, this->ref(op::retrieve), stream); +} + +template +template +std::pair +static_multiset::retrieve_outer( + InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda::stream_ref stream) const +{ + return this->impl_->retrieve_outer( + first, last, output_probe, output_match, this->ref(op::retrieve), stream); +} + template +class operator_impl< + op::retrieve_tag, + static_multiset_ref> { + using base_type = static_multiset_ref; + using ref_type = + static_multiset_ref; + using key_type = typename base_type::key_type; + using value_type = typename base_type::value_type; + using iterator = typename base_type::iterator; + using const_iterator = typename base_type::const_iterator; + + static constexpr auto cg_size = base_type::cg_size; + static constexpr auto window_size = base_type::window_size; + + public: + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Atomic counter type that follows the same semantics as + * `cuda::atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe_begin Beginning of the input sequence of keys + * @param input_probe_end End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Counter that is used to determine the next free position in the output + * sequences + */ + template + __device__ void retrieve(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto const& ref_ = static_cast(*this); + ref_.impl_.retrieve( + block, input_probe_begin, input_probe_end, output_probe, output_match, atomic_counter); + } + + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[input_probe_begin, + * input_probe_end)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count_outer()` to determine the size of the output range. + * + * If a key `k` has no matches in the container, then `{key, empty_slot_sentinel}` will be added + * to the output sequence. + * + * @tparam BlockSize Size of the thread block this operation is executed in + * @tparam InputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the container's `key_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * @tparam AtomicCounter Atomic counter type that follows the same semantics as + * `cuda::atomic(_ref)` + * + * @param block Thread block this operation is executed in + * @param input_probe_begin Beginning of the input sequence of keys + * @param input_probe_end End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param atomic_counter Counter that is used to determine the next free position in the output + * sequences + */ + template + __device__ void retrieve_outer(cooperative_groups::thread_block const& block, + InputProbeIt input_probe_begin, + InputProbeIt input_probe_end, + OutputProbeIt output_probe, + OutputMatchIt output_match, + AtomicCounter& atomic_counter) const + { + auto const& ref_ = static_cast(*this); + ref_.impl_.retrieve_outer( + block, input_probe_begin, input_probe_end, output_probe, output_match, atomic_counter); + } +}; + template -#include +#include namespace cuco { namespace detail { diff --git a/include/cuco/detail/utility/math.hpp b/include/cuco/detail/utility/math.cuh similarity index 84% rename from include/cuco/detail/utility/math.hpp rename to include/cuco/detail/utility/math.cuh index 47484d6ad..c2715f6fa 100644 --- a/include/cuco/detail/utility/math.hpp +++ b/include/cuco/detail/utility/math.cuh @@ -15,7 +15,7 @@ #pragma once -#include +#include namespace cuco { namespace detail { @@ -35,10 +35,10 @@ namespace detail { * @return Ceiling of the integer division */ template -constexpr T int_div_ceil(T dividend, U divisor) noexcept +__host__ __device__ constexpr T int_div_ceil(T dividend, U divisor) noexcept { - static_assert(std::is_integral_v); - static_assert(std::is_integral_v); + static_assert(cuda::std::is_integral_v); + static_assert(cuda::std::is_integral_v); return (dividend + divisor - 1) / divisor; } diff --git a/include/cuco/detail/utils.hpp b/include/cuco/detail/utils.hpp index 86c045e3b..d0d777ed6 100644 --- a/include/cuco/detail/utils.hpp +++ b/include/cuco/detail/utils.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,20 +18,20 @@ #include #include -#include -#include +#include +#include namespace cuco { namespace detail { template -constexpr inline index_type distance(Iterator begin, Iterator end) +__host__ __device__ constexpr inline index_type distance(Iterator begin, Iterator end) { - using category = typename std::iterator_traits::iterator_category; - static_assert(std::is_base_of_v, + using category = typename cuda::std::iterator_traits::iterator_category; + static_assert(cuda::std::is_base_of_v, "Input iterator should be a random access iterator."); // `int64_t` instead of arch-dependant `long int` - return static_cast(std::distance(begin, end)); + return static_cast(cuda::std::distance(begin, end)); } /** diff --git a/include/cuco/operator.hpp b/include/cuco/operator.hpp index 303124ec4..bcfe271c8 100644 --- a/include/cuco/operator.hpp +++ b/include/cuco/operator.hpp @@ -68,6 +68,12 @@ struct count_tag { struct find_tag { } inline constexpr find; ///< `cuco::find` operator +/** + * @brief `retrieve` operator tag + */ +struct retrieve_tag { +} inline constexpr retrieve; ///< `cuco::retrieve` operator + /** * @brief `for_each` operator tag */ diff --git a/include/cuco/static_multiset.cuh b/include/cuco/static_multiset.cuh index 22cda307f..a2a65de8c 100644 --- a/include/cuco/static_multiset.cuh +++ b/include/cuco/static_multiset.cuh @@ -476,6 +476,75 @@ class static_multiset { OutputIt output_begin, cuda::stream_ref stream = {}) const; + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count()` to determine the size of the output range. + * + * This function synchronizes the given CUDA stream. + * + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * + * @param first Beginning of the input sequence of keys + * @param last End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param stream CUDA stream this operation is executed in + * + * @return Iterator pair indicating the the end of the output sequences + */ + template + std::pair retrieve(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda::stream_ref stream = {}) const; + + /** + * @brief Retrieves all the slots corresponding to all keys in the range `[first, last)`. + * + * If key `k = *(first + i)` exists in the container, copies `k` to `output_probe` and associated + * slot contents to `output_match`, respectively. The output order is unspecified. + * + * Behavior is undefined if the size of the output range exceeds the number of retrieved slots. + * Use `count_outer()` to determine the size of the output range. + * + * If a key `k` has no matches in the container, then `{key, empty_slot_sentinel}` will be added + * to the output sequence. + * + * This function synchronizes the given CUDA stream. + * + * @tparam InputProbeIt Device accessible input iterator + * @tparam OutputProbeIt Device accessible input iterator whose `value_type` is + * convertible to the `InputProbeIt`'s `value_type` + * @tparam OutputMatchIt Device accessible input iterator whose `value_type` is + * convertible to the container's `value_type` + * + * @param first Beginning of the input sequence of keys + * @param last End of the input sequence of keys + * @param output_probe Beginning of the sequence of keys corresponding to matching elements in + * `output_match` + * @param output_match Beginning of the sequence of matching elements + * @param stream CUDA stream this operation is executed in + * + * @return Iterator pair indicating the the end of the output sequences + */ + template + std::pair retrieve_outer(InputProbeIt first, + InputProbeIt last, + OutputProbeIt output_probe, + OutputMatchIt output_match, + cuda::stream_ref stream = {}) const; + /** * @brief Counts the occurrences of keys in `[first, last)` contained in the multiset * diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d34c89d98..1b28746ea 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -104,7 +104,9 @@ ConfigureTest(STATIC_MULTISET_TEST static_multiset/custom_count_test.cu static_multiset/find_test.cu static_multiset/insert_test.cu - static_multiset/for_each_test.cu) + static_multiset/for_each_test.cu + static_multiset/retrieve_test.cu + static_multiset/large_input_test.cu) ################################################################################################### # - static_multimap tests ------------------------------------------------------------------------- diff --git a/tests/static_multiset/large_input_test.cu b/tests/static_multiset/large_input_test.cu new file mode 100644 index 000000000..015260676 --- /dev/null +++ b/tests/static_multiset/large_input_test.cu @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include + +#include + +#include +#include + +template +void test_unique_sequence(Set& set, typename Set::value_type* res_begin, std::size_t num_keys) +{ + using Key = typename Set::key_type; + + auto const keys_begin = thrust::counting_iterator(0); + auto const keys_end = keys_begin + num_keys; + + set.insert(keys_begin, keys_end); + REQUIRE(set.size() == num_keys); + + SECTION("All inserted keys can be retrieved.") + { + auto const [_, res_end] = + set.retrieve(keys_begin, keys_end, thrust::make_discard_iterator(), res_begin); + REQUIRE(static_cast(std::distance(res_begin, res_end)) == num_keys); + + thrust::sort(res_begin, res_end); + + REQUIRE(cuco::test::equal(res_begin, res_end, keys_begin, thrust::equal_to{})); + } +} + +TEMPLATE_TEST_CASE_SIG( + "cuco::static_multiset large input test", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2)) +{ + constexpr std::size_t num_keys{1'200'000'000}; + + using extent_type = cuco::extent; + using probe = cuco::double_hashing>; + + try { + auto set = cuco::static_multiset{num_keys * 2, cuco::empty_key{-1}, {}, probe{}}; + + thrust::device_vector d_retrieved(num_keys); + test_unique_sequence(set, d_retrieved.data().get(), num_keys); + } catch (cuco::cuda_error&) { + SKIP("Out of memory"); + } catch (std::bad_alloc&) { + SKIP("Out of memory"); + } +} diff --git a/tests/static_multiset/retrieve_test.cu b/tests/static_multiset/retrieve_test.cu new file mode 100644 index 000000000..300c8dc6c --- /dev/null +++ b/tests/static_multiset/retrieve_test.cu @@ -0,0 +1,163 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +template +void test_multiplicity(Container& container, std::size_t num_keys, std::size_t multiplicity) +{ + using key_type = typename Container::key_type; + auto const empty_key_sentinel = container.empty_key_sentinel(); + + container.clear(); + + auto const num_unique_keys = num_keys / multiplicity; + REQUIRE(num_unique_keys > 0); + auto const num_actual_keys = num_unique_keys * multiplicity; + REQUIRE(num_actual_keys <= num_keys); + + thrust::device_vector input_keys(num_actual_keys); + thrust::device_vector probed_keys(num_actual_keys); + thrust::device_vector matched_keys(num_actual_keys); + + thrust::transform(thrust::counting_iterator(0), + thrust::counting_iterator(num_actual_keys), + input_keys.begin(), + cuda::proclaim_return_type([multiplicity] __device__(auto const& i) { + return static_cast(i / multiplicity); + })); + thrust::shuffle(input_keys.begin(), input_keys.end(), thrust::default_random_engine{}); + + container.insert(input_keys.begin(), input_keys.end()); + REQUIRE(container.size() == num_actual_keys); + + SECTION("All inserted keys should be contained.") + { + auto const [probed_end, matched_end] = container.retrieve( + input_keys.begin(), input_keys.end(), probed_keys.begin(), matched_keys.begin()); + thrust::sort(input_keys.begin(), input_keys.end()); + thrust::sort(probed_keys.begin(), probed_end); + thrust::sort(matched_keys.begin(), matched_end); + REQUIRE(cuco::test::equal( + probed_keys.begin(), probed_keys.end(), input_keys.begin(), thrust::equal_to{})); + REQUIRE(cuco::test::equal( + matched_keys.begin(), matched_keys.end(), input_keys.begin(), thrust::equal_to{})); + } +} + +template +void test_outer(Container& container, std::size_t num_keys) +{ + using key_type = typename Container::key_type; + auto const empty_key_sentinel = container.empty_key_sentinel(); + + container.clear(); + + thrust::device_vector insert_keys(num_keys); + thrust::sequence(insert_keys.begin(), insert_keys.end(), 0); + thrust::device_vector query_keys(num_keys * 2ull); + thrust::sequence(query_keys.begin(), query_keys.end(), 0); + + thrust::device_vector probed_keys(num_keys * 2ull); + thrust::device_vector matched_keys(num_keys * 2ull); + + SECTION("Non-inserted keys should output sentinels.") + { + auto const [probed_end, matched_end] = container.retrieve_outer( + query_keys.begin(), query_keys.end(), probed_keys.begin(), matched_keys.begin()); + REQUIRE(static_cast(std::distance(probed_keys.begin(), probed_end)) == + num_keys * 2ull); + REQUIRE(static_cast(std::distance(matched_keys.begin(), matched_end)) == + num_keys * 2ull); + REQUIRE(cuco::test::all_of( + matched_keys.begin(), + matched_keys.end(), + cuda::proclaim_return_type([empty_key_sentinel] __device__(auto const& k) { + return static_cast(k == static_cast(empty_key_sentinel)); + }))); + } + + container.insert(insert_keys.begin(), insert_keys.end()); + + SECTION("All inserted keys should be contained.") + { + auto const [probed_end, matched_end] = container.retrieve_outer( + query_keys.begin(), query_keys.end(), probed_keys.begin(), matched_keys.begin()); + thrust::sort_by_key( + probed_keys.begin(), probed_end, matched_keys.begin(), thrust::less()); + + REQUIRE(cuco::test::equal( + probed_keys.begin(), probed_keys.end(), query_keys.begin(), thrust::equal_to{})); + REQUIRE(cuco::test::equal(matched_keys.begin(), + matched_keys.begin() + num_keys, + insert_keys.begin(), + thrust::equal_to{})); + REQUIRE(cuco::test::all_of( + matched_keys.begin() + num_keys, + matched_keys.end(), + cuda::proclaim_return_type([empty_key_sentinel] __device__(auto const& k) { + return static_cast(k == static_cast(empty_key_sentinel)); + }))); + } +} + +TEMPLATE_TEST_CASE_SIG( + "static_multiset retrieve tests", + "", + ((typename Key, cuco::test::probe_sequence Probe, int CGSize), Key, Probe, CGSize), + (int32_t, cuco::test::probe_sequence::double_hashing, 1), + (int32_t, cuco::test::probe_sequence::double_hashing, 2), + (int64_t, cuco::test::probe_sequence::double_hashing, 1), + (int64_t, cuco::test::probe_sequence::double_hashing, 2), + (int32_t, cuco::test::probe_sequence::linear_probing, 1), + (int32_t, cuco::test::probe_sequence::linear_probing, 2), + (int64_t, cuco::test::probe_sequence::linear_probing, 1), + (int64_t, cuco::test::probe_sequence::linear_probing, 2)) +{ + constexpr std::size_t num_keys{400}; + constexpr double desired_load_factor = 0.5; + constexpr auto empty_key_sentinel = std::numeric_limits::max(); + + using probe = std::conditional_t>, + cuco::double_hashing>>; + + auto set = cuco::static_multiset{ + num_keys, desired_load_factor, cuco::empty_key{empty_key_sentinel}, {}, probe{}}; + + test_multiplicity(set, num_keys, 1); // unique sequence + test_multiplicity(set, num_keys, 2); // each key occurs twice + test_multiplicity(set, num_keys, 11); + test_outer(set, num_keys); +} \ No newline at end of file