diff --git a/README.md b/README.md index 6b848f8d2..c11eb5c7d 100644 --- a/README.md +++ b/README.md @@ -234,12 +234,12 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection #### Examples: - [Host-bulk APIs (TODO)]() -### `distinct_count_estimator` +### `hyperloglog` -`cuco::distinct_count_estimator` implements the well-established [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream. +`cuco::hyperloglog` implements the well-established [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream. #### Examples: -- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/sMfofM6qd)) -- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/156T9ox7h)) +- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/hyperloglog/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/G4qdcTezE)) +- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/hyperloglog/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/n88713o4n)) diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt index 1eb09c0b4..56205692e 100644 --- a/benchmarks/CMakeLists.txt +++ b/benchmarks/CMakeLists.txt @@ -95,6 +95,6 @@ ConfigureBench(HASH_FUNCTION_BENCH hash_function/hash_function_bench.cu) ################################################################################################### -# - distinct_count_estimator benchmarks ----------------------------------------------------------- -ConfigureBench(DISTINCT_COUNT_ESTIMATOR_BENCH - distinct_count_estimator/distinct_count_estimator_bench.cu) +# - hyperloglog benchmarks ----------------------------------------------------------- +ConfigureBench(HYPERLOGLOG_BENCH + hyperloglog/hyperloglog_bench.cu) diff --git a/benchmarks/distinct_count_estimator/distinct_count_estimator_bench.cu b/benchmarks/hyperloglog/hyperloglog_bench.cu similarity index 88% rename from benchmarks/distinct_count_estimator/distinct_count_estimator_bench.cu rename to benchmarks/hyperloglog/hyperloglog_bench.cu index 8012a4ae6..901f484cb 100644 --- a/benchmarks/distinct_count_estimator/distinct_count_estimator_bench.cu +++ b/benchmarks/hyperloglog/hyperloglog_bench.cu @@ -17,7 +17,7 @@ #include #include -#include +#include #include #include @@ -74,12 +74,12 @@ template } /** - * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance + * @brief A benchmark evaluating `cuco::hyperloglog` end-to-end performance */ template -void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list) +void hyperloglog_e2e(nvbench::state& state, nvbench::type_list) { - using estimator_type = cuco::distinct_count_estimator; + using estimator_type = cuco::hyperloglog; auto const num_items = state.get_int64("NumInputs"); auto const sketch_size_kb = state.get_int64("SketchSizeKB"); @@ -114,12 +114,12 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list -void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list) +void hyperloglog_add(nvbench::state& state, nvbench::type_list) { - using estimator_type = cuco::distinct_count_estimator; + using estimator_type = cuco::hyperloglog; auto const num_items = state.get_int64("NumInputs"); auto const sketch_size_kb = state.get_int64("SketchSizeKB"); @@ -144,18 +144,18 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list; -NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e, +NVBENCH_BENCH_TYPES(hyperloglog_e2e, NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list)) - .set_name("distinct_count_estimator_e2e_uniform") + .set_name("hyperloglog_e2e_uniform") .set_type_axes_names({"T", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256}) // 256KB uses gmem fallback kernel .add_int64_axis("Multiplicity", {1}) .set_max_noise(defaults::MAX_NOISE); -NVBENCH_BENCH_TYPES(distinct_count_estimator_add, +NVBENCH_BENCH_TYPES(hyperloglog_add, NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list)) - .set_name("distinct_count_estimator_add_uniform") + .set_name("hyperloglog_add_uniform") .set_type_axes_names({"T", "Distribution"}) .add_int64_power_of_two_axis("NumInputs", {28, 29, 30}) .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256}) diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 3dad563e0..c8975f604 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -43,5 +43,5 @@ ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/sta ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu") ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu") ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu") -ConfigureExample(DISTINCT_COUNT_ESTIMATOR_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/host_bulk_example.cu") -ConfigureExample(DISTINCT_COUNT_ESTIMATOR_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/device_ref_example.cu") +ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu") +ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu") diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/hyperloglog/device_ref_example.cu similarity index 96% rename from examples/distinct_count_estimator/device_ref_example.cu rename to examples/hyperloglog/device_ref_example.cu index d9a7078a0..476eed608 100644 --- a/examples/distinct_count_estimator/device_ref_example.cu +++ b/examples/hyperloglog/device_ref_example.cu @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include #include #include @@ -23,9 +23,9 @@ /** * @file device_ref_example.cu - * @brief Demonstrates usage of `cuco::distinct_count_estimator` device-side APIs. + * @brief Demonstrates usage of `cuco::hyperloglog` device-side APIs. * - * This example demonstrates how the non-owning reference type `cuco::distinct_count_estimator_ref` + * This example demonstrates how the non-owning reference type `cuco::hyperloglog_ref` * can be used to implement a custom kernel that fuses the cardinality estimation step with any * other workload that traverses the input data. */ @@ -119,7 +119,7 @@ __global__ void device_estimate_kernel(cuco::sketch_size_kb sketch_size_kb, int main(void) { using T = int; - using estimator_type = cuco::distinct_count_estimator; + using estimator_type = cuco::hyperloglog; constexpr std::size_t num_items = 1ull << 28; // 1GB auto const sketch_size_kb = 32_KB; diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/hyperloglog/host_bulk_example.cu similarity index 91% rename from examples/distinct_count_estimator/host_bulk_example.cu rename to examples/hyperloglog/host_bulk_example.cu index 0cd535e8b..545c72d38 100644 --- a/examples/distinct_count_estimator/host_bulk_example.cu +++ b/examples/hyperloglog/host_bulk_example.cu @@ -13,7 +13,7 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include +#include #include #include @@ -24,7 +24,7 @@ /** * @file host_bulk_example.cu - * @brief Demonstrates usage of `cuco::distinct_count_estimator` "bulk" host APIs. + * @brief Demonstrates usage of `cuco::hyperloglog` "bulk" host APIs. */ int main(void) { @@ -41,7 +41,7 @@ int main(void) auto const sd = cuco::standard_deviation{0.0122197}; // Initialize the estimator - cuco::distinct_count_estimator estimator{sd}; + cuco::hyperloglog estimator{sd}; // Add all items to the estimator estimator.add(items.begin(), items.end()); diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl deleted file mode 100644 index b3ee95891..000000000 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -namespace cuco { - -template -constexpr distinct_count_estimator::distinct_count_estimator( - cuco::sketch_size_kb sketch_size_kb, - Hash const& hash, - Allocator const& alloc, - cuda::stream_ref stream) - : impl_{std::make_unique(sketch_size_kb, hash, alloc, stream)} -{ -} - -template -constexpr distinct_count_estimator::distinct_count_estimator( - cuco::standard_deviation standard_deviation, - Hash const& hash, - Allocator const& alloc, - cuda::stream_ref stream) - : impl_{std::make_unique(standard_deviation, hash, alloc, stream)} -{ -} - -template -constexpr void distinct_count_estimator::clear_async( - cuda::stream_ref stream) noexcept -{ - this->impl_->clear_async(stream); -} - -template -constexpr void distinct_count_estimator::clear(cuda::stream_ref stream) -{ - this->impl_->clear(stream); -} - -template -template -constexpr void distinct_count_estimator::add_async( - InputIt first, InputIt last, cuda::stream_ref stream) -{ - this->impl_->add_async(first, last, stream); -} - -template -template -constexpr void distinct_count_estimator::add(InputIt first, - InputIt last, - cuda::stream_ref stream) -{ - this->impl_->add(first, last, stream); -} - -template -template -constexpr void distinct_count_estimator::merge_async( - distinct_count_estimator const& other, - cuda::stream_ref stream) -{ - this->impl_->merge_async(*(other.impl_), stream); -} - -template -template -constexpr void distinct_count_estimator::merge( - distinct_count_estimator const& other, - cuda::stream_ref stream) -{ - this->impl_->merge(*(other.impl_), stream); -} - -template -template -constexpr void distinct_count_estimator::merge_async( - ref_type const& other_ref, cuda::stream_ref stream) -{ - this->impl_->merge_async(other_ref.impl_, stream); -} - -template -template -constexpr void distinct_count_estimator::merge( - ref_type const& other_ref, cuda::stream_ref stream) -{ - this->impl_->merge(other_ref.impl_, stream); -} - -template -constexpr std::size_t distinct_count_estimator::estimate( - cuda::stream_ref stream) const -{ - return this->impl_->estimate(stream); -} - -template -constexpr typename distinct_count_estimator::ref_type<> -distinct_count_estimator::ref() const noexcept -{ - return {this->sketch(), this->hash_function()}; -} - -template -constexpr auto distinct_count_estimator::hash_function() const noexcept -{ - return this->impl_->hash_function(); -} - -template -constexpr cuda::std::span -distinct_count_estimator::sketch() const noexcept -{ - return this->impl_->sketch(); -} - -template -constexpr size_t distinct_count_estimator::sketch_bytes() const noexcept -{ - return this->impl_->sketch_bytes(); -} - -template -constexpr size_t distinct_count_estimator::sketch_bytes( - cuco::sketch_size_kb sketch_size_kb) noexcept -{ - return impl_type::sketch_bytes(sketch_size_kb); -} - -template -constexpr size_t distinct_count_estimator::sketch_bytes( - cuco::standard_deviation standard_deviation) noexcept -{ - return impl_type::sketch_bytes(standard_deviation); -} - -template -constexpr size_t distinct_count_estimator::sketch_alignment() noexcept -{ - return impl_type::sketch_alignment(); -} - -} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh deleted file mode 100644 index 3ea977af0..000000000 --- a/include/cuco/detail/hyperloglog/hyperloglog.cuh +++ /dev/null @@ -1,356 +0,0 @@ -/* - * Copyright (c) 2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -namespace cuco::detail { -/** - * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset. - * - * @note This class implements the HyperLogLog/HyperLogLog++ algorithm: - * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf. - * - * @tparam T Type of items to count - * @tparam Scope The scope in which operations will be performed by individual threads - * @tparam Hash Hash function used to hash items - * @tparam Allocator Type of allocator used for device storage - */ -template -class hyperloglog { - public: - static constexpr auto thread_scope = Scope; ///< CUDA thread scope - - template - using ref_type = hyperloglog_ref; ///< Non-owning reference - ///< type - - using value_type = typename ref_type<>::value_type; ///< Type of items to count - using hasher = typename ref_type<>::hasher; ///< Hash function type - using register_type = typename ref_type<>::register_type; ///< HLL register type - using allocator_type = - typename std::allocator_traits::template rebind_alloc; ///< Allocator - ///< type - - private: - /** - * @brief Constructs a `hyperloglog` host object. - * - * @note This function synchronizes the given stream. - * - * @param sketch_size_b Sketch size in bytes - * @param hash The hash function used to hash items - * @param alloc Allocator used for allocating device storage - * @param stream CUDA stream used to initialize the object - */ - constexpr hyperloglog(std::size_t sketch_size_b, - Hash const& hash, - Allocator const& alloc, - cuda::stream_ref stream) - : allocator_{alloc}, - sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)), - custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}}, - ref_{cuda::std::span{reinterpret_cast(this->sketch_.get()), sketch_size_b}, - hash} - { - this->ref_.clear_async(stream); - } - - public: - /** - * @brief Constructs a `hyperloglog` host object. - * - * @note This function synchronizes the given stream. - * - * @param sketch_size_kb Maximum sketch size in KB - * @param hash The hash function used to hash items - * @param alloc Allocator used for allocating device storage - * @param stream CUDA stream used to initialize the object - */ - constexpr hyperloglog(cuco::sketch_size_kb sketch_size_kb, - Hash const& hash, - Allocator const& alloc, - cuda::stream_ref stream) - : hyperloglog{sketch_bytes(sketch_size_kb), hash, alloc, stream} - { - } - - /** - * @brief Constructs a `hyperloglog` host object. - * - * @note This function synchronizes the given stream. - * - * @param standard_deviation Desired standard deviation for the approximation error - * @param hash The hash function used to hash items - * @param alloc Allocator used for allocating device storage - * @param stream CUDA stream used to initialize the object - */ - constexpr hyperloglog(cuco::standard_deviation standard_deviation, - Hash const& hash, - Allocator const& alloc, - cuda::stream_ref stream) - : hyperloglog{sketch_bytes(standard_deviation), hash, alloc, stream} - { - } - - ~hyperloglog() = default; - - hyperloglog(hyperloglog const&) = delete; - hyperloglog& operator=(hyperloglog const&) = delete; - hyperloglog(hyperloglog&&) = default; ///< Move constructor - - /** - * @brief Copy-assignment operator. - * - * @return Copy of `*this` - */ - hyperloglog& operator=(hyperloglog&&) = default; - - /** - * @brief Asynchronously resets the estimator, i.e., clears the current count estimate. - * - * @param stream CUDA stream this operation is executed in - */ - constexpr void clear_async(cuda::stream_ref stream) noexcept { this->ref_.clear_async(stream); } - - /** - * @brief Resets the estimator, i.e., clears the current count estimate. - * - * @note This function synchronizes the given stream. For asynchronous execution use - * `clear_async`. - * - * @param stream CUDA stream this operation is executed in - */ - constexpr void clear(cuda::stream_ref stream) { this->ref_.clear(stream); } - - /** - * @brief Asynchronously adds to be counted items to the estimator. - * - * @tparam InputIt Device accessible random access input iterator where - * std::is_convertible::value_type, - * T> is `true` - * - * @param first Beginning of the sequence of items - * @param last End of the sequence of items - * @param stream CUDA stream this operation is executed in - */ - template - constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream) - { - this->ref_.add_async(first, last, stream); - } - - /** - * @brief Adds to be counted items to the estimator. - * - * @note This function synchronizes the given stream. For asynchronous execution use - * `add_async`. - * - * @tparam InputIt Device accessible random access input iterator where - * std::is_convertible::value_type, - * T> is `true` - * - * @param first Beginning of the sequence of items - * @param last End of the sequence of items - * @param stream CUDA stream this operation is executed in - */ - template - constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream) - { - this->ref_.add(first, last, stream); - } - - /** - * @brief Asynchronously merges the result of `other` estimator into `*this` estimator. - * - * @throw If this->sketch_bytes() != other.sketch_bytes() - * - * @tparam OtherScope Thread scope of `other` estimator - * @tparam OtherAllocator Allocator type of `other` estimator - * - * @param other Other estimator to be merged into `*this` - * @param stream CUDA stream this operation is executed in - */ - template - constexpr void merge_async(hyperloglog const& other, - cuda::stream_ref stream) - { - this->ref_.merge_async(other.ref(), stream); - } - - /** - * @brief Merges the result of `other` estimator into `*this` estimator. - * - * @note This function synchronizes the given stream. For asynchronous execution use - * `merge_async`. - * - * @throw If this->sketch_bytes() != other.sketch_bytes() - * - * @tparam OtherScope Thread scope of `other` estimator - * @tparam OtherAllocator Allocator type of `other` estimator - * - * @param other Other estimator to be merged into `*this` - * @param stream CUDA stream this operation is executed in - */ - template - constexpr void merge(hyperloglog const& other, - cuda::stream_ref stream) - { - this->ref_.merge(other.ref(), stream); - } - - /** - * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. - * - * @throw If this->sketch_bytes() != other.sketch_bytes() - * - * @tparam OtherScope Thread scope of `other` estimator - * - * @param other_ref Other estimator reference to be merged into `*this` - * @param stream CUDA stream this operation is executed in - */ - template - constexpr void merge_async(ref_type const& other_ref, cuda::stream_ref stream) - { - this->ref_.merge_async(other_ref, stream); - } - - /** - * @brief Merges the result of `other` estimator reference into `*this` estimator. - * - * @note This function synchronizes the given stream. For asynchronous execution use - * `merge_async`. - * - * @throw If this->sketch_bytes() != other.sketch_bytes() - * - * @tparam OtherScope Thread scope of `other` estimator - * - * @param other_ref Other estimator reference to be merged into `*this` - * @param stream CUDA stream this operation is executed in - */ - template - constexpr void merge(ref_type const& other_ref, cuda::stream_ref stream) - { - this->ref_.merge(other_ref, stream); - } - - /** - * @brief Compute the estimated distinct items count. - * - * @note This function synchronizes the given stream. - * - * @param stream CUDA stream this operation is executed in - * - * @return Approximate distinct items count - */ - [[nodiscard]] constexpr std::size_t estimate(cuda::stream_ref stream) const - { - return this->ref_.estimate(stream); - } - - /** - * @brief Get device ref. - * - * @return Device ref object of the current `distinct_count_estimator` host object - */ - [[nodiscard]] constexpr ref_type<> ref() const noexcept { return this->ref_; } - - /** - * @brief Get hash function. - * - * @return The hash function - */ - [[nodiscard]] constexpr auto hash_function() const noexcept { return this->ref_.hash_function(); } - - /** - * @brief Gets the span of the sketch. - * - * @return The cuda::std::span of the sketch - */ - [[nodiscard]] constexpr cuda::std::span sketch() const noexcept - { - return this->ref_.sketch(); - } - - /** - * @brief Gets the number of bytes required for the sketch storage. - * - * @return The number of bytes required for the sketch - */ - [[nodiscard]] constexpr std::size_t sketch_bytes() const noexcept - { - return this->ref_.sketch_bytes(); - } - - /** - * @brief Gets the number of bytes required for the sketch storage. - * - * @param sketch_size_kb Upper bound sketch size in KB - * - * @return The number of bytes required for the sketch - */ - [[nodiscard]] static constexpr std::size_t sketch_bytes( - cuco::sketch_size_kb sketch_size_kb) noexcept - { - return ref_type<>::sketch_bytes(sketch_size_kb); - } - - /** - * @brief Gets the number of bytes required for the sketch storage. - * - * @param standard_deviation Upper bound standard deviation for approximation error - * - * @return The number of bytes required for the sketch - */ - [[nodiscard]] static constexpr std::size_t sketch_bytes( - cuco::standard_deviation standard_deviation) noexcept - { - return ref_type<>::sketch_bytes(standard_deviation); - } - - /** - * @brief Gets the alignment required for the sketch storage. - * - * @return The required alignment - */ - [[nodiscard]] static constexpr std::size_t sketch_alignment() noexcept - { - return ref_type<>::sketch_alignment(); - } - - private: - allocator_type allocator_; ///< Storage allocator - std::unique_ptr> - sketch_; ///< Sketch storage - ref_type<> ref_; //< Ref type - - // Needs to be friends with other instantiations of this class template to have access to their - // storage - template - friend class hyperloglog; -}; -} // namespace cuco::detail diff --git a/include/cuco/detail/hyperloglog/hyperloglog.inl b/include/cuco/detail/hyperloglog/hyperloglog.inl new file mode 100644 index 000000000..3b3295ec5 --- /dev/null +++ b/include/cuco/detail/hyperloglog/hyperloglog.inl @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +namespace cuco { + +template +constexpr hyperloglog::hyperloglog(cuco::sketch_size_kb sketch_size_kb, + Hash const& hash, + Allocator const& alloc, + cuda::stream_ref stream) + : allocator_{alloc}, + sketch_{ + allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)), + detail::custom_deleter{sketch_bytes(sketch_size_kb) / sizeof(register_type), allocator_}}, + ref_{cuda::std::span{reinterpret_cast(sketch_.get()), + sketch_bytes(sketch_size_kb)}, + hash} +{ + this->clear_async(stream); +} + +template +constexpr hyperloglog::hyperloglog( + cuco::standard_deviation standard_deviation, + Hash const& hash, + Allocator const& alloc, + cuda::stream_ref stream) + : allocator_{alloc}, + sketch_{ + allocator_.allocate(sketch_bytes(standard_deviation) / sizeof(register_type)), + detail::custom_deleter{sketch_bytes(standard_deviation) / sizeof(register_type), allocator_}}, + ref_{cuda::std::span{reinterpret_cast(sketch_.get()), + sketch_bytes(standard_deviation)}, + hash} +{ + this->clear_async(stream); +} + +template +constexpr void hyperloglog::clear_async(cuda::stream_ref stream) noexcept +{ + ref_.clear_async(stream); +} + +template +constexpr void hyperloglog::clear(cuda::stream_ref stream) +{ + ref_.clear(stream); +} + +template +template +constexpr void hyperloglog::add_async(InputIt first, + InputIt last, + cuda::stream_ref stream) +{ + ref_.add_async(first, last, stream); +} + +template +template +constexpr void hyperloglog::add(InputIt first, + InputIt last, + cuda::stream_ref stream) +{ + ref_.add(first, last, stream); +} + +template +template +constexpr void hyperloglog::merge_async( + hyperloglog const& other, cuda::stream_ref stream) +{ + ref_.merge_async(other.ref_, stream); +} + +template +template +constexpr void hyperloglog::merge( + hyperloglog const& other, cuda::stream_ref stream) +{ + ref_.merge(other.ref_, stream); +} + +template +template +constexpr void hyperloglog::merge_async( + ref_type const& other_ref, cuda::stream_ref stream) +{ + ref_.merge_async(other_ref, stream); +} + +template +template +constexpr void hyperloglog::merge(ref_type const& other_ref, + cuda::stream_ref stream) +{ + ref_.merge(other_ref, stream); +} + +template +constexpr std::size_t hyperloglog::estimate( + cuda::stream_ref stream) const +{ + return ref_.estimate(stream); +} + +template +constexpr typename hyperloglog::ref_type<> +hyperloglog::ref() const noexcept +{ + return {this->sketch(), this->hash_function()}; +} + +template +constexpr auto hyperloglog::hash_function() const noexcept +{ + return ref_.hash_function(); +} + +template +constexpr cuda::std::span hyperloglog::sketch() + const noexcept +{ + return ref_.sketch(); +} + +template +constexpr size_t hyperloglog::sketch_bytes() const noexcept +{ + return ref_.sketch_bytes(); +} + +template +constexpr size_t hyperloglog::sketch_bytes( + cuco::sketch_size_kb sketch_size_kb) noexcept +{ + return ref_type<>::sketch_bytes(sketch_size_kb); +} + +template +constexpr size_t hyperloglog::sketch_bytes( + cuco::standard_deviation standard_deviation) noexcept +{ + return ref_type<>::sketch_bytes(standard_deviation); +} + +template +constexpr size_t hyperloglog::sketch_alignment() noexcept +{ + return ref_type<>::sketch_alignment(); +} + +} // namespace cuco \ No newline at end of file diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh similarity index 91% rename from include/cuco/detail/hyperloglog/hyperloglog_ref.cuh rename to include/cuco/detail/hyperloglog/hyperloglog_impl.cuh index 5a656e325..6160ebf02 100644 --- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh +++ b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh @@ -26,6 +26,7 @@ #include #include +#include // TODO #include once available #include #include #include @@ -36,7 +37,6 @@ #include #include -#include // there is no #include namespace cuco::detail { @@ -52,7 +52,7 @@ namespace cuco::detail { * @tparam Hash Hash function used to hash items */ template -class hyperloglog_ref { +class hyperloglog_impl { // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs using fp_type = double; ///< Floating point type used for reduction using hash_value_type = @@ -65,11 +65,11 @@ class hyperloglog_ref { using register_type = int; ///< HLL register type template - using with_scope = hyperloglog_ref; ///< Ref type with different - ///< thread scope + using with_scope = hyperloglog_impl; ///< Ref type with different + ///< thread scope /** - * @brief Constructs a non-owning `hyperloglog_ref` object. + * @brief Constructs a non-owning `hyperloglog_impl` object. * * @throw If sketch size < 0.0625KB or 64B or standard deviation > 0.2765. Throws if called from * host; UB if called from device. @@ -79,8 +79,8 @@ class hyperloglog_ref { * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ - __host__ __device__ constexpr hyperloglog_ref(cuda::std::span sketch_span, - Hash const& hash) + __host__ __device__ constexpr hyperloglog_impl(cuda::std::span sketch_span, + Hash const& hash) : hash_{hash}, precision_{cuda::std::countr_zero( sketch_bytes(cuco::sketch_size_kb(static_cast(sketch_span.size() / 1024.0))) / @@ -92,11 +92,9 @@ class hyperloglog_ref { #ifndef __CUDA_ARCH__ auto const alignment = 1ull << cuda::std::countr_zero(reinterpret_cast(sketch_span.data())); - CUCO_EXPECTS( - alignment >= sketch_alignment(), "Insufficient sketch alignment", std::runtime_error); + CUCO_EXPECTS(alignment >= sketch_alignment(), "Insufficient sketch alignment"); - CUCO_EXPECTS( - this->precision_ >= 4, "Minimum required sketch size is 0.0625KB or 64B", std::runtime_error); + CUCO_EXPECTS(this->precision_ >= 4, "Minimum required sketch size is 0.0625KB or 64B"); #endif } @@ -192,19 +190,19 @@ class hyperloglog_ref { switch (vector_size) { case 2: kernel = reinterpret_cast( - cuco::hyperloglog_ns::detail::add_shmem_vectorized<2, hyperloglog_ref>); + cuco::hyperloglog_ns::detail::add_shmem_vectorized<2, hyperloglog_impl>); break; case 4: kernel = reinterpret_cast( - cuco::hyperloglog_ns::detail::add_shmem_vectorized<4, hyperloglog_ref>); + cuco::hyperloglog_ns::detail::add_shmem_vectorized<4, hyperloglog_impl>); break; case 8: kernel = reinterpret_cast( - cuco::hyperloglog_ns::detail::add_shmem_vectorized<8, hyperloglog_ref>); + cuco::hyperloglog_ns::detail::add_shmem_vectorized<8, hyperloglog_impl>); break; case 16: kernel = reinterpret_cast( - cuco::hyperloglog_ns::detail::add_shmem_vectorized<16, hyperloglog_ref>); + cuco::hyperloglog_ns::detail::add_shmem_vectorized<16, hyperloglog_impl>); break; }; } @@ -227,7 +225,7 @@ class hyperloglog_ref { } } else { kernel = reinterpret_cast( - cuco::hyperloglog_ns::detail::add_shmem); + cuco::hyperloglog_ns::detail::add_shmem); void* kernel_args[] = {(void*)(&first), (void*)(&num_items), reinterpret_cast(this)}; if (this->try_reserve_shmem(kernel, shmem_bytes)) { CUCO_CUDA_TRY( @@ -239,7 +237,7 @@ class hyperloglog_ref { // Computes sketch directly in global memory. (Fallback path in case there is not enough // shared memory avalable) kernel = reinterpret_cast( - cuco::hyperloglog_ns::detail::add_gmem); + cuco::hyperloglog_ns::detail::add_gmem); CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0)); @@ -283,7 +281,7 @@ class hyperloglog_ref { */ template __device__ constexpr void merge(CG const& group, - hyperloglog_ref const& other) + hyperloglog_impl const& other) { // TODO find a better way to do error handling in device code // if (other.precision_ != this->precision_) { __trap(); } @@ -305,12 +303,11 @@ class hyperloglog_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ constexpr void merge_async(hyperloglog_ref const& other, + __host__ constexpr void merge_async(hyperloglog_impl const& other, cuda::stream_ref stream) { CUCO_EXPECTS(other.precision_ == this->precision_, - "Cannot merge estimators with different sketch sizes", - std::runtime_error); + "Cannot merge estimators with different sketch sizes"); auto constexpr block_size = 1024; cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream.get()>>>(other, *this); } @@ -329,7 +326,7 @@ class hyperloglog_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ constexpr void merge(hyperloglog_ref const& other, + __host__ constexpr void merge(hyperloglog_impl const& other, cuda::stream_ref stream) { this->merge_async(other, stream); @@ -343,12 +340,12 @@ class hyperloglog_ref { * * @return Approximate distinct items count */ - [[nodiscard]] __device__ std::size_t estimate( - cooperative_groups::thread_block const& group) const noexcept + [[nodiscard]] __device__ size_t + estimate(cooperative_groups::thread_block const& group) const noexcept { __shared__ cuda::atomic block_sum; __shared__ cuda::atomic block_zeroes; - __shared__ std::size_t estimate; + __shared__ size_t estimate; if (group.thread_rank() == 0) { new (&block_sum) decltype(block_sum){0}; @@ -405,7 +402,7 @@ class hyperloglog_ref { * * @return Approximate distinct items count */ - [[nodiscard]] __host__ constexpr std::size_t estimate(cuda::stream_ref stream) const + [[nodiscard]] __host__ constexpr size_t estimate(cuda::stream_ref stream) const { auto const num_regs = 1ull << this->precision_; std::vector host_sketch(num_regs); @@ -460,7 +457,7 @@ class hyperloglog_ref { * * @return The number of bytes required for the sketch */ - [[nodiscard]] __host__ __device__ constexpr std::size_t sketch_bytes() const noexcept + [[nodiscard]] __host__ __device__ constexpr size_t sketch_bytes() const noexcept { return (1ull << this->precision_) * sizeof(register_type); } @@ -472,12 +469,12 @@ class hyperloglog_ref { * * @return The number of bytes required for the sketch */ - [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes( + [[nodiscard]] __host__ __device__ static constexpr size_t sketch_bytes( cuco::sketch_size_kb sketch_size_kb) noexcept { // minimum precision is 4 or 64 bytes - return cuda::std::max(static_cast(sizeof(register_type) * 1ull << 4), - cuda::std::bit_floor(static_cast(sketch_size_kb * 1024))); + return cuda::std::max(static_cast(sizeof(register_type) * 1ull << 4), + cuda::std::bit_floor(static_cast(sketch_size_kb * 1024))); } /** @@ -510,7 +507,7 @@ class hyperloglog_ref { * * @return The required alignment */ - [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_alignment() noexcept + [[nodiscard]] __host__ __device__ static constexpr size_t sketch_alignment() noexcept { return alignof(register_type); } @@ -565,6 +562,6 @@ class hyperloglog_ref { cuda::std::span sketch_; ///< HLL sketch storage template - friend class hyperloglog_ref; + friend class hyperloglog_impl; }; } // namespace cuco::detail diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/hyperloglog/hyperloglog_ref.inl similarity index 50% rename from include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl rename to include/cuco/detail/hyperloglog/hyperloglog_ref.inl index bf222986c..096b68bc9 100644 --- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl +++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.inl @@ -17,127 +17,123 @@ namespace cuco { template -__host__ - __device__ constexpr distinct_count_estimator_ref::distinct_count_estimator_ref( - cuda::std::span sketch_span, Hash const& hash) +__host__ __device__ constexpr hyperloglog_ref::hyperloglog_ref( + cuda::std::span sketch_span, Hash const& hash) : impl_{sketch_span, hash} { } template template -__device__ constexpr void distinct_count_estimator_ref::clear( - CG const& group) noexcept +__device__ constexpr void hyperloglog_ref::clear(CG const& group) noexcept { - this->impl_.clear(group); + impl_.clear(group); } template -__host__ constexpr void distinct_count_estimator_ref::clear_async( +__host__ constexpr void hyperloglog_ref::clear_async( cuda::stream_ref stream) noexcept { - this->impl_.clear_async(stream); + impl_.clear_async(stream); } template -__host__ constexpr void distinct_count_estimator_ref::clear(cuda::stream_ref stream) +__host__ constexpr void hyperloglog_ref::clear(cuda::stream_ref stream) { - this->impl_.clear(stream); + impl_.clear(stream); } template -__device__ constexpr void distinct_count_estimator_ref::add(T const& item) noexcept +__device__ constexpr void hyperloglog_ref::add(T const& item) noexcept { - this->impl_.add(item); + impl_.add(item); } template template -__host__ constexpr void distinct_count_estimator_ref::add_async( - InputIt first, InputIt last, cuda::stream_ref stream) +__host__ constexpr void hyperloglog_ref::add_async(InputIt first, + InputIt last, + cuda::stream_ref stream) { - this->impl_.add_async(first, last, stream); + impl_.add_async(first, last, stream); } template template -__host__ constexpr void distinct_count_estimator_ref::add(InputIt first, - InputIt last, - cuda::stream_ref stream) +__host__ constexpr void hyperloglog_ref::add(InputIt first, + InputIt last, + cuda::stream_ref stream) { - this->impl_.add(first, last, stream); + impl_.add(first, last, stream); } template template -__device__ constexpr void distinct_count_estimator_ref::merge( - CG const& group, distinct_count_estimator_ref const& other) +__device__ constexpr void hyperloglog_ref::merge( + CG const& group, hyperloglog_ref const& other) { - this->impl_.merge(group, other.impl_); + impl_.merge(group, other.impl_); } template template -__host__ constexpr void distinct_count_estimator_ref::merge_async( - distinct_count_estimator_ref const& other, cuda::stream_ref stream) +__host__ constexpr void hyperloglog_ref::merge_async( + hyperloglog_ref const& other, cuda::stream_ref stream) { - this->impl_.merge_async(other, stream); + impl_.merge_async(other.impl_, stream); } template template -__host__ constexpr void distinct_count_estimator_ref::merge( - distinct_count_estimator_ref const& other, cuda::stream_ref stream) +__host__ constexpr void hyperloglog_ref::merge( + hyperloglog_ref const& other, cuda::stream_ref stream) { - this->impl_.merge(other, stream); + impl_.merge(other.impl_, stream); } template -__device__ std::size_t distinct_count_estimator_ref::estimate( +__device__ std::size_t hyperloglog_ref::estimate( cooperative_groups::thread_block const& group) const noexcept { - return this->impl_.estimate(group); + return impl_.estimate(group); } template -__host__ constexpr std::size_t distinct_count_estimator_ref::estimate( +__host__ constexpr std::size_t hyperloglog_ref::estimate( cuda::stream_ref stream) const { - return this->impl_.estimate(stream); + return impl_.estimate(stream); } template -__host__ __device__ constexpr auto distinct_count_estimator_ref::hash_function() - const noexcept +__host__ __device__ constexpr auto hyperloglog_ref::hash_function() const noexcept { - return this->impl_.hash_function(); + return impl_.hash_function(); } template __host__ __device__ constexpr cuda::std::span -distinct_count_estimator_ref::sketch() const noexcept +hyperloglog_ref::sketch() const noexcept { - return this->impl_.sketch(); + return impl_.sketch(); } template -__host__ __device__ constexpr std::size_t -distinct_count_estimator_ref::sketch_bytes() const noexcept +__host__ __device__ constexpr std::size_t hyperloglog_ref::sketch_bytes() + const noexcept { - return this->impl_.sketch_bytes(); + return impl_.sketch_bytes(); } template -__host__ __device__ constexpr std::size_t -distinct_count_estimator_ref::sketch_bytes( +__host__ __device__ constexpr std::size_t hyperloglog_ref::sketch_bytes( cuco::sketch_size_kb sketch_size_kb) noexcept { return impl_type::sketch_bytes(sketch_size_kb); } template -__host__ __device__ constexpr std::size_t -distinct_count_estimator_ref::sketch_bytes( +__host__ __device__ constexpr std::size_t hyperloglog_ref::sketch_bytes( cuco::standard_deviation standard_deviation) noexcept { return impl_type::sketch_bytes(standard_deviation); @@ -145,7 +141,7 @@ distinct_count_estimator_ref::sketch_bytes( template __host__ __device__ constexpr std::size_t -distinct_count_estimator_ref::sketch_alignment() noexcept +hyperloglog_ref::sketch_alignment() noexcept { return impl_type::sketch_alignment(); } diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/hyperloglog.cuh similarity index 76% rename from include/cuco/distinct_count_estimator.cuh rename to include/cuco/hyperloglog.cuh index 5d3c7b6aa..ce8d81875 100644 --- a/include/cuco/distinct_count_estimator.cuh +++ b/include/cuco/hyperloglog.cuh @@ -15,9 +15,9 @@ */ #pragma once -#include -#include +#include #include +#include #include #include #include @@ -44,23 +44,24 @@ template , class Allocator = cuco::cuda_allocator> -class distinct_count_estimator { - using impl_type = detail::hyperloglog; - +class hyperloglog { public: - static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope + static constexpr auto thread_scope = Scope; ///< CUDA thread scope template - using ref_type = cuco::distinct_count_estimator_ref; ///< Non-owning reference - ///< type + using ref_type = hyperloglog_ref; ///< Non-owning reference + ///< type - using value_type = typename impl_type::value_type; ///< Type of items to count - using hasher = typename impl_type::hasher; ///< Type of hash function - using allocator_type = typename impl_type::allocator_type; ///< Allocator type + using value_type = typename ref_type<>::value_type; ///< Type of items to count + using hasher = typename ref_type<>::hasher; ///< Hash function type + using register_type = typename ref_type<>::register_type; ///< HLL register type + using allocator_type = + typename std::allocator_traits::template rebind_alloc; ///< Allocator + ///< type // TODO enable CTAD /** - * @brief Constructs a `distinct_count_estimator` host object. + * @brief Constructs a `hyperloglog` host object. * * @note This function synchronizes the given stream. * @@ -69,13 +70,13 @@ class distinct_count_estimator { * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - constexpr distinct_count_estimator(cuco::sketch_size_kb sketch_size_kb = 32_KB, - Hash const& hash = {}, - Allocator const& alloc = {}, - cuda::stream_ref stream = {}); + constexpr hyperloglog(cuco::sketch_size_kb sketch_size_kb = 32_KB, + Hash const& hash = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); /** - * @brief Constructs a `distinct_count_estimator` host object. + * @brief Constructs a `hyperloglog` host object. * * @note This function synchronizes the given stream. * @@ -84,23 +85,23 @@ class distinct_count_estimator { * @param alloc Allocator used for allocating device storage * @param stream CUDA stream used to initialize the object */ - constexpr distinct_count_estimator(cuco::standard_deviation standard_deviation, - Hash const& hash = {}, - Allocator const& alloc = {}, - cuda::stream_ref stream = {}); + constexpr hyperloglog(cuco::standard_deviation standard_deviation, + Hash const& hash = {}, + Allocator const& alloc = {}, + cuda::stream_ref stream = {}); - ~distinct_count_estimator() = default; + ~hyperloglog() = default; - distinct_count_estimator(distinct_count_estimator const&) = delete; - distinct_count_estimator& operator=(distinct_count_estimator const&) = delete; - distinct_count_estimator(distinct_count_estimator&&) = default; ///< Move constructor + hyperloglog(hyperloglog const&) = delete; + hyperloglog& operator=(hyperloglog const&) = delete; + hyperloglog(hyperloglog&&) = default; ///< Move constructor /** * @brief Copy-assignment operator. * * @return Copy of `*this` */ - distinct_count_estimator& operator=(distinct_count_estimator&&) = default; + hyperloglog& operator=(hyperloglog&&) = default; /** * @brief Asynchronously resets the estimator, i.e., clears the current count estimate. @@ -162,9 +163,8 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - constexpr void merge_async( - distinct_count_estimator const& other, - cuda::stream_ref stream = {}); + constexpr void merge_async(hyperloglog const& other, + cuda::stream_ref stream = {}); /** * @brief Merges the result of `other` estimator into `*this` estimator. @@ -181,7 +181,7 @@ class distinct_count_estimator { * @param stream CUDA stream this operation is executed in */ template - constexpr void merge(distinct_count_estimator const& other, + constexpr void merge(hyperloglog const& other, cuda::stream_ref stream = {}); /** @@ -227,7 +227,7 @@ class distinct_count_estimator { /** * @brief Get device ref. * - * @return Device ref object of the current `distinct_count_estimator` host object + * @return Device ref object of the current `hyperloglog` host object */ [[nodiscard]] constexpr ref_type<> ref() const noexcept; @@ -280,8 +280,16 @@ class distinct_count_estimator { [[nodiscard]] static constexpr std::size_t sketch_alignment() noexcept; private: - std::unique_ptr impl_; ///< Implementation object + allocator_type allocator_; ///< Allocator used to allocate device-accessible storage + std::unique_ptr> + sketch_; ///< Storage of the current `hyperloglog` object + ref_type<> ref_; ///< Device ref of the current `hyperloglog` object + + // Needs to be friends with other instantiations of this class template to have access to their + // storage + template + friend class hyperloglog; }; } // namespace cuco -#include \ No newline at end of file +#include \ No newline at end of file diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/hyperloglog_ref.cuh similarity index 86% rename from include/cuco/distinct_count_estimator_ref.cuh rename to include/cuco/hyperloglog_ref.cuh index 799bb46c7..8946fa8c1 100644 --- a/include/cuco/distinct_count_estimator_ref.cuh +++ b/include/cuco/hyperloglog_ref.cuh @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include #include #include @@ -39,21 +39,22 @@ namespace cuco { template > -class distinct_count_estimator_ref { - using impl_type = detail::hyperloglog_ref; +class hyperloglog_ref { + using impl_type = detail::hyperloglog_impl; public: static constexpr auto thread_scope = impl_type::thread_scope; ///< CUDA thread scope - using value_type = typename impl_type::value_type; ///< Type of items to count - using hasher = typename impl_type::hasher; ///< Type of hash function + using value_type = typename impl_type::value_type; ///< Type of items to count + using hasher = typename impl_type::hasher; ///< Type of hash function + using register_type = typename impl_type::register_type; ///< HLL register type template - using with_scope = distinct_count_estimator_ref; ///< Ref type with different - ///< thread scope + using with_scope = hyperloglog_ref; ///< Ref type with different + ///< thread scope /** - * @brief Constructs a non-owning `distinct_count_estimator_ref` object. + * @brief Constructs a non-owning `hyperloglog_ref` object. * * @throw If sketch size < 0.0625KB or 64B or standard deviation > 0.2765. Throws if called from * host; UB if called from device. @@ -63,8 +64,8 @@ class distinct_count_estimator_ref { * @param sketch_span Reference to sketch storage * @param hash The hash function used to hash items */ - __host__ __device__ constexpr distinct_count_estimator_ref( - cuda::std::span sketch_span, Hash const& hash = {}); + __host__ __device__ constexpr hyperloglog_ref(cuda::std::span sketch_span, + Hash const& hash = {}); /** * @brief Resets the estimator, i.e., clears the current count estimate. @@ -144,7 +145,7 @@ class distinct_count_estimator_ref { */ template __device__ constexpr void merge(CG const& group, - distinct_count_estimator_ref const& other); + hyperloglog_ref const& other); /** * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator. @@ -157,8 +158,8 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ constexpr void merge_async( - distinct_count_estimator_ref const& other, cuda::stream_ref stream = {}); + __host__ constexpr void merge_async(hyperloglog_ref const& other, + cuda::stream_ref stream = {}); /** * @brief Merges the result of `other` estimator reference into `*this` estimator. @@ -174,7 +175,7 @@ class distinct_count_estimator_ref { * @param stream CUDA stream this operation is executed in */ template - __host__ constexpr void merge(distinct_count_estimator_ref const& other, + __host__ constexpr void merge(hyperloglog_ref const& other, cuda::stream_ref stream = {}); /** @@ -251,8 +252,8 @@ class distinct_count_estimator_ref { impl_type impl_; ///< Implementation object template - friend class distinct_count_estimator_ref; + friend class hyperloglog_ref; }; } // namespace cuco -#include +#include diff --git a/include/cuco/types.cuh b/include/cuco/types.cuh index eddc289df..f3c78d9df 100644 --- a/include/cuco/types.cuh +++ b/include/cuco/types.cuh @@ -46,7 +46,7 @@ CUCO_DEFINE_TEMPLATE_STRONG_TYPE(erased_key); /** * @brief A strong type wrapper `cuco::sketch_size_kb` for specifying the upper-bound sketch size of - * `cuco::distinct_count_estimator(_ref)` in KB. + * `cuco::hyperloglog(_ref)` in KB. * * @note Values can also be specified as literals, e.g., 64.3_KB. */ @@ -54,7 +54,7 @@ CUCO_DEFINE_STRONG_TYPE(sketch_size_kb, double); /** * @brief A strong type wrapper `cuco::standard_deviation` for specifying the desired standard - * deviation for the cardinality estimate of `cuco::distinct_count_estimator(_ref)`. + * deviation for the cardinality estimate of `cuco::hyperloglog(_ref)`. */ CUCO_DEFINE_STRONG_TYPE(standard_deviation, double); diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index be88c524d..e1e036048 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -130,8 +130,8 @@ ConfigureTest(DYNAMIC_BITSET_TEST dynamic_bitset/size_test.cu) ################################################################################################### -# - distinct_count_estimator ---------------------------------------------------------------------- -ConfigureTest(DISTINCT_COUNT_ESTIMATOR_TEST - distinct_count_estimator/unique_sequence_test.cu - distinct_count_estimator/spark_parity_test.cu - distinct_count_estimator/device_ref_test.cu) +# - hyperloglog ---------------------------------------------------------------------- +ConfigureTest(HYPERLOGLOG_TEST + hyperloglog/unique_sequence_test.cu + hyperloglog/spark_parity_test.cu + hyperloglog/device_ref_test.cu) diff --git a/tests/distinct_count_estimator/device_ref_test.cu b/tests/hyperloglog/device_ref_test.cu similarity index 93% rename from tests/distinct_count_estimator/device_ref_test.cu rename to tests/hyperloglog/device_ref_test.cu index ad40e7ab6..ab5161d97 100644 --- a/tests/distinct_count_estimator/device_ref_test.cu +++ b/tests/hyperloglog/device_ref_test.cu @@ -16,8 +16,8 @@ #include -#include #include +#include #include #include @@ -55,14 +55,14 @@ __global__ void estimate_kernel(cuco::sketch_size_kb sketch_size_kb, } } -TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: device ref", +TEMPLATE_TEST_CASE_SIG("hyperloglog: device ref", "", ((typename T, typename Hash), T, Hash), (int32_t, cuco::xxhash_64), (int64_t, cuco::xxhash_64), (__int128_t, cuco::xxhash_64<__int128_t>)) { - using estimator_type = cuco::distinct_count_estimator; + using estimator_type = cuco::hyperloglog; auto num_items_pow2 = GENERATE(25, 26, 28); auto hll_precision = GENERATE(8, 10, 12, 13); diff --git a/tests/distinct_count_estimator/spark_parity_test.cu b/tests/hyperloglog/spark_parity_test.cu similarity index 86% rename from tests/distinct_count_estimator/spark_parity_test.cu rename to tests/hyperloglog/spark_parity_test.cu index 9004083a4..5abaf159d 100644 --- a/tests/distinct_count_estimator/spark_parity_test.cu +++ b/tests/hyperloglog/spark_parity_test.cu @@ -16,8 +16,8 @@ #include -#include #include +#include #include #include @@ -43,13 +43,12 @@ */ // TODO implement this test once add_if is available -// TEST_CASE("distinct_count_estimator: Spark parity: add nulls", "") +// TEST_CASE("hyperloglog: Spark parity: add nulls", "") -TEST_CASE("distinct_count_estimator: Spark parity: deterministic cardinality estimation", "") +TEST_CASE("hyperloglog: Spark parity: deterministic cardinality estimation", "") { - using T = int; - using estimator_type = - cuco::distinct_count_estimator>; + using T = int; + using estimator_type = cuco::hyperloglog>; constexpr size_t repeats = 10; // This factor determines the error threshold for passing the test @@ -101,13 +100,12 @@ TEST_CASE("distinct_count_estimator: Spark parity: deterministic cardinality est } // the following test is omitted since we refrain from doing randomized unit tests in cuco -// TEST_CASE("distinct_count_estimator: Spark parity: random cardinality estimation", "") +// TEST_CASE("hyperloglog: Spark parity: random cardinality estimation", "") -TEST_CASE("distinct_count_estimator: Spark parity: merging HLL instances", "") +TEST_CASE("hyperloglog: Spark parity: merging HLL instances", "") { - using T = int; - using estimator_type = - cuco::distinct_count_estimator>; + using T = int; + using estimator_type = cuco::hyperloglog>; auto num_items = 1000000; auto standard_deviation = cuco::standard_deviation(0.05); @@ -148,11 +146,11 @@ TEST_CASE("distinct_count_estimator: Spark parity: merging HLL instances", "") The following unit tests fail since xxhash_64 does not deduplicate different bit patterns for NaN values and +-0.0. They are thus counted as distinct items. -TEST_CASE("distinct_count_estimator: Spark parity: add 0.0 and -0.0", "") +TEST_CASE("hyperloglog: Spark parity: add 0.0 and -0.0", "") { using T = double; using estimator_type = - cuco::distinct_count_estimator>; + cuco::hyperloglog>; auto standard_deviation = cuco::standard_deviation(0.05); @@ -164,11 +162,11 @@ TEST_CASE("distinct_count_estimator: Spark parity: add 0.0 and -0.0", "") REQUIRE(estimator.estimate() == 1); } -TEST_CASE("distinct_count_estimator: Spark parity: add NaN", "") +TEST_CASE("hyperloglog: Spark parity: add NaN", "") { using T = double; using estimator_type = - cuco::distinct_count_estimator>; + cuco::hyperloglog>; auto standard_deviation = cuco::standard_deviation(0.05); diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/hyperloglog/unique_sequence_test.cu similarity index 93% rename from tests/distinct_count_estimator/unique_sequence_test.cu rename to tests/hyperloglog/unique_sequence_test.cu index 8883a218b..c9f2bbaa4 100644 --- a/tests/distinct_count_estimator/unique_sequence_test.cu +++ b/tests/hyperloglog/unique_sequence_test.cu @@ -16,8 +16,8 @@ #include -#include #include +#include #include #include @@ -29,7 +29,7 @@ #include #include -TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", +TEMPLATE_TEST_CASE_SIG("hyperloglog: unique sequence", "", ((typename T, typename Hash), T, Hash), (int32_t, cuco::xxhash_64), @@ -56,7 +56,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence", thrust::sequence(items.begin(), items.end(), 0); // Initialize the estimator - cuco::distinct_count_estimator estimator{ + cuco::hyperloglog estimator{ cuco::sketch_size_kb(sketch_size_kb)}; REQUIRE(estimator.estimate() == 0);