diff --git a/README.md b/README.md
index 6b848f8d2..c11eb5c7d 100644
--- a/README.md
+++ b/README.md
@@ -234,12 +234,12 @@ We plan to add many GPU-accelerated, concurrent data structures to `cuCollection
 #### Examples:
 - [Host-bulk APIs (TODO)]()
 
-### `distinct_count_estimator`
+### `hyperloglog`
 
-`cuco::distinct_count_estimator` implements the well-established [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream.
+`cuco::hyperloglog` implements the well-established [HyperLogLog++ algorithm](https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf) for approximating the count of distinct items in a multiset/stream.
 
 #### Examples:
-- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/sMfofM6qd))
-- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/distinct_count_estimator/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/156T9ox7h))
+- [Host-bulk APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/hyperloglog/host_bulk_example.cu) (see [live example in godbolt](https://godbolt.org/z/G4qdcTezE))
+- [Device-ref APIs](https://github.com/NVIDIA/cuCollections/blob/dev/examples/hyperloglog/device_ref_example.cu) (see [live example in godbolt](https://godbolt.org/z/n88713o4n))
 
 
diff --git a/benchmarks/CMakeLists.txt b/benchmarks/CMakeLists.txt
index 1eb09c0b4..56205692e 100644
--- a/benchmarks/CMakeLists.txt
+++ b/benchmarks/CMakeLists.txt
@@ -95,6 +95,6 @@ ConfigureBench(HASH_FUNCTION_BENCH
   hash_function/hash_function_bench.cu)
 
 ###################################################################################################
-# - distinct_count_estimator benchmarks -----------------------------------------------------------
-ConfigureBench(DISTINCT_COUNT_ESTIMATOR_BENCH
-  distinct_count_estimator/distinct_count_estimator_bench.cu)
+# - hyperloglog benchmarks ------------------------------------------------------------------------
+ConfigureBench(HYPERLOGLOG_BENCH
+  hyperloglog/hyperloglog_bench.cu)
diff --git a/benchmarks/distinct_count_estimator/distinct_count_estimator_bench.cu b/benchmarks/hyperloglog/hyperloglog_bench.cu
similarity index 88%
rename from benchmarks/distinct_count_estimator/distinct_count_estimator_bench.cu
rename to benchmarks/hyperloglog/hyperloglog_bench.cu
index 8012a4ae6..901f484cb 100644
--- a/benchmarks/distinct_count_estimator/distinct_count_estimator_bench.cu
+++ b/benchmarks/hyperloglog/hyperloglog_bench.cu
@@ -17,7 +17,7 @@
 #include <benchmark_defaults.hpp>
 #include <benchmark_utils.hpp>
 
-#include <cuco/distinct_count_estimator.cuh>
+#include <cuco/hyperloglog.cuh>
 #include <cuco/static_set.cuh>
 #include <cuco/utility/key_generator.cuh>
 
@@ -74,12 +74,12 @@ template <class Estimator, class Dist>
 }
 
 /**
- * @brief A benchmark evaluating `cuco::distinct_count_estimator` end-to-end performance
+ * @brief A benchmark evaluating `cuco::hyperloglog` end-to-end performance
  */
 template <typename T, typename Dist>
-void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<T, Dist>)
+void hyperloglog_e2e(nvbench::state& state, nvbench::type_list<T, Dist>)
 {
-  using estimator_type = cuco::distinct_count_estimator<T>;
+  using estimator_type = cuco::hyperloglog<T>;
 
   auto const num_items      = state.get_int64("NumInputs");
   auto const sketch_size_kb = state.get_int64("SketchSizeKB");
@@ -114,12 +114,12 @@ void distinct_count_estimator_e2e(nvbench::state& state, nvbench::type_list<T, D
 }
 
 /**
- * @brief A benchmark evaluating `cuco::distinct_count_estimator::add_async` performance
+ * @brief A benchmark evaluating `cuco::hyperloglog::add_async` performance
  */
 template <typename T, typename Dist>
-void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<T, Dist>)
+void hyperloglog_add(nvbench::state& state, nvbench::type_list<T, Dist>)
 {
-  using estimator_type = cuco::distinct_count_estimator<T>;
+  using estimator_type = cuco::hyperloglog<T>;
 
   auto const num_items      = state.get_int64("NumInputs");
   auto const sketch_size_kb = state.get_int64("SketchSizeKB");
@@ -144,18 +144,18 @@ void distinct_count_estimator_add(nvbench::state& state, nvbench::type_list<T, D
 
 using TYPE_RANGE = nvbench::type_list<nvbench::int32_t, nvbench::int64_t, __int128_t>;
 
-NVBENCH_BENCH_TYPES(distinct_count_estimator_e2e,
+NVBENCH_BENCH_TYPES(hyperloglog_e2e,
                     NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list<distribution::uniform>))
-  .set_name("distinct_count_estimator_e2e_uniform")
+  .set_name("hyperloglog_e2e_uniform")
   .set_type_axes_names({"T", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
   .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256})  // 256KB uses gmem fallback kernel
   .add_int64_axis("Multiplicity", {1})
   .set_max_noise(defaults::MAX_NOISE);
 
-NVBENCH_BENCH_TYPES(distinct_count_estimator_add,
+NVBENCH_BENCH_TYPES(hyperloglog_add,
                     NVBENCH_TYPE_AXES(TYPE_RANGE, nvbench::type_list<distribution::uniform>))
-  .set_name("distinct_count_estimator_add_uniform")
+  .set_name("hyperloglog_add_uniform")
   .set_type_axes_names({"T", "Distribution"})
   .add_int64_power_of_two_axis("NumInputs", {28, 29, 30})
   .add_int64_axis("SketchSizeKB", {8, 16, 32, 64, 128, 256})
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 3dad563e0..c8975f604 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -43,5 +43,5 @@ ConfigureExample(STATIC_MAP_DEVICE_SIDE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/sta
 ConfigureExample(STATIC_MAP_CUSTOM_TYPE_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/custom_type_example.cu")
 ConfigureExample(STATIC_MAP_COUNT_BY_KEY_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_map/count_by_key_example.cu")
 ConfigureExample(STATIC_MULTIMAP_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/static_multimap/host_bulk_example.cu")
-ConfigureExample(DISTINCT_COUNT_ESTIMATOR_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/host_bulk_example.cu")
-ConfigureExample(DISTINCT_COUNT_ESTIMATOR_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/distinct_count_estimator/device_ref_example.cu")
+ConfigureExample(HYPERLOGLOG_HOST_BULK_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/host_bulk_example.cu")
+ConfigureExample(HYPERLOGLOG_DEVICE_REF_EXAMPLE "${CMAKE_CURRENT_SOURCE_DIR}/hyperloglog/device_ref_example.cu")
diff --git a/examples/distinct_count_estimator/device_ref_example.cu b/examples/hyperloglog/device_ref_example.cu
similarity index 96%
rename from examples/distinct_count_estimator/device_ref_example.cu
rename to examples/hyperloglog/device_ref_example.cu
index d9a7078a0..476eed608 100644
--- a/examples/distinct_count_estimator/device_ref_example.cu
+++ b/examples/hyperloglog/device_ref_example.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <cuco/distinct_count_estimator.cuh>
+#include <cuco/hyperloglog.cuh>
 
 #include <cuda/std/cstddef>
 #include <thrust/device_vector.h>
@@ -23,9 +23,9 @@
 
 /**
  * @file device_ref_example.cu
- * @brief Demonstrates usage of `cuco::distinct_count_estimator` device-side APIs.
+ * @brief Demonstrates usage of `cuco::hyperloglog` device-side APIs.
  *
- * This example demonstrates how the non-owning reference type `cuco::distinct_count_estimator_ref`
+ * This example demonstrates how the non-owning reference type `cuco::hyperloglog_ref`
  * can be used to implement a custom kernel that fuses the cardinality estimation step with any
  * other workload that traverses the input data.
  */
@@ -119,7 +119,7 @@ __global__ void device_estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
 int main(void)
 {
   using T                         = int;
-  using estimator_type            = cuco::distinct_count_estimator<T>;
+  using estimator_type            = cuco::hyperloglog<T>;
   constexpr std::size_t num_items = 1ull << 28;  // 1GB
   auto const sketch_size_kb       = 32_KB;
 
diff --git a/examples/distinct_count_estimator/host_bulk_example.cu b/examples/hyperloglog/host_bulk_example.cu
similarity index 91%
rename from examples/distinct_count_estimator/host_bulk_example.cu
rename to examples/hyperloglog/host_bulk_example.cu
index 0cd535e8b..545c72d38 100644
--- a/examples/distinct_count_estimator/host_bulk_example.cu
+++ b/examples/hyperloglog/host_bulk_example.cu
@@ -13,7 +13,7 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
-#include <cuco/distinct_count_estimator.cuh>
+#include <cuco/hyperloglog.cuh>
 
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
@@ -24,7 +24,7 @@
 
 /**
  * @file host_bulk_example.cu
- * @brief Demonstrates usage of `cuco::distinct_count_estimator` "bulk" host APIs.
+ * @brief Demonstrates usage of `cuco::hyperloglog` "bulk" host APIs.
  */
 int main(void)
 {
@@ -41,7 +41,7 @@ int main(void)
   auto const sd = cuco::standard_deviation{0.0122197};
 
   // Initialize the estimator
-  cuco::distinct_count_estimator<T> estimator{sd};
+  cuco::hyperloglog<T> estimator{sd};
 
   // Add all items to the estimator
   estimator.add(items.begin(), items.end());
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl b/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
deleted file mode 100644
index b3ee95891..000000000
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator.inl
+++ /dev/null
@@ -1,156 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-namespace cuco {
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_estimator(
-  cuco::sketch_size_kb sketch_size_kb,
-  Hash const& hash,
-  Allocator const& alloc,
-  cuda::stream_ref stream)
-  : impl_{std::make_unique<impl_type>(sketch_size_kb, hash, alloc, stream)}
-{
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr distinct_count_estimator<T, Scope, Hash, Allocator>::distinct_count_estimator(
-  cuco::standard_deviation standard_deviation,
-  Hash const& hash,
-  Allocator const& alloc,
-  cuda::stream_ref stream)
-  : impl_{std::make_unique<impl_type>(standard_deviation, hash, alloc, stream)}
-{
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::clear_async(
-  cuda::stream_ref stream) noexcept
-{
-  this->impl_->clear_async(stream);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::clear(cuda::stream_ref stream)
-{
-  this->impl_->clear(stream);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-template <class InputIt>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::add_async(
-  InputIt first, InputIt last, cuda::stream_ref stream)
-{
-  this->impl_->add_async(first, last, stream);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-template <class InputIt>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::add(InputIt first,
-                                                                        InputIt last,
-                                                                        cuda::stream_ref stream)
-{
-  this->impl_->add(first, last, stream);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-template <cuda::thread_scope OtherScope, class OtherAllocator>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
-  distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-  cuda::stream_ref stream)
-{
-  this->impl_->merge_async(*(other.impl_), stream);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-template <cuda::thread_scope OtherScope, class OtherAllocator>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
-  distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-  cuda::stream_ref stream)
-{
-  this->impl_->merge(*(other.impl_), stream);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-template <cuda::thread_scope OtherScope>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge_async(
-  ref_type<OtherScope> const& other_ref, cuda::stream_ref stream)
-{
-  this->impl_->merge_async(other_ref.impl_, stream);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-template <cuda::thread_scope OtherScope>
-constexpr void distinct_count_estimator<T, Scope, Hash, Allocator>::merge(
-  ref_type<OtherScope> const& other_ref, cuda::stream_ref stream)
-{
-  this->impl_->merge(other_ref.impl_, stream);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr std::size_t distinct_count_estimator<T, Scope, Hash, Allocator>::estimate(
-  cuda::stream_ref stream) const
-{
-  return this->impl_->estimate(stream);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr typename distinct_count_estimator<T, Scope, Hash, Allocator>::ref_type<>
-distinct_count_estimator<T, Scope, Hash, Allocator>::ref() const noexcept
-{
-  return {this->sketch(), this->hash_function()};
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr auto distinct_count_estimator<T, Scope, Hash, Allocator>::hash_function() const noexcept
-{
-  return this->impl_->hash_function();
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr cuda::std::span<cuda::std::byte>
-distinct_count_estimator<T, Scope, Hash, Allocator>::sketch() const noexcept
-{
-  return this->impl_->sketch();
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_bytes() const noexcept
-{
-  return this->impl_->sketch_bytes();
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_bytes(
-  cuco::sketch_size_kb sketch_size_kb) noexcept
-{
-  return impl_type::sketch_bytes(sketch_size_kb);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_bytes(
-  cuco::standard_deviation standard_deviation) noexcept
-{
-  return impl_type::sketch_bytes(standard_deviation);
-}
-
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-constexpr size_t distinct_count_estimator<T, Scope, Hash, Allocator>::sketch_alignment() noexcept
-{
-  return impl_type::sketch_alignment();
-}
-
-}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.cuh b/include/cuco/detail/hyperloglog/hyperloglog.cuh
deleted file mode 100644
index 3ea977af0..000000000
--- a/include/cuco/detail/hyperloglog/hyperloglog.cuh
+++ /dev/null
@@ -1,356 +0,0 @@
-/*
- * Copyright (c) 2024, NVIDIA CORPORATION.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-#pragma once
-
-#include <cuco/detail/error.hpp>
-#include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
-#include <cuco/detail/storage/storage_base.cuh>
-#include <cuco/hash_functions.cuh>
-#include <cuco/types.cuh>
-#include <cuco/utility/cuda_thread_scope.cuh>
-
-#include <cuda/std/cstddef>
-#include <cuda/stream_ref>
-
-#include <iterator>
-#include <memory>
-
-namespace cuco::detail {
-/**
- * @brief A GPU-accelerated utility for approximating the number of distinct items in a multiset.
- *
- * @note This class implements the HyperLogLog/HyperLogLog++ algorithm:
- * https://static.googleusercontent.com/media/research.google.com/de//pubs/archive/40671.pdf.
- *
- * @tparam T Type of items to count
- * @tparam Scope The scope in which operations will be performed by individual threads
- * @tparam Hash Hash function used to hash items
- * @tparam Allocator Type of allocator used for device storage
- */
-template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
-class hyperloglog {
- public:
-  static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
-
-  template <cuda::thread_scope NewScope = thread_scope>
-  using ref_type = hyperloglog_ref<T, NewScope, Hash>;  ///< Non-owning reference
-                                                        ///< type
-
-  using value_type    = typename ref_type<>::value_type;     ///< Type of items to count
-  using hasher        = typename ref_type<>::hasher;         ///< Hash function type
-  using register_type = typename ref_type<>::register_type;  ///< HLL register type
-  using allocator_type =
-    typename std::allocator_traits<Allocator>::template rebind_alloc<register_type>;  ///< Allocator
-                                                                                      ///< type
-
- private:
-  /**
-   * @brief Constructs a `hyperloglog` host object.
-   *
-   * @note This function synchronizes the given stream.
-   *
-   * @param sketch_size_b Sketch size in bytes
-   * @param hash The hash function used to hash items
-   * @param alloc Allocator used for allocating device storage
-   * @param stream CUDA stream used to initialize the object
-   */
-  constexpr hyperloglog(std::size_t sketch_size_b,
-                        Hash const& hash,
-                        Allocator const& alloc,
-                        cuda::stream_ref stream)
-    : allocator_{alloc},
-      sketch_{this->allocator_.allocate(sketch_size_b / sizeof(register_type)),
-              custom_deleter{sketch_size_b / sizeof(register_type), this->allocator_}},
-      ref_{cuda::std::span{reinterpret_cast<cuda::std::byte*>(this->sketch_.get()), sketch_size_b},
-           hash}
-  {
-    this->ref_.clear_async(stream);
-  }
-
- public:
-  /**
-   * @brief Constructs a `hyperloglog` host object.
-   *
-   * @note This function synchronizes the given stream.
-   *
-   * @param sketch_size_kb Maximum sketch size in KB
-   * @param hash The hash function used to hash items
-   * @param alloc Allocator used for allocating device storage
-   * @param stream CUDA stream used to initialize the object
-   */
-  constexpr hyperloglog(cuco::sketch_size_kb sketch_size_kb,
-                        Hash const& hash,
-                        Allocator const& alloc,
-                        cuda::stream_ref stream)
-    : hyperloglog{sketch_bytes(sketch_size_kb), hash, alloc, stream}
-  {
-  }
-
-  /**
-   * @brief Constructs a `hyperloglog` host object.
-   *
-   * @note This function synchronizes the given stream.
-   *
-   * @param standard_deviation Desired standard deviation for the approximation error
-   * @param hash The hash function used to hash items
-   * @param alloc Allocator used for allocating device storage
-   * @param stream CUDA stream used to initialize the object
-   */
-  constexpr hyperloglog(cuco::standard_deviation standard_deviation,
-                        Hash const& hash,
-                        Allocator const& alloc,
-                        cuda::stream_ref stream)
-    : hyperloglog{sketch_bytes(standard_deviation), hash, alloc, stream}
-  {
-  }
-
-  ~hyperloglog() = default;
-
-  hyperloglog(hyperloglog const&)            = delete;
-  hyperloglog& operator=(hyperloglog const&) = delete;
-  hyperloglog(hyperloglog&&)                 = default;  ///< Move constructor
-
-  /**
-   * @brief Copy-assignment operator.
-   *
-   * @return Copy of `*this`
-   */
-  hyperloglog& operator=(hyperloglog&&) = default;
-
-  /**
-   * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
-   *
-   * @param stream CUDA stream this operation is executed in
-   */
-  constexpr void clear_async(cuda::stream_ref stream) noexcept { this->ref_.clear_async(stream); }
-
-  /**
-   * @brief Resets the estimator, i.e., clears the current count estimate.
-   *
-   * @note This function synchronizes the given stream. For asynchronous execution use
-   * `clear_async`.
-   *
-   * @param stream CUDA stream this operation is executed in
-   */
-  constexpr void clear(cuda::stream_ref stream) { this->ref_.clear(stream); }
-
-  /**
-   * @brief Asynchronously adds to be counted items to the estimator.
-   *
-   * @tparam InputIt Device accessible random access input iterator where
-   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
-   * T></tt> is `true`
-   *
-   * @param first Beginning of the sequence of items
-   * @param last End of the sequence of items
-   * @param stream CUDA stream this operation is executed in
-   */
-  template <class InputIt>
-  constexpr void add_async(InputIt first, InputIt last, cuda::stream_ref stream)
-  {
-    this->ref_.add_async(first, last, stream);
-  }
-
-  /**
-   * @brief Adds to be counted items to the estimator.
-   *
-   * @note This function synchronizes the given stream. For asynchronous execution use
-   * `add_async`.
-   *
-   * @tparam InputIt Device accessible random access input iterator where
-   * <tt>std::is_convertible<std::iterator_traits<InputIt>::value_type,
-   * T></tt> is `true`
-   *
-   * @param first Beginning of the sequence of items
-   * @param last End of the sequence of items
-   * @param stream CUDA stream this operation is executed in
-   */
-  template <class InputIt>
-  constexpr void add(InputIt first, InputIt last, cuda::stream_ref stream)
-  {
-    this->ref_.add(first, last, stream);
-  }
-
-  /**
-   * @brief Asynchronously merges the result of `other` estimator into `*this` estimator.
-   *
-   * @throw If this->sketch_bytes() != other.sketch_bytes()
-   *
-   * @tparam OtherScope Thread scope of `other` estimator
-   * @tparam OtherAllocator Allocator type of `other` estimator
-   *
-   * @param other Other estimator to be merged into `*this`
-   * @param stream CUDA stream this operation is executed in
-   */
-  template <cuda::thread_scope OtherScope, class OtherAllocator>
-  constexpr void merge_async(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
-                             cuda::stream_ref stream)
-  {
-    this->ref_.merge_async(other.ref(), stream);
-  }
-
-  /**
-   * @brief Merges the result of `other` estimator into `*this` estimator.
-   *
-   * @note This function synchronizes the given stream. For asynchronous execution use
-   * `merge_async`.
-   *
-   * @throw If this->sketch_bytes() != other.sketch_bytes()
-   *
-   * @tparam OtherScope Thread scope of `other` estimator
-   * @tparam OtherAllocator Allocator type of `other` estimator
-   *
-   * @param other Other estimator to be merged into `*this`
-   * @param stream CUDA stream this operation is executed in
-   */
-  template <cuda::thread_scope OtherScope, class OtherAllocator>
-  constexpr void merge(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
-                       cuda::stream_ref stream)
-  {
-    this->ref_.merge(other.ref(), stream);
-  }
-
-  /**
-   * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
-   *
-   * @throw If this->sketch_bytes() != other.sketch_bytes()
-   *
-   * @tparam OtherScope Thread scope of `other` estimator
-   *
-   * @param other_ref Other estimator reference to be merged into `*this`
-   * @param stream CUDA stream this operation is executed in
-   */
-  template <cuda::thread_scope OtherScope>
-  constexpr void merge_async(ref_type<OtherScope> const& other_ref, cuda::stream_ref stream)
-  {
-    this->ref_.merge_async(other_ref, stream);
-  }
-
-  /**
-   * @brief Merges the result of `other` estimator reference into `*this` estimator.
-   *
-   * @note This function synchronizes the given stream. For asynchronous execution use
-   * `merge_async`.
-   *
-   * @throw If this->sketch_bytes() != other.sketch_bytes()
-   *
-   * @tparam OtherScope Thread scope of `other` estimator
-   *
-   * @param other_ref Other estimator reference to be merged into `*this`
-   * @param stream CUDA stream this operation is executed in
-   */
-  template <cuda::thread_scope OtherScope>
-  constexpr void merge(ref_type<OtherScope> const& other_ref, cuda::stream_ref stream)
-  {
-    this->ref_.merge(other_ref, stream);
-  }
-
-  /**
-   * @brief Compute the estimated distinct items count.
-   *
-   * @note This function synchronizes the given stream.
-   *
-   * @param stream CUDA stream this operation is executed in
-   *
-   * @return Approximate distinct items count
-   */
-  [[nodiscard]] constexpr std::size_t estimate(cuda::stream_ref stream) const
-  {
-    return this->ref_.estimate(stream);
-  }
-
-  /**
-   * @brief Get device ref.
-   *
-   * @return Device ref object of the current `distinct_count_estimator` host object
-   */
-  [[nodiscard]] constexpr ref_type<> ref() const noexcept { return this->ref_; }
-
-  /**
-   * @brief Get hash function.
-   *
-   * @return The hash function
-   */
-  [[nodiscard]] constexpr auto hash_function() const noexcept { return this->ref_.hash_function(); }
-
-  /**
-   * @brief Gets the span of the sketch.
-   *
-   * @return The cuda::std::span of the sketch
-   */
-  [[nodiscard]] constexpr cuda::std::span<cuda::std::byte> sketch() const noexcept
-  {
-    return this->ref_.sketch();
-  }
-
-  /**
-   * @brief Gets the number of bytes required for the sketch storage.
-   *
-   * @return The number of bytes required for the sketch
-   */
-  [[nodiscard]] constexpr std::size_t sketch_bytes() const noexcept
-  {
-    return this->ref_.sketch_bytes();
-  }
-
-  /**
-   * @brief Gets the number of bytes required for the sketch storage.
-   *
-   * @param sketch_size_kb Upper bound sketch size in KB
-   *
-   * @return The number of bytes required for the sketch
-   */
-  [[nodiscard]] static constexpr std::size_t sketch_bytes(
-    cuco::sketch_size_kb sketch_size_kb) noexcept
-  {
-    return ref_type<>::sketch_bytes(sketch_size_kb);
-  }
-
-  /**
-   * @brief Gets the number of bytes required for the sketch storage.
-   *
-   * @param standard_deviation Upper bound standard deviation for approximation error
-   *
-   * @return The number of bytes required for the sketch
-   */
-  [[nodiscard]] static constexpr std::size_t sketch_bytes(
-    cuco::standard_deviation standard_deviation) noexcept
-  {
-    return ref_type<>::sketch_bytes(standard_deviation);
-  }
-
-  /**
-   * @brief Gets the alignment required for the sketch storage.
-   *
-   * @return The required alignment
-   */
-  [[nodiscard]] static constexpr std::size_t sketch_alignment() noexcept
-  {
-    return ref_type<>::sketch_alignment();
-  }
-
- private:
-  allocator_type allocator_;  ///< Storage allocator
-  std::unique_ptr<register_type, custom_deleter<std::size_t, allocator_type>>
-    sketch_;        ///< Sketch storage
-  ref_type<> ref_;  //< Ref type
-
-  // Needs to be friends with other instantiations of this class template to have access to their
-  // storage
-  template <class T_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
-  friend class hyperloglog;
-};
-}  // namespace cuco::detail
diff --git a/include/cuco/detail/hyperloglog/hyperloglog.inl b/include/cuco/detail/hyperloglog/hyperloglog.inl
new file mode 100644
index 000000000..3b3295ec5
--- /dev/null
+++ b/include/cuco/detail/hyperloglog/hyperloglog.inl
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+namespace cuco {
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr hyperloglog<T, Scope, Hash, Allocator>::hyperloglog(cuco::sketch_size_kb sketch_size_kb,
+                                                              Hash const& hash,
+                                                              Allocator const& alloc,
+                                                              cuda::stream_ref stream)
+  : allocator_{alloc},  // stored as `allocator_type`, i.e. rebound to `register_type`
+    sketch_{            // owning storage; allocation size is counted in registers, not bytes
+      allocator_.allocate(sketch_bytes(sketch_size_kb) / sizeof(register_type)),
+      detail::custom_deleter{sketch_bytes(sketch_size_kb) / sizeof(register_type), allocator_}},
+    ref_{cuda::std::span{reinterpret_cast<cuda::std::byte*>(sketch_.get()),
+                         sketch_bytes(sketch_size_kb)},
+         hash}  // ref_ is a byte-span view of the storage owned by sketch_
+{
+  this->clear_async(stream);  // NOTE(review): header doc says the ctor synchronizes; verify
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr hyperloglog<T, Scope, Hash, Allocator>::hyperloglog(
+  cuco::standard_deviation standard_deviation,
+  Hash const& hash,
+  Allocator const& alloc,
+  cuda::stream_ref stream)
+  : allocator_{alloc},  // stored as `allocator_type`, i.e. rebound to `register_type`
+    sketch_{            // owning storage; allocation size is counted in registers, not bytes
+      allocator_.allocate(sketch_bytes(standard_deviation) / sizeof(register_type)),
+      detail::custom_deleter{sketch_bytes(standard_deviation) / sizeof(register_type), allocator_}},
+    ref_{cuda::std::span{reinterpret_cast<cuda::std::byte*>(sketch_.get()),
+                         sketch_bytes(standard_deviation)},
+         hash}  // ref_ is a byte-span view of the storage owned by sketch_
+{
+  this->clear_async(stream);  // NOTE(review): header doc says the ctor synchronizes; verify
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr void hyperloglog<T, Scope, Hash, Allocator>::clear_async(cuda::stream_ref stream) noexcept
+{
+  ref_.clear_async(stream);  // ref_ was built over sketch_, so this targets the owned storage
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr void hyperloglog<T, Scope, Hash, Allocator>::clear(cuda::stream_ref stream)
+{
+  ref_.clear(stream);  // forwarded to the non-owning ref built over sketch_
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class InputIt>
+constexpr void hyperloglog<T, Scope, Hash, Allocator>::add_async(InputIt first,
+                                                                 InputIt last,
+                                                                 cuda::stream_ref stream)
+{
+  ref_.add_async(first, last, stream);  // the range is handed unchanged to the ref over sketch_
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+template <class InputIt>
+constexpr void hyperloglog<T, Scope, Hash, Allocator>::add(InputIt first,
+                                                           InputIt last,
+                                                           cuda::stream_ref stream)
+{
+  ref_.add(first, last, stream);  // the range is handed unchanged to the ref over sketch_
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+template <cuda::thread_scope OtherScope, class OtherAllocator>
+constexpr void hyperloglog<T, Scope, Hash, Allocator>::merge_async(
+  hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other, cuda::stream_ref stream)
+{
+  ref_.merge_async(other.ref_, stream);  // other.ref_ is private; reachable via the friend decl
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+template <cuda::thread_scope OtherScope, class OtherAllocator>
+constexpr void hyperloglog<T, Scope, Hash, Allocator>::merge(
+  hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other, cuda::stream_ref stream)
+{
+  ref_.merge(other.ref_, stream);  // other.ref_ is private; reachable via the friend decl
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+template <cuda::thread_scope OtherScope>
+constexpr void hyperloglog<T, Scope, Hash, Allocator>::merge_async(
+  ref_type<OtherScope> const& other_ref, cuda::stream_ref stream)
+{
+  ref_.merge_async(other_ref, stream);  // `other_ref` may carry a different thread scope
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+template <cuda::thread_scope OtherScope>
+constexpr void hyperloglog<T, Scope, Hash, Allocator>::merge(ref_type<OtherScope> const& other_ref,
+                                                             cuda::stream_ref stream)
+{
+  ref_.merge(other_ref, stream);  // `other_ref` may carry a different thread scope
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr std::size_t hyperloglog<T, Scope, Hash, Allocator>::estimate(
+  cuda::stream_ref stream) const
+{
+  return ref_.estimate(stream);  // uses the ref's host overload that takes a stream
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr auto hyperloglog<T, Scope, Hash, Allocator>::ref() const noexcept -> ref_type<>
+{
+  // Assemble a fresh non-owning ref from the sketch bytes and the hash function.
+  return ref_type<>{sketch(), hash_function()};
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr auto hyperloglog<T, Scope, Hash, Allocator>::hash_function() const noexcept
+{
+  return ref_.hash_function();  // plain `auto` return type: the hasher is handed back by value
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr cuda::std::span<cuda::std::byte> hyperloglog<T, Scope, Hash, Allocator>::sketch()
+  const noexcept
+{
+  return ref_.sketch();  // byte span as reported by the ref that was built over sketch_
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr std::size_t hyperloglog<T, Scope, Hash, Allocator>::sketch_bytes() const noexcept
+{
+  return ref_.sketch_bytes();  // size in bytes as reported by the ref built over sketch_
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr std::size_t hyperloglog<T, Scope, Hash, Allocator>::sketch_bytes(
+  cuco::sketch_size_kb sketch_size_kb) noexcept
+{
+  return ref_type<>::sketch_bytes(sketch_size_kb);  // ref_type<> defaults to this Scope
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr std::size_t hyperloglog<T, Scope, Hash, Allocator>::sketch_bytes(
+  cuco::standard_deviation standard_deviation) noexcept
+{
+  return ref_type<>::sketch_bytes(standard_deviation);  // ref_type<> defaults to this Scope
+}
+
+template <class T, cuda::thread_scope Scope, class Hash, class Allocator>
+constexpr std::size_t hyperloglog<T, Scope, Hash, Allocator>::sketch_alignment() noexcept
+{
+  return ref_type<>::sketch_alignment();  // ref_type<> defaults to this Scope
+}
+
+}  // namespace cuco
\ No newline at end of file
diff --git a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
similarity index 91%
rename from include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
rename to include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
index 5a656e325..6160ebf02 100644
--- a/include/cuco/detail/hyperloglog/hyperloglog_ref.cuh
+++ b/include/cuco/detail/hyperloglog/hyperloglog_impl.cuh
@@ -26,6 +26,7 @@
 #include <cuco/utility/traits.hpp>
 
 #include <cuda/atomic>
+#include <cuda/std/__algorithm/max.h>  // TODO #include <cuda/std/algorithm> once available
 #include <cuda/std/bit>
 #include <cuda/std/cstddef>
 #include <cuda/std/span>
@@ -36,7 +37,6 @@
 #include <cooperative_groups.h>
 #include <cooperative_groups/reduce.h>
 
-#include <algorithm>  // there is no <cuda/std/algorithm>
 #include <vector>
 
 namespace cuco::detail {
@@ -52,7 +52,7 @@ namespace cuco::detail {
  * @tparam Hash Hash function used to hash items
  */
 template <class T, cuda::thread_scope Scope, class Hash>
-class hyperloglog_ref {
+class hyperloglog_impl {
   // We use `int` here since this is the smallest type that supports native `atomicMax` on GPUs
   using fp_type = double;  ///< Floating point type used for reduction
   using hash_value_type =
@@ -65,11 +65,11 @@ class hyperloglog_ref {
   using register_type = int;   ///< HLL register type
 
   template <cuda::thread_scope NewScope>
-  using with_scope = hyperloglog_ref<T, NewScope, Hash>;  ///< Ref type with different
-                                                          ///< thread scope
+  using with_scope = hyperloglog_impl<T, NewScope, Hash>;  ///< Ref type with different
+                                                           ///< thread scope
 
   /**
-   * @brief Constructs a non-owning `hyperloglog_ref` object.
+   * @brief Constructs a non-owning `hyperloglog_impl` object.
    *
    * @throw If sketch size < 0.0625KB or 64B or standard deviation > 0.2765. Throws if called from
    * host; UB if called from device.
@@ -79,8 +79,8 @@ class hyperloglog_ref {
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<cuda::std::byte> sketch_span,
-                                                Hash const& hash)
+  __host__ __device__ constexpr hyperloglog_impl(cuda::std::span<cuda::std::byte> sketch_span,
+                                                 Hash const& hash)
     : hash_{hash},
       precision_{cuda::std::countr_zero(
         sketch_bytes(cuco::sketch_size_kb(static_cast<double>(sketch_span.size() / 1024.0))) /
@@ -92,11 +92,9 @@ class hyperloglog_ref {
 #ifndef __CUDA_ARCH__
     auto const alignment =
       1ull << cuda::std::countr_zero(reinterpret_cast<cuda::std::uintptr_t>(sketch_span.data()));
-    CUCO_EXPECTS(
-      alignment >= sketch_alignment(), "Insufficient sketch alignment", std::runtime_error);
+    CUCO_EXPECTS(alignment >= sketch_alignment(), "Insufficient sketch alignment");
 
-    CUCO_EXPECTS(
-      this->precision_ >= 4, "Minimum required sketch size is 0.0625KB or 64B", std::runtime_error);
+    CUCO_EXPECTS(this->precision_ >= 4, "Minimum required sketch size is 0.0625KB or 64B");
 #endif
   }
 
@@ -192,19 +190,19 @@ class hyperloglog_ref {
       switch (vector_size) {
         case 2:
           kernel = reinterpret_cast<void const*>(
-            cuco::hyperloglog_ns::detail::add_shmem_vectorized<2, hyperloglog_ref>);
+            cuco::hyperloglog_ns::detail::add_shmem_vectorized<2, hyperloglog_impl>);
           break;
         case 4:
           kernel = reinterpret_cast<void const*>(
-            cuco::hyperloglog_ns::detail::add_shmem_vectorized<4, hyperloglog_ref>);
+            cuco::hyperloglog_ns::detail::add_shmem_vectorized<4, hyperloglog_impl>);
           break;
         case 8:
           kernel = reinterpret_cast<void const*>(
-            cuco::hyperloglog_ns::detail::add_shmem_vectorized<8, hyperloglog_ref>);
+            cuco::hyperloglog_ns::detail::add_shmem_vectorized<8, hyperloglog_impl>);
           break;
         case 16:
           kernel = reinterpret_cast<void const*>(
-            cuco::hyperloglog_ns::detail::add_shmem_vectorized<16, hyperloglog_ref>);
+            cuco::hyperloglog_ns::detail::add_shmem_vectorized<16, hyperloglog_impl>);
           break;
       };
     }
@@ -227,7 +225,7 @@ class hyperloglog_ref {
       }
     } else {
       kernel = reinterpret_cast<void const*>(
-        cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_ref>);
+        cuco::hyperloglog_ns::detail::add_shmem<InputIt, hyperloglog_impl>);
       void* kernel_args[] = {(void*)(&first), (void*)(&num_items), reinterpret_cast<void*>(this)};
       if (this->try_reserve_shmem(kernel, shmem_bytes)) {
         CUCO_CUDA_TRY(
@@ -239,7 +237,7 @@ class hyperloglog_ref {
         // Computes sketch directly in global memory. (Fallback path in case there is not enough
         // shared memory avalable)
         kernel = reinterpret_cast<void const*>(
-          cuco::hyperloglog_ns::detail::add_gmem<InputIt, hyperloglog_ref>);
+          cuco::hyperloglog_ns::detail::add_gmem<InputIt, hyperloglog_impl>);
 
         CUCO_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize(&grid_size, &block_size, kernel, 0));
 
@@ -283,7 +281,7 @@ class hyperloglog_ref {
    */
   template <class CG, cuda::thread_scope OtherScope>
   __device__ constexpr void merge(CG const& group,
-                                  hyperloglog_ref<T, OtherScope, Hash> const& other)
+                                  hyperloglog_impl<T, OtherScope, Hash> const& other)
   {
     // TODO find a better way to do error handling in device code
     // if (other.precision_ != this->precision_) { __trap(); }
@@ -305,12 +303,11 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ constexpr void merge_async(hyperloglog_ref<T, OtherScope, Hash> const& other,
+  __host__ constexpr void merge_async(hyperloglog_impl<T, OtherScope, Hash> const& other,
                                       cuda::stream_ref stream)
   {
     CUCO_EXPECTS(other.precision_ == this->precision_,
-                 "Cannot merge estimators with different sketch sizes",
-                 std::runtime_error);
+                 "Cannot merge estimators with different sketch sizes");
     auto constexpr block_size = 1024;
     cuco::hyperloglog_ns::detail::merge<<<1, block_size, 0, stream.get()>>>(other, *this);
   }
@@ -329,7 +326,7 @@ class hyperloglog_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ constexpr void merge(hyperloglog_ref<T, OtherScope, Hash> const& other,
+  __host__ constexpr void merge(hyperloglog_impl<T, OtherScope, Hash> const& other,
                                 cuda::stream_ref stream)
   {
     this->merge_async(other, stream);
@@ -343,12 +340,12 @@ class hyperloglog_ref {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] __device__ std::size_t estimate(
-    cooperative_groups::thread_block const& group) const noexcept
+  [[nodiscard]] __device__ size_t
+  estimate(cooperative_groups::thread_block const& group) const noexcept
   {
     __shared__ cuda::atomic<fp_type, cuda::thread_scope_block> block_sum;
     __shared__ cuda::atomic<int, cuda::thread_scope_block> block_zeroes;
-    __shared__ std::size_t estimate;
+    __shared__ size_t estimate;
 
     if (group.thread_rank() == 0) {
       new (&block_sum) decltype(block_sum){0};
@@ -405,7 +402,7 @@ class hyperloglog_ref {
    *
    * @return Approximate distinct items count
    */
-  [[nodiscard]] __host__ constexpr std::size_t estimate(cuda::stream_ref stream) const
+  [[nodiscard]] __host__ constexpr size_t estimate(cuda::stream_ref stream) const
   {
     auto const num_regs = 1ull << this->precision_;
     std::vector<register_type> host_sketch(num_regs);
@@ -460,7 +457,7 @@ class hyperloglog_ref {
    *
    * @return The number of bytes required for the sketch
    */
-  [[nodiscard]] __host__ __device__ constexpr std::size_t sketch_bytes() const noexcept
+  [[nodiscard]] __host__ __device__ constexpr size_t sketch_bytes() const noexcept
   {
     return (1ull << this->precision_) * sizeof(register_type);
   }
@@ -472,12 +469,12 @@ class hyperloglog_ref {
    *
    * @return The number of bytes required for the sketch
    */
-  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_bytes(
+  [[nodiscard]] __host__ __device__ static constexpr size_t sketch_bytes(
     cuco::sketch_size_kb sketch_size_kb) noexcept
   {
     // minimum precision is 4 or 64 bytes
-    return cuda::std::max(static_cast<std::size_t>(sizeof(register_type) * 1ull << 4),
-                          cuda::std::bit_floor(static_cast<std::size_t>(sketch_size_kb * 1024)));
+    return cuda::std::max(static_cast<size_t>(sizeof(register_type) * 1ull << 4),
+                          cuda::std::bit_floor(static_cast<size_t>(sketch_size_kb * 1024)));
   }
 
   /**
@@ -510,7 +507,7 @@ class hyperloglog_ref {
    *
    * @return The required alignment
    */
-  [[nodiscard]] __host__ __device__ static constexpr std::size_t sketch_alignment() noexcept
+  [[nodiscard]] __host__ __device__ static constexpr size_t sketch_alignment() noexcept
   {
     return alignof(register_type);
   }
@@ -565,6 +562,6 @@ class hyperloglog_ref {
   cuda::std::span<register_type> sketch_;  ///< HLL sketch storage
 
   template <class T_, cuda::thread_scope Scope_, class Hash_>
-  friend class hyperloglog_ref;
+  friend class hyperloglog_impl;
 };
 }  // namespace cuco::detail
diff --git a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl b/include/cuco/detail/hyperloglog/hyperloglog_ref.inl
similarity index 50%
rename from include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
rename to include/cuco/detail/hyperloglog/hyperloglog_ref.inl
index bf222986c..096b68bc9 100644
--- a/include/cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl
+++ b/include/cuco/detail/hyperloglog/hyperloglog_ref.inl
@@ -17,127 +17,123 @@
 namespace cuco {
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__
-  __device__ constexpr distinct_count_estimator_ref<T, Scope, Hash>::distinct_count_estimator_ref(
-    cuda::std::span<cuda::std::byte> sketch_span, Hash const& hash)
+__host__ __device__ constexpr hyperloglog_ref<T, Scope, Hash>::hyperloglog_ref(
+  cuda::std::span<cuda::std::byte> sketch_span, Hash const& hash)
   : impl_{sketch_span, hash}
 {
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG>
-__device__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear(
-  CG const& group) noexcept
+__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(CG const& group) noexcept
 {
-  this->impl_.clear(group);
+  impl_.clear(group);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear_async(
+__host__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear_async(
   cuda::stream_ref stream) noexcept
 {
-  this->impl_.clear_async(stream);
+  impl_.clear_async(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::clear(cuda::stream_ref stream)
+__host__ constexpr void hyperloglog_ref<T, Scope, Hash>::clear(cuda::stream_ref stream)
 {
-  this->impl_.clear(stream);
+  impl_.clear(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__device__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add(T const& item) noexcept
+__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::add(T const& item) noexcept
 {
-  this->impl_.add(item);
+  impl_.add(item);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class InputIt>
-__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add_async(
-  InputIt first, InputIt last, cuda::stream_ref stream)
+__host__ constexpr void hyperloglog_ref<T, Scope, Hash>::add_async(InputIt first,
+                                                                   InputIt last,
+                                                                   cuda::stream_ref stream)
 {
-  this->impl_.add_async(first, last, stream);
+  impl_.add_async(first, last, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class InputIt>
-__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::add(InputIt first,
-                                                                          InputIt last,
-                                                                          cuda::stream_ref stream)
+__host__ constexpr void hyperloglog_ref<T, Scope, Hash>::add(InputIt first,
+                                                             InputIt last,
+                                                             cuda::stream_ref stream)
 {
-  this->impl_.add(first, last, stream);
+  impl_.add(first, last, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <class CG, cuda::thread_scope OtherScope>
-__device__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge(
-  CG const& group, distinct_count_estimator_ref<T, OtherScope, Hash> const& other)
+__device__ constexpr void hyperloglog_ref<T, Scope, Hash>::merge(
+  CG const& group, hyperloglog_ref<T, OtherScope, Hash> const& other)
 {
-  this->impl_.merge(group, other.impl_);
+  impl_.merge(group, other.impl_);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <cuda::thread_scope OtherScope>
-__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge_async(
-  distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuda::stream_ref stream)
+__host__ constexpr void hyperloglog_ref<T, Scope, Hash>::merge_async(
+  hyperloglog_ref<T, OtherScope, Hash> const& other, cuda::stream_ref stream)
 {
-  this->impl_.merge_async(other, stream);
+  impl_.merge_async(other.impl_, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 template <cuda::thread_scope OtherScope>
-__host__ constexpr void distinct_count_estimator_ref<T, Scope, Hash>::merge(
-  distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuda::stream_ref stream)
+__host__ constexpr void hyperloglog_ref<T, Scope, Hash>::merge(
+  hyperloglog_ref<T, OtherScope, Hash> const& other, cuda::stream_ref stream)
 {
-  this->impl_.merge(other, stream);
+  impl_.merge(other.impl_, stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__device__ std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
+__device__ std::size_t hyperloglog_ref<T, Scope, Hash>::estimate(
   cooperative_groups::thread_block const& group) const noexcept
 {
-  return this->impl_.estimate(group);
+  return impl_.estimate(group);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ constexpr std::size_t distinct_count_estimator_ref<T, Scope, Hash>::estimate(
+__host__ constexpr std::size_t hyperloglog_ref<T, Scope, Hash>::estimate(
   cuda::stream_ref stream) const
 {
-  return this->impl_.estimate(stream);
+  return impl_.estimate(stream);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ __device__ constexpr auto distinct_count_estimator_ref<T, Scope, Hash>::hash_function()
-  const noexcept
+__host__ __device__ constexpr auto hyperloglog_ref<T, Scope, Hash>::hash_function() const noexcept
 {
-  return this->impl_.hash_function();
+  return impl_.hash_function();
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
 __host__ __device__ constexpr cuda::std::span<cuda::std::byte>
-distinct_count_estimator_ref<T, Scope, Hash>::sketch() const noexcept
+hyperloglog_ref<T, Scope, Hash>::sketch() const noexcept
 {
-  return this->impl_.sketch();
+  return impl_.sketch();
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ __device__ constexpr std::size_t
-distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes() const noexcept
+__host__ __device__ constexpr std::size_t hyperloglog_ref<T, Scope, Hash>::sketch_bytes()
+  const noexcept
 {
-  return this->impl_.sketch_bytes();
+  return impl_.sketch_bytes();
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ __device__ constexpr std::size_t
-distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes(
+__host__ __device__ constexpr std::size_t hyperloglog_ref<T, Scope, Hash>::sketch_bytes(
   cuco::sketch_size_kb sketch_size_kb) noexcept
 {
   return impl_type::sketch_bytes(sketch_size_kb);
 }
 
 template <class T, cuda::thread_scope Scope, class Hash>
-__host__ __device__ constexpr std::size_t
-distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes(
+__host__ __device__ constexpr std::size_t hyperloglog_ref<T, Scope, Hash>::sketch_bytes(
   cuco::standard_deviation standard_deviation) noexcept
 {
   return impl_type::sketch_bytes(standard_deviation);
@@ -145,7 +141,7 @@ distinct_count_estimator_ref<T, Scope, Hash>::sketch_bytes(
 
 template <class T, cuda::thread_scope Scope, class Hash>
 __host__ __device__ constexpr std::size_t
-distinct_count_estimator_ref<T, Scope, Hash>::sketch_alignment() noexcept
+hyperloglog_ref<T, Scope, Hash>::sketch_alignment() noexcept
 {
   return impl_type::sketch_alignment();
 }
diff --git a/include/cuco/distinct_count_estimator.cuh b/include/cuco/hyperloglog.cuh
similarity index 76%
rename from include/cuco/distinct_count_estimator.cuh
rename to include/cuco/hyperloglog.cuh
index 5d3c7b6aa..ce8d81875 100644
--- a/include/cuco/distinct_count_estimator.cuh
+++ b/include/cuco/hyperloglog.cuh
@@ -15,9 +15,9 @@
  */
 #pragma once
 
-#include <cuco/detail/hyperloglog/hyperloglog.cuh>
-#include <cuco/distinct_count_estimator_ref.cuh>
+#include <cuco/detail/storage/storage_base.cuh>
 #include <cuco/hash_functions.cuh>
+#include <cuco/hyperloglog_ref.cuh>
 #include <cuco/types.cuh>
 #include <cuco/utility/allocator.hpp>
 #include <cuco/utility/cuda_thread_scope.cuh>
@@ -44,23 +44,24 @@ template <class T,
           cuda::thread_scope Scope = cuda::thread_scope_device,
           class Hash               = cuco::xxhash_64<T>,
           class Allocator          = cuco::cuda_allocator<cuda::std::byte>>
-class distinct_count_estimator {
-  using impl_type = detail::hyperloglog<T, Scope, Hash, Allocator>;
-
+class hyperloglog {
  public:
-  static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
+  static constexpr auto thread_scope = Scope;  ///< CUDA thread scope
 
   template <cuda::thread_scope NewScope = thread_scope>
-  using ref_type = cuco::distinct_count_estimator_ref<T, NewScope, Hash>;  ///< Non-owning reference
-                                                                           ///< type
+  using ref_type = hyperloglog_ref<T, NewScope, Hash>;  ///< Non-owning reference
+                                                        ///< type
 
-  using value_type     = typename impl_type::value_type;      ///< Type of items to count
-  using hasher         = typename impl_type::hasher;          ///< Type of hash function
-  using allocator_type = typename impl_type::allocator_type;  ///< Allocator type
+  using value_type    = typename ref_type<>::value_type;     ///< Type of items to count
+  using hasher        = typename ref_type<>::hasher;         ///< Hash function type
+  using register_type = typename ref_type<>::register_type;  ///< HLL register type
+  using allocator_type =
+    typename std::allocator_traits<Allocator>::template rebind_alloc<register_type>;  ///< Allocator
+                                                                                      ///< type
 
   // TODO enable CTAD
   /**
-   * @brief Constructs a `distinct_count_estimator` host object.
+   * @brief Constructs a `hyperloglog` host object.
    *
    * @note This function synchronizes the given stream.
    *
@@ -69,13 +70,13 @@ class distinct_count_estimator {
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  constexpr distinct_count_estimator(cuco::sketch_size_kb sketch_size_kb = 32_KB,
-                                     Hash const& hash                    = {},
-                                     Allocator const& alloc              = {},
-                                     cuda::stream_ref stream             = {});
+  constexpr hyperloglog(cuco::sketch_size_kb sketch_size_kb = 32_KB,
+                        Hash const& hash                    = {},
+                        Allocator const& alloc              = {},
+                        cuda::stream_ref stream             = {});
 
   /**
-   * @brief Constructs a `distinct_count_estimator` host object.
+   * @brief Constructs a `hyperloglog` host object.
    *
    * @note This function synchronizes the given stream.
    *
@@ -84,23 +85,23 @@ class distinct_count_estimator {
    * @param alloc Allocator used for allocating device storage
    * @param stream CUDA stream used to initialize the object
    */
-  constexpr distinct_count_estimator(cuco::standard_deviation standard_deviation,
-                                     Hash const& hash        = {},
-                                     Allocator const& alloc  = {},
-                                     cuda::stream_ref stream = {});
+  constexpr hyperloglog(cuco::standard_deviation standard_deviation,
+                        Hash const& hash        = {},
+                        Allocator const& alloc  = {},
+                        cuda::stream_ref stream = {});
 
-  ~distinct_count_estimator() = default;
+  ~hyperloglog() = default;
 
-  distinct_count_estimator(distinct_count_estimator const&)            = delete;
-  distinct_count_estimator& operator=(distinct_count_estimator const&) = delete;
-  distinct_count_estimator(distinct_count_estimator&&) = default;  ///< Move constructor
+  hyperloglog(hyperloglog const&)            = delete;
+  hyperloglog& operator=(hyperloglog const&) = delete;
+  hyperloglog(hyperloglog&&)                 = default;  ///< Move constructor
 
   /**
    * @brief Copy-assignment operator.
    *
    * @return Copy of `*this`
    */
-  distinct_count_estimator& operator=(distinct_count_estimator&&) = default;
+  hyperloglog& operator=(hyperloglog&&) = default;
 
   /**
    * @brief Asynchronously resets the estimator, i.e., clears the current count estimate.
@@ -162,9 +163,8 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  constexpr void merge_async(
-    distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
-    cuda::stream_ref stream = {});
+  constexpr void merge_async(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
+                             cuda::stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator into `*this` estimator.
@@ -181,7 +181,7 @@ class distinct_count_estimator {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope, class OtherAllocator>
-  constexpr void merge(distinct_count_estimator<T, OtherScope, Hash, OtherAllocator> const& other,
+  constexpr void merge(hyperloglog<T, OtherScope, Hash, OtherAllocator> const& other,
                        cuda::stream_ref stream = {});
 
   /**
@@ -227,7 +227,7 @@ class distinct_count_estimator {
   /**
    * @brief Get device ref.
    *
-   * @return Device ref object of the current `distinct_count_estimator` host object
+   * @return Device ref object of the current `hyperloglog` host object
    */
   [[nodiscard]] constexpr ref_type<> ref() const noexcept;
 
@@ -280,8 +280,16 @@ class distinct_count_estimator {
   [[nodiscard]] static constexpr std::size_t sketch_alignment() noexcept;
 
  private:
-  std::unique_ptr<impl_type> impl_;  ///< Implementation object
+  allocator_type allocator_;  ///< Allocator used to allocate device-accessible storage
+  std::unique_ptr<register_type, detail::custom_deleter<std::size_t, allocator_type>>
+    sketch_;        ///< Storage of the current `hyperloglog` object
+  ref_type<> ref_;  ///< Device ref of the current `hyperloglog` object
+
+  // Needs to be friends with other instantiations of this class template to have access to their
+  // storage
+  template <class T_, cuda::thread_scope Scope_, class Hash_, class Allocator_>
+  friend class hyperloglog;
 };
 }  // namespace cuco
 
-#include <cuco/detail/distinct_count_estimator/distinct_count_estimator.inl>
\ No newline at end of file
+#include <cuco/detail/hyperloglog/hyperloglog.inl>
\ No newline at end of file
diff --git a/include/cuco/distinct_count_estimator_ref.cuh b/include/cuco/hyperloglog_ref.cuh
similarity index 86%
rename from include/cuco/distinct_count_estimator_ref.cuh
rename to include/cuco/hyperloglog_ref.cuh
index 799bb46c7..8946fa8c1 100644
--- a/include/cuco/distinct_count_estimator_ref.cuh
+++ b/include/cuco/hyperloglog_ref.cuh
@@ -15,7 +15,7 @@
  */
 #pragma once
 
-#include <cuco/detail/hyperloglog/hyperloglog_ref.cuh>
+#include <cuco/detail/hyperloglog/hyperloglog_impl.cuh>
 #include <cuco/hash_functions.cuh>
 #include <cuco/types.cuh>
 #include <cuco/utility/cuda_thread_scope.cuh>
@@ -39,21 +39,22 @@ namespace cuco {
 template <class T,
           cuda::thread_scope Scope = cuda::thread_scope_device,
           class Hash               = cuco::xxhash_64<T>>
-class distinct_count_estimator_ref {
-  using impl_type = detail::hyperloglog_ref<T, Scope, Hash>;
+class hyperloglog_ref {
+  using impl_type = detail::hyperloglog_impl<T, Scope, Hash>;
 
  public:
   static constexpr auto thread_scope = impl_type::thread_scope;  ///< CUDA thread scope
 
-  using value_type = typename impl_type::value_type;  ///< Type of items to count
-  using hasher     = typename impl_type::hasher;      ///< Type of hash function
+  using value_type    = typename impl_type::value_type;     ///< Type of items to count
+  using hasher        = typename impl_type::hasher;         ///< Type of hash function
+  using register_type = typename impl_type::register_type;  ///< HLL register type
 
   template <cuda::thread_scope NewScope>
-  using with_scope = distinct_count_estimator_ref<T, NewScope, Hash>;  ///< Ref type with different
-                                                                       ///< thread scope
+  using with_scope = hyperloglog_ref<T, NewScope, Hash>;  ///< Ref type with different
+                                                          ///< thread scope
 
   /**
-   * @brief Constructs a non-owning `distinct_count_estimator_ref` object.
+   * @brief Constructs a non-owning `hyperloglog_ref` object.
    *
    * @throw If sketch size < 0.0625KB or 64B or standard deviation > 0.2765. Throws if called from
    * host; UB if called from device.
@@ -63,8 +64,8 @@ class distinct_count_estimator_ref {
    * @param sketch_span Reference to sketch storage
    * @param hash The hash function used to hash items
    */
-  __host__ __device__ constexpr distinct_count_estimator_ref(
-    cuda::std::span<cuda::std::byte> sketch_span, Hash const& hash = {});
+  __host__ __device__ constexpr hyperloglog_ref(cuda::std::span<cuda::std::byte> sketch_span,
+                                                Hash const& hash = {});
 
   /**
    * @brief Resets the estimator, i.e., clears the current count estimate.
@@ -144,7 +145,7 @@ class distinct_count_estimator_ref {
    */
   template <class CG, cuda::thread_scope OtherScope>
   __device__ constexpr void merge(CG const& group,
-                                  distinct_count_estimator_ref<T, OtherScope, Hash> const& other);
+                                  hyperloglog_ref<T, OtherScope, Hash> const& other);
 
   /**
    * @brief Asynchronously merges the result of `other` estimator reference into `*this` estimator.
@@ -157,8 +158,8 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ constexpr void merge_async(
-    distinct_count_estimator_ref<T, OtherScope, Hash> const& other, cuda::stream_ref stream = {});
+  __host__ constexpr void merge_async(hyperloglog_ref<T, OtherScope, Hash> const& other,
+                                      cuda::stream_ref stream = {});
 
   /**
    * @brief Merges the result of `other` estimator reference into `*this` estimator.
@@ -174,7 +175,7 @@ class distinct_count_estimator_ref {
    * @param stream CUDA stream this operation is executed in
    */
   template <cuda::thread_scope OtherScope>
-  __host__ constexpr void merge(distinct_count_estimator_ref<T, OtherScope, Hash> const& other,
+  __host__ constexpr void merge(hyperloglog_ref<T, OtherScope, Hash> const& other,
                                 cuda::stream_ref stream = {});
 
   /**
@@ -251,8 +252,8 @@ class distinct_count_estimator_ref {
   impl_type impl_;  ///< Implementation object
 
   template <class T_, cuda::thread_scope Scope_, class Hash_>
-  friend class distinct_count_estimator_ref;
+  friend class hyperloglog_ref;
 };
 }  // namespace cuco
 
-#include <cuco/detail/distinct_count_estimator/distinct_count_estimator_ref.inl>
+#include <cuco/detail/hyperloglog/hyperloglog_ref.inl>
diff --git a/include/cuco/types.cuh b/include/cuco/types.cuh
index eddc289df..f3c78d9df 100644
--- a/include/cuco/types.cuh
+++ b/include/cuco/types.cuh
@@ -46,7 +46,7 @@ CUCO_DEFINE_TEMPLATE_STRONG_TYPE(erased_key);
 
 /**
  * @brief A strong type wrapper `cuco::sketch_size_kb` for specifying the upper-bound sketch size of
- * `cuco::distinct_count_estimator(_ref)` in KB.
+ * `cuco::hyperloglog(_ref)` in KB.
  *
  * @note Values can also be specified as literals, e.g., 64.3_KB.
  */
@@ -54,7 +54,7 @@ CUCO_DEFINE_STRONG_TYPE(sketch_size_kb, double);
 
 /**
  * @brief A strong type wrapper `cuco::standard_deviation` for specifying the desired standard
- * deviation for the cardinality estimate of `cuco::distinct_count_estimator(_ref)`.
+ * deviation for the cardinality estimate of `cuco::hyperloglog(_ref)`.
  */
 CUCO_DEFINE_STRONG_TYPE(standard_deviation, double);
 
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index be88c524d..e1e036048 100644
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -130,8 +130,8 @@ ConfigureTest(DYNAMIC_BITSET_TEST
     dynamic_bitset/size_test.cu)
 
 ###################################################################################################
-# - distinct_count_estimator ----------------------------------------------------------------------
-ConfigureTest(DISTINCT_COUNT_ESTIMATOR_TEST
-    distinct_count_estimator/unique_sequence_test.cu
-    distinct_count_estimator/spark_parity_test.cu
-    distinct_count_estimator/device_ref_test.cu)
+# - hyperloglog -----------------------------------------------------------------------------------
+ConfigureTest(HYPERLOGLOG_TEST
+    hyperloglog/unique_sequence_test.cu
+    hyperloglog/spark_parity_test.cu
+    hyperloglog/device_ref_test.cu)
diff --git a/tests/distinct_count_estimator/device_ref_test.cu b/tests/hyperloglog/device_ref_test.cu
similarity index 93%
rename from tests/distinct_count_estimator/device_ref_test.cu
rename to tests/hyperloglog/device_ref_test.cu
index ad40e7ab6..ab5161d97 100644
--- a/tests/distinct_count_estimator/device_ref_test.cu
+++ b/tests/hyperloglog/device_ref_test.cu
@@ -16,8 +16,8 @@
 
 #include <test_utils.hpp>
 
-#include <cuco/distinct_count_estimator.cuh>
 #include <cuco/hash_functions.cuh>
+#include <cuco/hyperloglog.cuh>
 
 #include <cuda/std/cstddef>
 #include <thrust/device_vector.h>
@@ -55,14 +55,14 @@ __global__ void estimate_kernel(cuco::sketch_size_kb sketch_size_kb,
   }
 }
 
-TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: device ref",
+TEMPLATE_TEST_CASE_SIG("hyperloglog: device ref",
                        "",
                        ((typename T, typename Hash), T, Hash),
                        (int32_t, cuco::xxhash_64<int32_t>),
                        (int64_t, cuco::xxhash_64<int64_t>),
                        (__int128_t, cuco::xxhash_64<__int128_t>))
 {
-  using estimator_type = cuco::distinct_count_estimator<T, cuda::thread_scope_device, Hash>;
+  using estimator_type = cuco::hyperloglog<T, cuda::thread_scope_device, Hash>;
 
   auto num_items_pow2 = GENERATE(25, 26, 28);
   auto hll_precision  = GENERATE(8, 10, 12, 13);
diff --git a/tests/distinct_count_estimator/spark_parity_test.cu b/tests/hyperloglog/spark_parity_test.cu
similarity index 86%
rename from tests/distinct_count_estimator/spark_parity_test.cu
rename to tests/hyperloglog/spark_parity_test.cu
index 9004083a4..5abaf159d 100644
--- a/tests/distinct_count_estimator/spark_parity_test.cu
+++ b/tests/hyperloglog/spark_parity_test.cu
@@ -16,8 +16,8 @@
 
 #include <test_utils.hpp>
 
-#include <cuco/distinct_count_estimator.cuh>
 #include <cuco/hash_functions.cuh>
+#include <cuco/hyperloglog.cuh>
 
 #include <cuda/functional>
 #include <thrust/device_vector.h>
@@ -43,13 +43,12 @@
  */
 
 // TODO implement this test once add_if is available
-// TEST_CASE("distinct_count_estimator: Spark parity: add nulls", "")
+// TEST_CASE("hyperloglog: Spark parity: add nulls", "")
 
-TEST_CASE("distinct_count_estimator: Spark parity: deterministic cardinality estimation", "")
+TEST_CASE("hyperloglog: Spark parity: deterministic cardinality estimation", "")
 {
-  using T = int;
-  using estimator_type =
-    cuco::distinct_count_estimator<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
+  using T              = int;
+  using estimator_type = cuco::hyperloglog<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
 
   constexpr size_t repeats = 10;
   // This factor determines the error threshold for passing the test
@@ -101,13 +100,12 @@ TEST_CASE("distinct_count_estimator: Spark parity: deterministic cardinality est
 }
 
 // the following test is omitted since we refrain from doing randomized unit tests in cuco
-// TEST_CASE("distinct_count_estimator: Spark parity: random cardinality estimation", "")
+// TEST_CASE("hyperloglog: Spark parity: random cardinality estimation", "")
 
-TEST_CASE("distinct_count_estimator: Spark parity: merging HLL instances", "")
+TEST_CASE("hyperloglog: Spark parity: merging HLL instances", "")
 {
-  using T = int;
-  using estimator_type =
-    cuco::distinct_count_estimator<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
+  using T              = int;
+  using estimator_type = cuco::hyperloglog<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
 
   auto num_items          = 1000000;
   auto standard_deviation = cuco::standard_deviation(0.05);
@@ -148,11 +146,11 @@ TEST_CASE("distinct_count_estimator: Spark parity: merging HLL instances", "")
 The following unit tests fail since xxhash_64 does not deduplicate different bit patterns for NaN
 values and +-0.0. They are thus counted as distinct items.
 
-TEST_CASE("distinct_count_estimator: Spark parity: add 0.0 and -0.0", "")
+TEST_CASE("hyperloglog: Spark parity: add 0.0 and -0.0", "")
 {
   using T = double;
   using estimator_type =
-    cuco::distinct_count_estimator<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
+    cuco::hyperloglog<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
 
   auto standard_deviation = cuco::standard_deviation(0.05);
 
@@ -164,11 +162,11 @@ TEST_CASE("distinct_count_estimator: Spark parity: add 0.0 and -0.0", "")
   REQUIRE(estimator.estimate() == 1);
 }
 
-TEST_CASE("distinct_count_estimator: Spark parity: add NaN", "")
+TEST_CASE("hyperloglog: Spark parity: add NaN", "")
 {
   using T = double;
   using estimator_type =
-    cuco::distinct_count_estimator<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
+    cuco::hyperloglog<T, cuda::thread_scope_device, cuco::xxhash_64<T>>;
 
   auto standard_deviation = cuco::standard_deviation(0.05);
 
diff --git a/tests/distinct_count_estimator/unique_sequence_test.cu b/tests/hyperloglog/unique_sequence_test.cu
similarity index 93%
rename from tests/distinct_count_estimator/unique_sequence_test.cu
rename to tests/hyperloglog/unique_sequence_test.cu
index 8883a218b..c9f2bbaa4 100644
--- a/tests/distinct_count_estimator/unique_sequence_test.cu
+++ b/tests/hyperloglog/unique_sequence_test.cu
@@ -16,8 +16,8 @@
 
 #include <test_utils.hpp>
 
-#include <cuco/distinct_count_estimator.cuh>
 #include <cuco/hash_functions.cuh>
+#include <cuco/hyperloglog.cuh>
 
 #include <thrust/device_vector.h>
 #include <thrust/sequence.h>
@@ -29,7 +29,7 @@
 #include <cstddef>
 #include <cstdint>
 
-TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
+TEMPLATE_TEST_CASE_SIG("hyperloglog: unique sequence",
                        "",
                        ((typename T, typename Hash), T, Hash),
                        (int32_t, cuco::xxhash_64<int32_t>),
@@ -56,7 +56,7 @@ TEMPLATE_TEST_CASE_SIG("distinct_count_estimator: unique sequence",
   thrust::sequence(items.begin(), items.end(), 0);
 
   // Initialize the estimator
-  cuco::distinct_count_estimator<T, cuda::thread_scope_device, Hash> estimator{
+  cuco::hyperloglog<T, cuda::thread_scope_device, Hash> estimator{
     cuco::sketch_size_kb(sketch_size_kb)};
 
   REQUIRE(estimator.estimate() == 0);