From 11a8dd061e671585f49c2c6eeaabe14a3ae146e0 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 2 Aug 2023 17:01:39 -0400
Subject: [PATCH 01/13] Fix Byte-Pair-Encoding usage of cuco static-map for
 storing merge-pairs

---
 cpp/include/nvtext/bpe_tokenize.hpp      |  8 +---
 cpp/src/text/subword/bpe_tokenizer.cu    | 38 ++++++-----------
 cpp/src/text/subword/bpe_tokenizer.cuh   | 52 +++++++++++++++++++++---
 cpp/src/text/subword/load_merges_file.cu | 24 ++++-------
 4 files changed, 66 insertions(+), 56 deletions(-)
diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index b93d93b07c6..4438f262fae 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -68,12 +68,6 @@ struct bpe_merge_pairs {
    * @return The number of merge pairs in the table
    */
   cudf::size_type get_size();
-  /**
-   * @brief  Returns the number of unique merge pairs in the table.
-   *
-   * @return The number of unique merge pairs in the table
-   */
-  std::size_t get_map_size();
 };
 
 /**
diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 413fb2497c0..d1c443a63a6 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -80,10 +80,11 @@ __device__ cudf::string_view get_first_token(cudf::string_view const& d_str)
  *
  * @see The byte_pair_encoding_fn::operator() function below for details.
  */
+template <typename MapRefType>
 struct byte_pair_encoding_fn {
   cudf::column_device_view const d_merges;
   cudf::column_device_view const d_strings;
-  merge_pairs_map_type::device_view const d_map;
+  MapRefType const d_map;
   cudf::size_type* d_sizes;  // output size of encoded string
   string_hasher_type const hasher;
   cudf::size_type* d_byte_indices;
@@ -135,18 +136,7 @@ struct byte_pair_encoding_fn {
     return cudf::string_view(d_str.data() + *begin, size);
   }
 
-  /**
-   * @brief Compute the hash over the input strings.
-   *
-   * The input strings are combined with a space to produce hash for matching
-   * a merge pair within the `d_map`.
-   *
-   * @param lhs First string.
-   * @param rhs Second string.
-   * @return The hash value to match with `d_map`.
-   */
-  __device__ cudf::hash_value_type compute_hash(cudf::string_view const& lhs,
-                                                cudf::string_view const& rhs)
+  __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs)
   {
     __shared__ char shmem[48 * 1024];  // max for Pascal
     auto const total_size         = lhs.size_bytes() + rhs.size_bytes() + 1;
@@ -155,7 +145,7 @@ struct byte_pair_encoding_fn {
     // Edge case check.
     // Empirically found only two merge pair strings that were greater than 70 bytes
     // and they both looked like ignorable errors. Double check this analysis with Vibhu.
-    if (thread_memory_size < total_size) { return 0; }
+    if (thread_memory_size < total_size) { return d_map.end(); }
 
     // build the target string in shared memory
     char* ptr = &shmem[threadIdx.x * thread_memory_size];
@@ -165,8 +155,8 @@ struct byte_pair_encoding_fn {
     memcpy(ptr + lhs.size_bytes(), " ", 1);
     memcpy(ptr + lhs.size_bytes() + 1, rhs.data(), rhs.size_bytes());
 
-    auto const d_hash_str = cudf::string_view(ptr, total_size);
-    return hasher(d_hash_str);  // return the hash for the temp string
+    auto const d_str = cudf::string_view(ptr, total_size);
+    return d_map.find(d_str);
   }
 
   /**
@@ -233,11 +223,10 @@ struct byte_pair_encoding_fn {
         auto const rhs = next_substr(itr, end, d_str);
         if (rhs.empty()) break;  // no more adjacent pairs
 
-        auto const hash    = compute_hash(lhs, rhs);
-        auto const map_itr = d_map.find(hash, thrust::identity<cudf::hash_value_type>{});
-        if (map_itr != d_map.end()) {
+        auto const map_itr = get_merge_pair(lhs, rhs);
+        if (!(map_itr == d_map.end())) {
           // found a match; record the rank (and other min_ vars)
-          auto const rank = static_cast<cudf::size_type>(map_itr->second);
+          auto const rank = static_cast<cudf::size_type>((*map_itr).second);
           if (rank < min_rank) {
             min_rank = rank;
             min_itr  = itr;
@@ -369,12 +358,9 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
                                            rmm::mr::get_current_device_resource());
   auto d_offsets = offsets->mutable_view().data<cudf::offset_type>();
 
-  byte_pair_encoding_fn fn{*d_merges,
-                           *d_strings,
-                           merge_pairs.get_merge_pairs_map(),
-                           d_offsets,
-                           string_hasher_type{},
-                           d_byte_indices.data()};
+  auto map_ref = merge_pairs.get_merge_pairs_map();
+  byte_pair_encoding_fn<decltype(map_ref)> fn{
+    *d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()};
   thrust::for_each_n(
     rmm::exec_policy(stream), thrust::make_counting_iterator<cudf::size_type>(0), input.size(), fn);
 
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 0697a9961c7..836166f5b40 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -21,12 +21,15 @@
 #include <hash/hash_allocator.cuh>
 
 #include <cudf/column/column.hpp>
+#include <cudf/column/column_device_view.cuh>
 #include <cudf/hashing/detail/murmurhash3_x86_32.cuh>
+#include <cudf/strings/string_view.cuh>
 
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/polymorphic_allocator.hpp>
 
+#include <cuco/operator.hpp>
 #include <cuco/static_map.cuh>
 
 #include <cstdint>
@@ -34,14 +37,51 @@
 namespace nvtext {
 namespace detail {
 
+using hash_value_type    = uint32_t;
+using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
+
+struct bpe_hasher {
+  cudf::column_device_view const d_strings;
+  string_hasher_type hasher{};
+  // used by insert
+  __device__ hash_value_type operator()(cudf::size_type index) const
+  {
+    return hasher(d_strings.element<cudf::string_view>(index));
+  }
+  // used by find
+  __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); }
+};
+
+struct bpe_equal {
+  cudf::column_device_view const d_strings;
+  // used by insert
+  __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const noexcept
+  {
+    return d_strings.element<cudf::string_view>(lhs) == d_strings.element<cudf::string_view>(rhs);
+  }
+  // used by find
+  __device__ bool operator()(cudf::size_type lhs, cudf::string_view const& rhs) const noexcept
+  {
+    return d_strings.element<cudf::string_view>(lhs) == rhs;
+  }
+};
+
 using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
 
-using merge_pairs_map_type = cuco::static_map<cudf::hash_value_type,
-                                              cudf::size_type,
-                                              cuda::thread_scope_device,
-                                              hash_table_allocator_type>;
+using probe_scheme = cuco::experimental::double_hashing<1, bpe_hasher>;
 
-using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
+using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
+                                                            cudf::size_type,
+                                                            cuco::experimental::extent<std::size_t>,
+                                                            cuda::thread_scope_device,
+                                                            bpe_equal,
+                                                            probe_scheme,
+                                                            hash_table_allocator_type>;
+
+// using merge_pairs_map_type = cuco::static_map<cudf::hash_value_type,
+//                                               cudf::size_type,
+//                                               cuda::thread_scope_device,
+//                                               hash_table_allocator_type>;
 
 }  // namespace detail
 
@@ -53,7 +93,7 @@ struct bpe_merge_pairs::bpe_merge_pairs_impl {
                        std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map);
 
   auto get_merge_pairs() const { return merge_pairs->view(); }
-  auto get_merge_pairs_map() const { return merge_pairs_map->get_device_view(); }
+  auto get_merge_pairs_map() const { return merge_pairs_map->ref(cuco::experimental::op::find); }
 };
 
 }  // namespace nvtext
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index dffe035ad35..d83263118a1 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -36,20 +36,13 @@
 
 namespace nvtext {
 namespace detail {
-
 namespace {
 
 struct make_pair_function {
-  /**
-   * @brief Hash the merge pair entry
-   */
-  __device__ cuco::pair<cudf::hash_value_type, cudf::size_type> operator()(cudf::size_type idx)
+  __device__ cuco::pair<cudf::size_type, cudf::size_type> operator()(cudf::size_type idx)
   {
-    auto const result = _hasher(d_strings.element<cudf::string_view>(idx));
-    return cuco::make_pair(result, idx);
+    return cuco::make_pair(idx, idx);
   }
-
-  string_hasher_type const _hasher;
   cudf::column_device_view const d_strings;
 };
 
@@ -103,24 +96,22 @@ std::unique_ptr<cudf::column> load_file_to_column(std::string const& filename_me
 std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   cudf::strings_column_view const& input, rmm::cuda_stream_view stream)
 {
+  auto d_strings = cudf::column_device_view::create(input.parent(), stream);
   // Ensure capacity is at least (size/0.7) as documented here:
   // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
   auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
     static_cast<size_t>(input.size() * 2),  // capacity is 2x;
     cuco::empty_key{std::numeric_limits<cudf::hash_value_type>::max()},
     cuco::empty_value{-1},                  // empty value is not used
+    bpe_equal{*d_strings},
+    probe_scheme{bpe_hasher{*d_strings}},
     hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value());
 
-  auto d_strings = cudf::column_device_view::create(input.parent(), stream);
-  make_pair_function pair_func{string_hasher_type{}, *d_strings};
+  make_pair_function pair_func{*d_strings};
   auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func);
 
-  merge_pairs_map->insert(iter,
-                          iter + input.size(),
-                          thrust::identity<cudf::hash_value_type>{},
-                          thrust::equal_to<cudf::hash_value_type>{},
-                          stream.value());
+  merge_pairs_map->insert(iter, iter + input.size(), stream.value());
 
   return merge_pairs_map;
 }
@@ -185,6 +176,5 @@ bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input,
 bpe_merge_pairs::~bpe_merge_pairs() = default;
 
 cudf::size_type bpe_merge_pairs::get_size() { return impl->merge_pairs->size(); }
-std::size_t bpe_merge_pairs::get_map_size() { return impl->merge_pairs_map->get_size(); }
 
 }  // namespace nvtext

From b3e01f06abe13c208973f8d7c714525299e25231 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 2 Aug 2023 15:37:55 -0700
Subject: [PATCH 02/13] Use linear probing + use proper key sentinel

---
 cpp/cmake/thirdparty/get_cucollections.cmake | 16 +++++++++-------
 cpp/src/text/subword/bpe_tokenizer.cuh       |  2 +-
 cpp/src/text/subword/load_merges_file.cu     |  2 +-
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index 9758958b44f..b1aaad4d573 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -14,12 +14,14 @@
 
 # This function finds cuCollections and performs any additional configuration.
 function(find_and_configure_cucollections)
-  include(${rapids-cmake-dir}/cpm/cuco.cmake)
-  if(BUILD_SHARED_LIBS)
-    rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports)
-  else()
-    rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports)
-  endif()
+  rapids_cpm_find(
+    cuco 0.0.1
+    GLOBAL_TARGETS cuco::cuco
+    CPM_ARGS
+    GIT_REPOSITORY https://github.com/PointKernel/cuCollections.git
+    GIT_TAG fix-ref-issues
+    OPTIONS "BUILD_TESTS OFF" ￼ "BUILD_BENCHMARKS OFF" ￼ "BUILD_EXAMPLES OFF"
+  )
 endfunction()
 
 find_and_configure_cucollections()
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 836166f5b40..0926839c4ac 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -68,7 +68,7 @@ struct bpe_equal {
 
 using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
 
-using probe_scheme = cuco::experimental::double_hashing<1, bpe_hasher>;
+using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>;
 
 using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
                                                             cudf::size_type,
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index d83263118a1..99be5f485da 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -101,7 +101,7 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
   auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
     static_cast<size_t>(input.size() * 2),  // capacity is 2x;
-    cuco::empty_key{std::numeric_limits<cudf::hash_value_type>::max()},
+    cuco::empty_key{-1},
     cuco::empty_value{-1},                  // empty value is not used
     bpe_equal{*d_strings},
     probe_scheme{bpe_hasher{*d_strings}},

From ac73b10d79baf8c583178b64f8c30d70cc5f4523 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 2 Aug 2023 15:57:23 -0700
Subject: [PATCH 03/13] Use != operator for map iterators

---
 cpp/src/text/subword/bpe_tokenizer.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index d1c443a63a6..fc0784d757f 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -224,7 +224,7 @@ struct byte_pair_encoding_fn {
         if (rhs.empty()) break;  // no more adjacent pairs
 
         auto const map_itr = get_merge_pair(lhs, rhs);
-        if (!(map_itr == d_map.end())) {
+        if (map_itr != d_map.end()) {
           // found a match; record the rank (and other min_ vars)
           auto const rank = static_cast<cudf::size_type>((*map_itr).second);
           if (rank < min_rank) {

From 326c2fac390d9515fcc39d45f6e13ce2d3e8feba Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 2 Aug 2023 16:13:16 -0700
Subject: [PATCH 04/13] Use access operator

---
 cpp/src/text/subword/bpe_tokenizer.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index fc0784d757f..f3e9dff43eb 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -226,7 +226,7 @@ struct byte_pair_encoding_fn {
         auto const map_itr = get_merge_pair(lhs, rhs);
         if (map_itr != d_map.end()) {
           // found a match; record the rank (and other min_ vars)
-          auto const rank = static_cast<cudf::size_type>((*map_itr).second);
+          auto const rank = map_itr->second;
           if (rank < min_rank) {
             min_rank = rank;
             min_itr  = itr;

From fceefc443d18d10317b8e473d103eef0373e1454 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 3 Aug 2023 10:15:18 -0400
Subject: [PATCH 05/13] use linear-probe scheme

---
 cpp/src/text/subword/bpe_tokenizer.cuh   | 2 +-
 cpp/src/text/subword/load_merges_file.cu | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 836166f5b40..0926839c4ac 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -68,7 +68,7 @@ struct bpe_equal {
 
 using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
 
-using probe_scheme = cuco::experimental::double_hashing<1, bpe_hasher>;
+using probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>;
 
 using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
                                                             cudf::size_type,
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index f77e3569f5c..ab640531c8d 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -101,7 +101,7 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
   auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
     static_cast<size_t>(input.size() * 2),  // capacity is 2x;
-    cuco::empty_key{std::numeric_limits<cudf::hash_value_type>::max()},
+    cuco::empty_key{-1},
     cuco::empty_value{-1},                  // empty value is not used
     bpe_equal{*d_strings},
     probe_scheme{bpe_hasher{*d_strings}},

From 91ab4fb33cd079fa1309426c3e51f6f01d3793d7 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Sun, 6 Aug 2023 11:21:36 -0400
Subject: [PATCH 06/13] keep merge-pairs column-device-view alive

---
 cpp/src/text/subword/bpe_tokenizer.cuh   |  9 +++++++--
 cpp/src/text/subword/load_merges_file.cu | 22 +++++++++++++---------
 2 files changed, 20 insertions(+), 11 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 0926839c4ac..7a8395e3bfb 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -87,10 +87,15 @@ using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
 
 struct bpe_merge_pairs::bpe_merge_pairs_impl {
   std::unique_ptr<cudf::column> const merge_pairs;
+  std::unique_ptr<cudf::column_device_view, std::function<void(cudf::column_device_view*)>> const
+    d_merge_pairs;
   std::unique_ptr<detail::merge_pairs_map_type> merge_pairs_map;
 
-  bpe_merge_pairs_impl(std::unique_ptr<cudf::column>&& merge_pairs,
-                       std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map);
+  bpe_merge_pairs_impl(
+    std::unique_ptr<cudf::column>&& merge_pairs,
+    std::unique_ptr<cudf::column_device_view, std::function<void(cudf::column_device_view*)>>&&
+      d_merge_pairs,
+    std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map);
 
   auto get_merge_pairs() const { return merge_pairs->view(); }
   auto get_merge_pairs_map() const { return merge_pairs_map->ref(cuco::experimental::op::find); }
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index ab640531c8d..d881b386a57 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -94,21 +94,20 @@ std::unique_ptr<cudf::column> load_file_to_column(std::string const& filename_me
 }
 
 std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
-  cudf::strings_column_view const& input, rmm::cuda_stream_view stream)
+  cudf::column_device_view const& input, rmm::cuda_stream_view stream)
 {
-  auto d_strings = cudf::column_device_view::create(input.parent(), stream);
   // Ensure capacity is at least (size/0.7) as documented here:
   // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
   auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
     static_cast<size_t>(input.size() * 2),  // capacity is 2x;
     cuco::empty_key{-1},
     cuco::empty_value{-1},                  // empty value is not used
-    bpe_equal{*d_strings},
-    probe_scheme{bpe_hasher{*d_strings}},
+    bpe_equal{input},
+    probe_scheme{bpe_hasher{input}},
     hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value());
 
-  make_pair_function pair_func{*d_strings};
+  make_pair_function pair_func{input};
   auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func);
 
   merge_pairs_map->insert(iter, iter + input.size(), stream.value());
@@ -119,9 +118,10 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
 std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(
   std::unique_ptr<cudf::column>&& input, rmm::cuda_stream_view stream)
 {
-  auto merge_pairs = initialize_merge_pairs_map(cudf::strings_column_view(input->view()), stream);
-  return std::make_unique<nvtext::bpe_merge_pairs::bpe_merge_pairs_impl>(std::move(input),
-                                                                         std::move(merge_pairs));
+  auto d_strings   = cudf::column_device_view::create(input->view(), stream);
+  auto merge_pairs = initialize_merge_pairs_map(*d_strings, stream);
+  return std::make_unique<nvtext::bpe_merge_pairs::bpe_merge_pairs_impl>(
+    std::move(input), std::move(d_strings), std::move(merge_pairs));
 }
 
 std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(
@@ -154,8 +154,12 @@ std::unique_ptr<bpe_merge_pairs> load_merge_pairs_file(std::string const& filena
 
 bpe_merge_pairs::bpe_merge_pairs_impl::bpe_merge_pairs_impl(
   std::unique_ptr<cudf::column>&& merge_pairs,
+  std::unique_ptr<cudf::column_device_view, std::function<void(cudf::column_device_view*)>>&&
+    d_merge_pairs,
   std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map)
-  : merge_pairs(std::move(merge_pairs)), merge_pairs_map(std::move(merge_pairs_map))
+  : merge_pairs(std::move(merge_pairs)),
+    d_merge_pairs(std::move(d_merge_pairs)),
+    merge_pairs_map(std::move(merge_pairs_map))
 {
 }
 

From 6a4fcf607ff54d8382a8154a7b7a224c46d35ae2 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Sun, 6 Aug 2023 16:37:24 -0400
Subject: [PATCH 07/13] cleanup col-device-view declaration in impl class

---
 cpp/src/text/subword/bpe_tokenizer.cuh   | 24 ++++++++++++------------
 cpp/src/text/subword/load_merges_file.cu | 23 +++++++++++------------
 2 files changed, 23 insertions(+), 24 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 7a8395e3bfb..583ff876072 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -33,6 +33,7 @@
 #include <cuco/static_map.cuh>
 
 #include <cstdint>
+#include <type_traits>
 
 namespace nvtext {
 namespace detail {
@@ -78,24 +79,23 @@ using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
                                                             probe_scheme,
                                                             hash_table_allocator_type>;
 
-// using merge_pairs_map_type = cuco::static_map<cudf::hash_value_type,
-//                                               cudf::size_type,
-//                                               cuda::thread_scope_device,
-//                                               hash_table_allocator_type>;
-
 }  // namespace detail
 
+// since column_device_view::create returns is a little more than
+// std::unique_ptr<column_device_view> this helper simplifies the return type in a more maintainable
+// way
+using col_device_view = std::invoke_result_t<decltype(&cudf::column_device_view::create),
+                                             cudf::column_view,
+                                             rmm::cuda_stream_view>;
+
 struct bpe_merge_pairs::bpe_merge_pairs_impl {
   std::unique_ptr<cudf::column> const merge_pairs;
-  std::unique_ptr<cudf::column_device_view, std::function<void(cudf::column_device_view*)>> const
-    d_merge_pairs;
+  col_device_view const d_merge_pairs;
   std::unique_ptr<detail::merge_pairs_map_type> merge_pairs_map;
 
-  bpe_merge_pairs_impl(
-    std::unique_ptr<cudf::column>&& merge_pairs,
-    std::unique_ptr<cudf::column_device_view, std::function<void(cudf::column_device_view*)>>&&
-      d_merge_pairs,
-    std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map);
+  bpe_merge_pairs_impl(std::unique_ptr<cudf::column>&& merge_pairs,
+                       col_device_view&& d_merge_pairs,
+                       std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map);
 
   auto get_merge_pairs() const { return merge_pairs->view(); }
   auto get_merge_pairs_map() const { return merge_pairs_map->ref(cuco::experimental::op::find); }
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index d881b386a57..28454da8d3b 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -38,13 +38,12 @@ namespace nvtext {
 namespace detail {
 namespace {
 
-struct make_pair_function {
-  __device__ cuco::pair<cudf::size_type, cudf::size_type> operator()(cudf::size_type idx)
-  {
-    return cuco::make_pair(idx, idx);
-  }
-  cudf::column_device_view const d_strings;
-};
+// struct make_pair_function {
+//   __device__ cuco::pair<cudf::size_type, cudf::size_type> operator()(cudf::size_type idx)
+//   {
+//     return cuco::make_pair(idx, idx);
+//   }
+// };
 
 /**
  * @brief Loads a text file of merge-pairs into a strings column.
@@ -107,8 +106,8 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
     hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value());
 
-  make_pair_function pair_func{input};
-  auto iter = cudf::detail::make_counting_transform_iterator(0, pair_func);
+  auto iter = cudf::detail::make_counting_transform_iterator(
+    0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); });
 
   merge_pairs_map->insert(iter, iter + input.size(), stream.value());
 
@@ -118,10 +117,10 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
 std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(
   std::unique_ptr<cudf::column>&& input, rmm::cuda_stream_view stream)
 {
-  auto d_strings   = cudf::column_device_view::create(input->view(), stream);
-  auto merge_pairs = initialize_merge_pairs_map(*d_strings, stream);
+  auto d_input     = cudf::column_device_view::create(input->view(), stream);
+  auto merge_pairs = initialize_merge_pairs_map(*d_input, stream);
   return std::make_unique<nvtext::bpe_merge_pairs::bpe_merge_pairs_impl>(
-    std::move(input), std::move(d_strings), std::move(merge_pairs));
+    std::move(input), std::move(d_input), std::move(merge_pairs));
 }
 
 std::unique_ptr<bpe_merge_pairs::bpe_merge_pairs_impl> create_bpe_merge_pairs_impl(

From 0008bd21674a1e337107ee6a1ced5c01d40aee86 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Sun, 6 Aug 2023 20:05:42 -0400
Subject: [PATCH 08/13] cleanup code; add comments

---
 cpp/include/nvtext/bpe_tokenize.hpp      |  7 -------
 cpp/src/text/subword/bpe_tokenizer.cuh   | 12 ++++++++++++
 cpp/src/text/subword/load_merges_file.cu | 11 +----------
 3 files changed, 13 insertions(+), 17 deletions(-)

diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp
index 4438f262fae..c67f4bd8b1c 100644
--- a/cpp/include/nvtext/bpe_tokenize.hpp
+++ b/cpp/include/nvtext/bpe_tokenize.hpp
@@ -61,13 +61,6 @@ struct bpe_merge_pairs {
                   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
   ~bpe_merge_pairs();
-
-  /**
-   * @brief Returns the number of merge pairs in the table.
-   *
-   * @return The number of merge pairs in the table
-   */
-  cudf::size_type get_size();
 };
 
 /**
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 583ff876072..916ea20d262 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -41,6 +41,12 @@ namespace detail {
 using hash_value_type    = uint32_t;
 using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
 
+/**
+ * @brief Hasher function used for building and using the cuco static-map
+ *
+ * This takes advantage of hetergenouse lookup feature in cuco static-map which
+ * allows inserting with one type (index)  and looking up with a different type (string).
+ */
 struct bpe_hasher {
   cudf::column_device_view const d_strings;
   string_hasher_type hasher{};
@@ -53,6 +59,12 @@ struct bpe_hasher {
   __device__ hash_value_type operator()(cudf::string_view const& s) const { return hasher(s); }
 };
 
+/**
+ * @brief Equal function used for building and using the cuco static-map
+ *
+ * This takes advantage of hetergenouse lookup feature in cuco static-map which
+ * allows inserting with one type (index)  and looking up with a different type (string).
+ */
 struct bpe_equal {
   cudf::column_device_view const d_strings;
   // used by insert
diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
index 28454da8d3b..1f1b90b3f49 100644
--- a/cpp/src/text/subword/load_merges_file.cu
+++ b/cpp/src/text/subword/load_merges_file.cu
@@ -38,13 +38,6 @@ namespace nvtext {
 namespace detail {
 namespace {
 
-// struct make_pair_function {
-//   __device__ cuco::pair<cudf::size_type, cudf::size_type> operator()(cudf::size_type idx)
-//   {
-//     return cuco::make_pair(idx, idx);
-//   }
-// };
-
 /**
  * @brief Loads a text file of merge-pairs into a strings column.
  *
@@ -109,7 +102,7 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   auto iter = cudf::detail::make_counting_transform_iterator(
     0, [] __device__(cudf::size_type idx) { return cuco::make_pair(idx, idx); });
 
-  merge_pairs_map->insert(iter, iter + input.size(), stream.value());
+  merge_pairs_map->insert_async(iter, iter + input.size(), stream.value());
 
   return merge_pairs_map;
 }
@@ -178,6 +171,4 @@ bpe_merge_pairs::bpe_merge_pairs(cudf::strings_column_view const& input,
 
 bpe_merge_pairs::~bpe_merge_pairs() = default;
 
-cudf::size_type bpe_merge_pairs::get_size() { return impl->merge_pairs->size(); }
-
 }  // namespace nvtext

From 40755df11fb7932228b18a76fafcbed3e4b7d94c Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Mon, 7 Aug 2023 11:58:46 -0700
Subject: [PATCH 09/13] Update cuco git tag

---
 cpp/cmake/thirdparty/get_cucollections.cmake | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index b1aaad4d573..d81e5eb1970 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -18,8 +18,8 @@ function(find_and_configure_cucollections)
     cuco 0.0.1
     GLOBAL_TARGETS cuco::cuco
     CPM_ARGS
-    GIT_REPOSITORY https://github.com/PointKernel/cuCollections.git
-    GIT_TAG fix-ref-issues
+    GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git
+    GIT_TAG 5186b39522e13a3681c0eb591db4eaacbf969485
     OPTIONS "BUILD_TESTS OFF" ￼ "BUILD_BENCHMARKS OFF" ￼ "BUILD_EXAMPLES OFF"
   )
 endfunction()

From 440b8b5114778c8ed9a60c53611221cd13fe172d Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 11 Aug 2023 12:22:18 -0400
Subject: [PATCH 10/13] reuse already created col-dev-view

---
 cpp/src/text/subword/bpe_tokenizer.cu  | 15 +++++++++++----
 cpp/src/text/subword/bpe_tokenizer.cuh | 12 ++++++------
 2 files changed, 17 insertions(+), 10 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index b65a3fef341..30862563c49 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -136,6 +136,13 @@ struct byte_pair_encoding_fn {
     return cudf::string_view(d_str.data() + *begin, size);
   }
 
+  /**
+   * @brief Lookup the pair of strings in the d_map/d_merges
+   *
+   * @param lhs Left half of the string
+   * @param rhs Right half of the string
+   * @return Position of merge pair within d_map
+   */
   __device__ auto get_merge_pair(cudf::string_view const& lhs, cudf::string_view const& rhs)
   {
     __shared__ char shmem[48 * 1024];  // max for Pascal
@@ -343,12 +350,12 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
   bpe_merge_pairs::bpe_merge_pairs_impl const& merge_pairs,
   rmm::cuda_stream_view stream)
 {
-  CUDF_EXPECTS(!merge_pairs.get_merge_pairs().is_empty(), "Merge pairs table must not be empty");
+  auto const d_merges = merge_pairs.get_merge_pairs();
+  CUDF_EXPECTS(d_merges.size() > 0, "Merge pairs table must not be empty");
 
   // build working vector to hold index values per byte
   rmm::device_uvector<cudf::size_type> d_byte_indices(input.chars().size(), stream);
 
-  auto const d_merges  = cudf::column_device_view::create(merge_pairs.get_merge_pairs(), stream);
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
   auto offsets   = cudf::make_numeric_column(cudf::data_type{cudf::type_to_id<cudf::size_type>()},
@@ -358,9 +365,9 @@ std::unique_ptr<cudf::column> byte_pair_encoding(
                                            rmm::mr::get_current_device_resource());
   auto d_offsets = offsets->mutable_view().data<cudf::size_type>();
 
-  auto map_ref = merge_pairs.get_merge_pairs_map();
+  auto map_ref = merge_pairs.get_merge_pairs_ref();
   byte_pair_encoding_fn<decltype(map_ref)> fn{
-    *d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()};
+    d_merges, *d_strings, map_ref, d_offsets, string_hasher_type{}, d_byte_indices.data()};
   thrust::for_each_n(
     rmm::exec_policy(stream), thrust::make_counting_iterator<cudf::size_type>(0), input.size(), fn);
 
diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 916ea20d262..6ee3b236b90 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -44,8 +44,8 @@ using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::strin
 /**
  * @brief Hasher function used for building and using the cuco static-map
  *
- * This takes advantage of hetergenouse lookup feature in cuco static-map which
- * allows inserting with one type (index)  and looking up with a different type (string).
+ * This takes advantage of heterogeneous lookup feature in cuco static-map which
+ * allows inserting with one type (index) and looking up with a different type (string).
  */
 struct bpe_hasher {
   cudf::column_device_view const d_strings;
@@ -62,8 +62,8 @@ struct bpe_hasher {
 /**
  * @brief Equal function used for building and using the cuco static-map
  *
- * This takes advantage of hetergenouse lookup feature in cuco static-map which
- * allows inserting with one type (index)  and looking up with a different type (string).
+ * This takes advantage of heterogeneous lookup feature in cuco static-map which
+ * allows inserting with one type (index) and looking up with a different type (string).
  */
 struct bpe_equal {
   cudf::column_device_view const d_strings;
@@ -109,8 +109,8 @@ struct bpe_merge_pairs::bpe_merge_pairs_impl {
                        col_device_view&& d_merge_pairs,
                        std::unique_ptr<detail::merge_pairs_map_type>&& merge_pairs_map);
 
-  auto get_merge_pairs() const { return merge_pairs->view(); }
-  auto get_merge_pairs_map() const { return merge_pairs_map->ref(cuco::experimental::op::find); }
+  auto const get_merge_pairs() const { return *d_merge_pairs; }
+  auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); }
 };
 
 }  // namespace nvtext

From d6234c6aaeed429ec08d71dfeb880709bc54fa7d Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 11 Aug 2023 11:52:29 -0700
Subject: [PATCH 11/13] Revert temporary CMake changes

---
 cpp/cmake/thirdparty/get_cucollections.cmake | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_cucollections.cmake b/cpp/cmake/thirdparty/get_cucollections.cmake
index d81e5eb1970..9758958b44f 100644
--- a/cpp/cmake/thirdparty/get_cucollections.cmake
+++ b/cpp/cmake/thirdparty/get_cucollections.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2022, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -14,14 +14,12 @@
 
 # This function finds cuCollections and performs any additional configuration.
 function(find_and_configure_cucollections)
-  rapids_cpm_find(
-    cuco 0.0.1
-    GLOBAL_TARGETS cuco::cuco
-    CPM_ARGS
-    GIT_REPOSITORY https://github.com/NVIDIA/cuCollections.git
-    GIT_TAG 5186b39522e13a3681c0eb591db4eaacbf969485
-    OPTIONS "BUILD_TESTS OFF" ￼ "BUILD_BENCHMARKS OFF" ￼ "BUILD_EXAMPLES OFF"
-  )
+  include(${rapids-cmake-dir}/cpm/cuco.cmake)
+  if(BUILD_SHARED_LIBS)
+    rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports)
+  else()
+    rapids_cpm_cuco(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-exports)
+  endif()
 endfunction()
 
 find_and_configure_cucollections()

From caf5e68e5ad8eb268eb289ceca27e273cfa7ea05 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 11 Aug 2023 16:49:22 -0400
Subject: [PATCH 12/13] remove unneeded include

---
 cpp/src/text/subword/bpe_tokenizer.cuh | 1 -
 1 file changed, 1 deletion(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
index 6ee3b236b90..83aa22aaae9 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cuh
+++ b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -29,7 +29,6 @@
 #include <rmm/device_uvector.hpp>
 #include <rmm/mr/device/polymorphic_allocator.hpp>
 
-#include <cuco/operator.hpp>
 #include <cuco/static_map.cuh>
 
 #include <cstdint>

From 708d8fda380e5bb0510e0f1de45cf92e3ace55fb Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 15 Aug 2023 20:19:20 -0400
Subject: [PATCH 13/13] fix some comments

---
 cpp/src/text/subword/bpe_tokenizer.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
index 30862563c49..4c4f5b3a4b1 100644
--- a/cpp/src/text/subword/bpe_tokenizer.cu
+++ b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -137,7 +137,7 @@ struct byte_pair_encoding_fn {
   }
 
   /**
-   * @brief Lookup the pair of strings in the d_map/d_merges
+   * @brief Look up the pair of strings in the d_map/d_merges
    *
    * @param lhs Left half of the string
    * @param rhs Right half of the string
@@ -151,7 +151,7 @@ struct byte_pair_encoding_fn {
 
     // Edge case check.
     // Empirically found only two merge pair strings that were greater than 70 bytes
-    // and they both looked like ignorable errors. Double check this analysis with Vibhu.
+    // and they both looked like ignorable errors.
     if (thread_memory_size < total_size) { return d_map.end(); }
 
     // build the target string in shared memory