From 7f38f2113bff84fc425ce3a058891b7403c6aa2e Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 5 Sep 2024 14:58:22 -0400
Subject: [PATCH 01/25] Improve minhash performance by using more working
 memory

---
 cpp/benchmarks/text/minhash.cpp  | 11 +++---
 cpp/src/text/minhash.cu          | 61 +++++++++++++++++++-------------
 cpp/tests/text/minhash_tests.cpp | 50 ++++++++++++++++++++------
 3 files changed, 81 insertions(+), 41 deletions(-)

diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
index 31ce60d8f9a..86760528eef 100644
--- a/cpp/benchmarks/text/minhash.cpp
+++ b/cpp/benchmarks/text/minhash.cpp
@@ -43,12 +43,11 @@ static void bench_minhash(nvbench::state& state)
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input(strings_table->view().column(0));
 
-  data_profile const seeds_profile = data_profile_builder().null_probability(0).distribution(
+  data_profile const seeds_profile = data_profile_builder().no_validity().distribution(
     cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, row_width);
   auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
   auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
   auto seeds             = seeds_table->get_column(0);
-  seeds.set_null_mask(rmm::device_buffer{}, 0);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
@@ -64,8 +63,8 @@ static void bench_minhash(nvbench::state& state)
 
 NVBENCH_BENCH(bench_minhash)
   .set_name("minhash")
-  .add_int64_axis("num_rows", {1024, 8192, 16364, 131072})
-  .add_int64_axis("row_width", {128, 512, 2048})
-  .add_int64_axis("hash_width", {5, 10})
-  .add_int64_axis("seed_count", {2, 26})
+  .add_int64_axis("num_rows", {16364, 131072})
+  .add_int64_axis("row_width", {256, 512, 1024})
+  .add_int64_axis("hash_width", {5, 10, 20})
+  .add_int64_axis("seed_count", {2, 26, 260})
   .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 4318123627d..d2e1dfa58cb 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -56,6 +56,7 @@ namespace {
  * @param d_strings Strings column to process
  * @param seeds Seeds for hashing each string
  * @param width Substring window size in characters
+ * @param working_memory Memory used to hold intermediate hash values
  * @param d_hashes Minhash output values for each string
  */
 template <
@@ -65,29 +66,30 @@ template <
 CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
                                 cudf::device_span<hash_value_type const> seeds,
                                 cudf::size_type width,
+                                hash_value_type* working_memory,
                                 hash_value_type* d_hashes)
 {
-  auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
-  if (idx >= (static_cast<std::size_t>(d_strings.size()) *
-              static_cast<std::size_t>(cudf::detail::warp_size))) {
-    return;
-  }
-
-  auto const str_idx  = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
-  auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / cudf::detail::warp_size;
+  if (str_idx >= d_strings.size()) { return; }
 
   if (d_strings.is_null(str_idx)) { return; }
 
   auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
-  auto const d_output = d_hashes + (str_idx * seeds.size());
+  auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
+  auto const lane_idx = idx % cudf::detail::warp_size;
 
-  // initialize hashes output for this string
-  if (lane_idx == 0) {
-    auto const init = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
-    thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init);
+  auto warp_hashes = working_memory + (str_idx * cudf::detail::warp_size * seeds.size());
+
+  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size();
+       seed_idx += cudf::detail::warp_size) {
+    auto begin = warp_hashes + (seed_idx * cudf::detail::warp_size);
+    thrust::uninitialized_fill(thrust::seq, begin, begin + cudf::detail::warp_size, init);
   }
   __syncwarp();
 
+  auto const d_output = d_hashes + (str_idx * seeds.size());
+
   auto const begin = d_str.data() + lane_idx;
   auto const end   = d_str.data() + d_str.size_bytes();
 
@@ -100,24 +102,28 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
     if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
 
     auto const hash_str = cudf::string_view(itr, bytes);
-    // hashing with each seed on the same section of the string is 10x faster than
-    // computing the substrings for each seed
     for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
       auto const hasher = HashFunction(seeds[seed_idx]);
-      // hash substring and store the min value
+      hash_value_type hv;
       if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
-        auto const hvalue = hasher(hash_str);
-        cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
-        ref.fetch_min(hvalue, cuda::std::memory_order_relaxed);
+        hv = hasher(hash_str);
       } else {
-        // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values
-        // but only uses the first uint64 value as requested by the LLM team.
-        auto const hvalue = thrust::get<0>(hasher(hash_str));
-        cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
-        ref.fetch_min(hvalue, cuda::std::memory_order_relaxed);
+        hv = thrust::get<0>(hasher(hash_str));
       }
+      warp_hashes[(seed_idx * cudf::detail::warp_size) + lane_idx] =
+        min(hv, warp_hashes[(seed_idx * cudf::detail::warp_size) + lane_idx]);
     }
   }
+  __syncwarp();
+
+  // compute final result
+  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size();
+       seed_idx += cudf::detail::warp_size) {
+    auto begin = warp_hashes + (seed_idx * cudf::detail::warp_size);
+    auto hv =
+      thrust::reduce(thrust::seq, begin, begin + cudf::detail::warp_size, init, thrust::minimum{});
+    d_output[seed_idx] = hv;
+  }
 }
 
 template <
@@ -152,9 +158,14 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
   auto d_hashes = hashes->mutable_view().data<hash_value_type>();
 
   constexpr int block_size = 256;
+
+  auto const wm_size = cudf::util::round_up_safe(
+    seeds.size() * cudf::detail::warp_size * input.size(), static_cast<std::size_t>(block_size));
+  auto working_memory = rmm::device_uvector<hash_value_type>(wm_size, stream);
+
   cudf::detail::grid_1d grid{input.size() * cudf::detail::warp_size, block_size};
   minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    *d_strings, seeds, width, d_hashes);
+    *d_strings, seeds, width, working_memory.data(), d_hashes);
 
   return hashes;
 }
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 7575a3ba846..4abd45a62c9 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -42,15 +42,27 @@ TEST_F(MinHashTest, Basic)
                                         "",
                                         "doc 3",
                                         "d",
-                                        "The quick brown fox jumpéd over the lazy brown dog."},
+                                        "The quick brown fox jumpéd over the lazy brown dog.",
+                                        "line eight",
+                                        "line nine",
+                                        "line ten"},
                                        validity);
 
   auto view = cudf::strings_column_view(input);
 
   auto results = nvtext::minhash(view);
 
-  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(
-    {1207251914u, 0u, 21141582u, 0u, 1207251914u, 655955059u, 86520422u}, validity);
+  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>({1207251914u,
+                                                                    0u,
+                                                                    21141582u,
+                                                                    0u,
+                                                                    1207251914u,
+                                                                    655955059u,
+                                                                    86520422u,
+                                                                    304329233u,
+                                                                    640477688u,
+                                                                    640477688u},
+                                                                   validity);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
   auto results64  = nvtext::minhash64(view);
@@ -60,7 +72,10 @@ TEST_F(MinHashTest, Basic)
                                                                       0ul,
                                                                       13145552576991307582ul,
                                                                       14660046701545912182ul,
-                                                                      398062025280761388ul},
+                                                                      398062025280761388ul,
+                                                                      1273320923074904938ul,
+                                                                      3456065052701055601ul,
+                                                                      10664519708968191209ul},
                                                                      validity);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
@@ -82,7 +97,12 @@ TEST_F(MinHashTest, MultiSeed)
                                         "this is doc 2",
                                         "doc 3",
                                         "d",
-                                        "The quick brown fox jumpéd over the lazy brown dog."});
+                                        "The quick brown fox jumpéd over the lazy brown dog.",
+                                        "line six",
+                                        "line seven",
+                                        "line eight",
+                                        "line nine",
+                                        "line ten"});
 
   auto view = cudf::strings_column_view(input);
 
@@ -95,7 +115,12 @@ TEST_F(MinHashTest, MultiSeed)
                 LCW{  21141582u,  580916568u, 1258052021u},
                 LCW{1207251914u,  943567174u, 1109272887u},
                 LCW{ 655955059u,  488346356u, 2394664816u},
-                LCW{  86520422u,  236622901u,  102546228u}});
+                LCW{  86520422u,  236622901u,  102546228u},
+                LCW{ 640477688u,  198451716u,  136303992u},
+                LCW{ 640477688u,  198451716u,  577802054u},
+                LCW{ 304329233u,  198451716u,  714941560u},
+                LCW{ 640477688u,  198451716u,  261342259u},
+                LCW{ 640477688u,  198451716u,  139988887u}});
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
@@ -104,11 +129,16 @@ TEST_F(MinHashTest, MultiSeed)
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
-  LCW64 expected64({LCW64{  774489391575805754ul, 10435654231793485448ul, 1188598072697676120ul},
-                    LCW64{ 3232308021562742685ul,  4445611509348165860ul, 1188598072697676120ul},
-                    LCW64{13145552576991307582ul,  6846192680998069919ul, 1188598072697676120ul},
+  LCW64 expected64({LCW64{  774489391575805754ul, 10435654231793485448ul,  1188598072697676120ul},
+                    LCW64{ 3232308021562742685ul,  4445611509348165860ul,  1188598072697676120ul},
+                    LCW64{13145552576991307582ul,  6846192680998069919ul,  1188598072697676120ul},
                     LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul},
-                    LCW64{  398062025280761388ul,   377720198157450084ul,  984941365662009329ul}});
+                    LCW64{  398062025280761388ul,   377720198157450084ul,   984941365662009329ul},
+                    LCW64{ 2837259098848821044ul,   650799815433771163ul,  2428991957842356245ul},
+                    LCW64{ 2105419906076957667ul,   650799815433771163ul,  2428991957842356245ul},
+                    LCW64{ 1273320923074904938ul,   650799815433771163ul,  2428991957842356245ul},
+                    LCW64{ 3456065052701055601ul,   650799815433771163ul,  2428991957842356245ul},
+                    LCW64{10664519708968191209ul,   650799815433771163ul,  2428991957842356245ul}});
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }

From f35c16d3fa32d0b95d2952da1b019f22997971db Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 19 Sep 2024 19:45:31 -0400
Subject: [PATCH 02/25] change to block per string

---
 cpp/src/text/minhash.cu | 43 ++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index ce884de5257..613834fb34e 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -47,6 +47,9 @@ namespace nvtext {
 namespace detail {
 namespace {
 
+constexpr cudf::thread_index_type block_size = 256;
+constexpr cudf::thread_index_type tile_size  = block_size;  // cudf::detail::warp_size;
+
 /**
  * @brief Compute the minhash of each string for each seed
  *
@@ -72,21 +75,20 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
                                 hash_value_type* d_hashes)
 {
   auto const idx     = cudf::detail::grid_1d::global_thread_id();
-  auto const str_idx = idx / cudf::detail::warp_size;
+  auto const str_idx = idx / tile_size;
   if (str_idx >= d_strings.size()) { return; }
 
   if (d_strings.is_null(str_idx)) { return; }
 
   auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
   auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
-  auto const lane_idx = idx % cudf::detail::warp_size;
+  auto const lane_idx = idx % tile_size;
 
-  auto warp_hashes = working_memory + (str_idx * cudf::detail::warp_size * seeds.size());
+  auto warp_hashes = working_memory + (str_idx * tile_size * seeds.size());
 
-  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size();
-       seed_idx += cudf::detail::warp_size) {
-    auto begin = warp_hashes + (seed_idx * cudf::detail::warp_size);
-    thrust::uninitialized_fill(thrust::seq, begin, begin + cudf::detail::warp_size, init);
+  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) {
+    auto begin = warp_hashes + (seed_idx * tile_size);
+    thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init);
   }
   __syncwarp();
 
@@ -96,7 +98,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
   auto const end   = d_str.data() + d_str.size_bytes();
 
   // each lane hashes 'width' substrings of d_str
-  for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
+  for (auto itr = begin; itr < end; itr += tile_size) {
     if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
     auto const check_str =  // used for counting 'width' characters
       cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
@@ -112,18 +114,17 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
       } else {
         hv = thrust::get<0>(hasher(hash_str));
       }
-      warp_hashes[(seed_idx * cudf::detail::warp_size) + lane_idx] =
-        min(hv, warp_hashes[(seed_idx * cudf::detail::warp_size) + lane_idx]);
+      warp_hashes[(seed_idx * tile_size) + lane_idx] =
+        cuda::std::min(hv, warp_hashes[(seed_idx * tile_size) + lane_idx]);
     }
   }
-  __syncwarp();
+  //__syncwarp();
+  __syncthreads();
 
   // compute final result
-  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size();
-       seed_idx += cudf::detail::warp_size) {
-    auto begin = warp_hashes + (seed_idx * cudf::detail::warp_size);
-    auto hv =
-      thrust::reduce(thrust::seq, begin, begin + cudf::detail::warp_size, init, thrust::minimum{});
+  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) {
+    auto begin = warp_hashes + (seed_idx * tile_size);
+    auto hv    = thrust::reduce(thrust::seq, begin, begin + tile_size, init, thrust::minimum{});
     d_output[seed_idx] = hv;
   }
 }
@@ -159,14 +160,12 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                           mr);
   auto d_hashes = hashes->mutable_view().data<hash_value_type>();
 
-  constexpr cudf::thread_index_type block_size = 256;
-
-  auto const wm_size = cudf::util::round_up_safe(
-    seeds.size() * cudf::detail::warp_size * input.size(), static_cast<std::size_t>(block_size));
+  auto const wm_size  = cudf::util::round_up_safe(seeds.size() * tile_size * input.size(),
+                                                 static_cast<std::size_t>(block_size));
   auto working_memory = rmm::device_uvector<hash_value_type>(wm_size, stream);
 
-  cudf::detail::grid_1d grid{
-    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
+  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
+                             block_size};
   minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
     *d_strings, seeds, width, working_memory.data(), d_hashes);
 

From 01500dd55bfaf0b9687517cb7d8040cf7fd179cf Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 19 Sep 2024 20:05:11 -0400
Subject: [PATCH 03/25] fix sync call

---
 cpp/src/text/minhash.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 613834fb34e..216ca511410 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -90,7 +90,8 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
     auto begin = warp_hashes + (seed_idx * tile_size);
     thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init);
   }
-  __syncwarp();
+  //__syncwarp();
+  __syncthreads();
 
   auto const d_output = d_hashes + (str_idx * seeds.size());
 

From 70948b9c50ffa9b8cfa8ed53913c924882fc5d9a Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 1 Oct 2024 20:08:40 -0400
Subject: [PATCH 04/25] fix benchmark ranges

---
 cpp/benchmarks/text/minhash.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
index 86760528eef..1ba4d51b4af 100644
--- a/cpp/benchmarks/text/minhash.cpp
+++ b/cpp/benchmarks/text/minhash.cpp
@@ -32,9 +32,9 @@ static void bench_minhash(nvbench::state& state)
   auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
   auto const base64     = state.get_int64("hash_type") == 64;
 
-  if (static_cast<std::size_t>(num_rows) * static_cast<std::size_t>(row_width) >=
+  if ((num_rows * seed_count * (base64 ? sizeof(int64_t) : sizeof(int32_t)) * 32L) >=
       static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks greater than size_type limit");
+    state.skip("Skip benchmarks requiring more than 2GB working memory");
   }
 
   data_profile const strings_profile = data_profile_builder().distribution(
@@ -63,7 +63,7 @@ static void bench_minhash(nvbench::state& state)
 
 NVBENCH_BENCH(bench_minhash)
   .set_name("minhash")
-  .add_int64_axis("num_rows", {16364, 131072})
+  .add_int64_axis("num_rows", {16364, 65456})
   .add_int64_axis("row_width", {256, 512, 1024})
   .add_int64_axis("hash_width", {5, 10, 20})
   .add_int64_axis("seed_count", {2, 26, 260})

From ef3b22867f3e63ea8bf41fa86a9010d8274d990f Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 2 Oct 2024 16:04:27 -0400
Subject: [PATCH 05/25] minor fixes

---
 cpp/src/text/minhash.cu | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 216ca511410..54d92cb68d0 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -53,8 +53,8 @@ constexpr cudf::thread_index_type tile_size  = block_size;  // cudf::detail::war
 /**
  * @brief Compute the minhash of each string for each seed
  *
- * This is a warp-per-string algorithm where parallel threads within a warp
- * work on substrings of a single string row.
+ * This is a block-per-string algorithm where parallel threads within a block
+ * work on a single string row.
  *
  * @tparam HashFunction hash function to use on each substring
  *
@@ -77,20 +77,18 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
   auto const idx     = cudf::detail::grid_1d::global_thread_id();
   auto const str_idx = idx / tile_size;
   if (str_idx >= d_strings.size()) { return; }
-
   if (d_strings.is_null(str_idx)) { return; }
 
   auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
   auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
   auto const lane_idx = idx % tile_size;
 
-  auto warp_hashes = working_memory + (str_idx * tile_size * seeds.size());
+  auto tile_hashes = working_memory + (str_idx * tile_size * seeds.size());
 
   for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) {
-    auto begin = warp_hashes + (seed_idx * tile_size);
+    auto begin = tile_hashes + (seed_idx * tile_size);
     thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init);
   }
-  //__syncwarp();
   __syncthreads();
 
   auto const d_output = d_hashes + (str_idx * seeds.size());
@@ -115,16 +113,15 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
       } else {
         hv = thrust::get<0>(hasher(hash_str));
       }
-      warp_hashes[(seed_idx * tile_size) + lane_idx] =
-        cuda::std::min(hv, warp_hashes[(seed_idx * tile_size) + lane_idx]);
+      tile_hashes[(seed_idx * tile_size) + lane_idx] =
+        cuda::std::min(hv, tile_hashes[(seed_idx * tile_size) + lane_idx]);
     }
   }
-  //__syncwarp();
   __syncthreads();
 
   // compute final result
   for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) {
-    auto begin = warp_hashes + (seed_idx * tile_size);
+    auto begin = tile_hashes + (seed_idx * tile_size);
     auto hv    = thrust::reduce(thrust::seq, begin, begin + tile_size, init, thrust::minimum{});
     d_output[seed_idx] = hv;
   }

From 03d570de78abd400a4763eaa1753b391cca557ec Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 3 Oct 2024 11:11:40 -0400
Subject: [PATCH 06/25] minor cleanups

---
 cpp/src/text/minhash.cu | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 54d92cb68d0..f89b0bb3a17 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -48,7 +48,8 @@ namespace detail {
 namespace {
 
 constexpr cudf::thread_index_type block_size = 256;
-constexpr cudf::thread_index_type tile_size  = block_size;  // cudf::detail::warp_size;
+// for tuning independently from block_size
+constexpr cudf::thread_index_type tile_size = block_size;
 
 /**
  * @brief Compute the minhash of each string for each seed
@@ -85,6 +86,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
 
   auto tile_hashes = working_memory + (str_idx * tile_size * seeds.size());
 
+  // initialize working memory
   for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) {
     auto begin = tile_hashes + (seed_idx * tile_size);
     thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init);

From f74716349d07efac55d4c973e65c93fb9f3bb7ee Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 9 Oct 2024 21:36:53 -0400
Subject: [PATCH 07/25] match benchmark to curator parameters

---
 cpp/benchmarks/text/minhash.cpp | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
index 1ba4d51b4af..a0f9196d05f 100644
--- a/cpp/benchmarks/text/minhash.cpp
+++ b/cpp/benchmarks/text/minhash.cpp
@@ -63,8 +63,8 @@ static void bench_minhash(nvbench::state& state)
 
 NVBENCH_BENCH(bench_minhash)
   .set_name("minhash")
-  .add_int64_axis("num_rows", {16364, 65456})
-  .add_int64_axis("row_width", {256, 512, 1024})
-  .add_int64_axis("hash_width", {5, 10, 20})
-  .add_int64_axis("seed_count", {2, 26, 260})
+  .add_int64_axis("num_rows", {15000, 30000, 60000})
+  .add_int64_axis("row_width", {6000, 28000, 50000})
+  .add_int64_axis("hash_width", {12, 24})
+  .add_int64_axis("seed_count", {26, 260})
   .add_int64_axis("hash_type", {32, 64});

From aa6f3e047e5641fd7ad08cf5628a5e1b29372187 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 10 Oct 2024 17:03:05 -0400
Subject: [PATCH 08/25] add minhash_permuted API

---
 cpp/benchmarks/text/minhash.cpp               |   4 +-
 cpp/include/nvtext/minhash.hpp                |  56 ++++++
 cpp/src/text/minhash.cu                       | 169 ++++++++++++++++++
 cpp/tests/text/minhash_tests.cpp              |  58 ++++++
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |  46 +++++
 python/cudf/cudf/_lib/strings/__init__.py     |   2 +
 python/cudf/cudf/core/column/string.py        |  38 ++++
 .../pylibcudf/libcudf/nvtext/minhash.pxd      |  14 ++
 8 files changed, 385 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
index a0f9196d05f..06aeff3e644 100644
--- a/cpp/benchmarks/text/minhash.cpp
+++ b/cpp/benchmarks/text/minhash.cpp
@@ -56,8 +56,8 @@ static void bench_minhash(nvbench::state& state)
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width)
-                         : nvtext::minhash(input, seeds.view(), hash_width);
+    auto result = base64 ? nvtext::minhash64_permuted(input, seeds.view(), seeds.view(), hash_width)
+                         : nvtext::minhash_permuted(input, seeds.view(), seeds.view(), hash_width);
   });
 }
 
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 7c909f1a948..f06323e3f5d 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -90,6 +90,34 @@ std::unique_ptr<cudf::column> minhash(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parmA is empty
+ * @throw std::invalid_argument if `parmB.size() != parmA.size()`
+ * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param parmA Values used for the hash algorithm
+ * @param parmB Values used for the hash algorithm
+ * @param width The character width used for apply substrings
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> minhash_permuted(
+  cudf::strings_column_view const& input,
+  cudf::device_span<uint32_t const> parmA,
+  cudf::device_span<uint32_t const> parmB,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash value for each string
  *
@@ -151,6 +179,34 @@ std::unique_ptr<cudf::column> minhash64(
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
 
+/**
+ * @brief Returns the minhash values for each string
+ *
+ * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ *
+ * Any null row entries result in corresponding null output rows.
+ *
+ * @throw std::invalid_argument if the width < 2
+ * @throw std::invalid_argument if parmA is empty
+ * @throw std::invalid_argument if `parmB.size() != parmA.size()`
+ * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit
+ *
+ * @param input Strings column to compute minhash
+ * @param parmA Values used for the hash algorithm
+ * @param parmB Values used for the hash algorithm
+ * @param width The character width used for apply substrings
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return List column of minhash values for each string per seed
+ */
+std::unique_ptr<cudf::column> minhash64_permuted(
+  cudf::strings_column_view const& input,
+  cudf::device_span<uint64_t const> parmA,
+  cudf::device_span<uint64_t const> parmB,
+  cudf::size_type width,
+  rmm::cuda_stream_view stream      = cudf::get_default_stream(),
+  rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
+
 /**
  * @brief Returns the minhash values for each row of strings per seed
  *
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index f89b0bb3a17..1aa6bbb5613 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -129,6 +129,82 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
   }
 }
 
+template <
+  typename HashFunction,
+  typename hash_value_type = std::
+    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
+CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
+                                         cudf::device_span<hash_value_type const> parmA,
+                                         cudf::device_span<hash_value_type const> parmB,
+                                         cudf::size_type width,
+                                         hash_value_type* d_hashes)
+{
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / tile_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
+
+  auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
+  auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
+  auto const lane_idx = idx % tile_size;
+
+  auto const d_output = d_hashes + (str_idx * parmA.size());
+
+  auto const begin = d_str.data() + (lane_idx);
+  auto const end   = d_str.data() + d_str.size_bytes();
+
+  constexpr std::size_t seed_chunk   = 16;  // based on block-size==256
+  constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
+  constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
+
+  extern __shared__ char shmem[];
+  auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
+
+  for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) {
+    // initialize working memory
+    auto const tile_hashes = block_hashes + (lane_idx * seed_chunk);
+    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seed_chunk, init);
+    __syncthreads();
+
+    auto const seed_count = cuda::std::min(seed_chunk, parmA.size() - i);
+
+    // each lane hashes 'width' substrings of d_str
+    for (auto itr = begin; itr < end; itr += tile_size) {
+      if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
+      auto const check_str =  // used for counting 'width' characters
+        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+      auto const [bytes, left] =
+        cudf::strings::detail::bytes_to_character_position(check_str, width);
+      if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
+
+      auto const hash_str = cudf::string_view(itr, bytes);
+      auto const hasher   = HashFunction(parmA[0]);
+      hash_value_type hv1;
+      if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
+        hv1 = hasher(hash_str);
+      } else {
+        hv1 = thrust::get<0>(hasher(hash_str));
+      }
+
+      for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) {
+        hash_value_type const hv =
+          seed_idx == 0 ? hv1
+                        : ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
+        auto const block_idx    = ((seed_idx % seed_chunk) * tile_size) + lane_idx;
+        block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]);
+      }
+    }
+    __syncthreads();
+
+    if (lane_idx < seed_count) {
+      auto const hvs = block_hashes + (lane_idx * tile_size);
+      auto const hv  = thrust::reduce(thrust::seq, hvs, hvs + tile_size, init, thrust::minimum{});
+      d_output[lane_idx + i] = hv;
+    }
+    __syncthreads();
+  }
+}
+
 template <
   typename HashFunction,
   typename hash_value_type = std::
@@ -172,6 +248,53 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
   return hashes;
 }
 
+template <
+  typename HashFunction,
+  typename hash_value_type = std::
+    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
+std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
+                                         cudf::device_span<hash_value_type const> parmA,
+                                         cudf::device_span<hash_value_type const> parmB,
+                                         cudf::size_type width,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(!parmA.empty(), "Parameters A and B cannot be empty", std::invalid_argument);
+  CUDF_EXPECTS(width >= 2,
+               "Parameter width should be an integer value of 2 or greater",
+               std::invalid_argument);
+  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * parmA.size()) <
+                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+               "The number of seeds times the number of input rows exceeds the column size limit",
+               std::overflow_error);
+  CUDF_EXPECTS(parmA.size() == parmB.size(),
+               "Parameters A and B should have the same number of elements",
+               std::invalid_argument);
+
+  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
+  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
+
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  auto hashes   = cudf::make_numeric_column(output_type,
+                                          input.size() * static_cast<cudf::size_type>(parmA.size()),
+                                          cudf::mask_state::UNALLOCATED,
+                                          stream,
+                                          mr);
+  auto d_hashes = hashes->mutable_view().data<hash_value_type>();
+
+  // 16 seeds can be held in shared-memory: 32K/block_size(256)/sizeof(hash_value_type) = ~16
+  auto const shmem_size = block_size * 16 * sizeof(hash_value_type);
+
+  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
+                             block_size};
+  minhash_permuted_kernel<HashFunction>
+    <<<grid.num_blocks, grid.num_threads_per_block, shmem_size, stream.value()>>>(
+      *d_strings, parmA, parmB, width, d_hashes);
+
+  return hashes;
+}
+
 /**
  * @brief Compute the minhash of each list row of strings for each seed
  *
@@ -319,6 +442,18 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
+                                      cudf::device_span<uint32_t const> parmA,
+                                      cudf::device_span<uint32_t const> parmB,
+                                      cudf::size_type width,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
+  auto hashes        = detail::minhash_fn<HashFunction>(input, parmA, parmB, width, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr);
+}
+
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> const& seed,
                                         cudf::size_type width,
@@ -343,6 +478,18 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return build_list_result(input.parent(), std::move(hashes), seeds.size(), stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
+                                        cudf::device_span<uint64_t const> parmA,
+                                        cudf::device_span<uint64_t const> parmB,
+                                        cudf::size_type width,
+                                        rmm::cuda_stream_view stream,
+                                        rmm::device_async_resource_ref mr)
+{
+  using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
+  auto hashes        = detail::minhash_fn<HashFunction>(input, parmA, parmB, width, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr);
+}
+
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
                                            cudf::device_span<uint32_t const> seeds,
                                            rmm::cuda_stream_view stream,
@@ -384,6 +531,17 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
   return detail::minhash(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash_permuted(cudf::strings_column_view const& input,
+                                               cudf::device_span<uint32_t const> parmA,
+                                               cudf::device_span<uint32_t const> parmB,
+                                               cudf::size_type width,
+                                               rmm::cuda_stream_view stream,
+                                               rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::minhash(input, parmA, parmB, width, stream, mr);
+}
+
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         cudf::numeric_scalar<uint64_t> seed,
                                         cudf::size_type width,
@@ -404,6 +562,17 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
   return detail::minhash64(input, seeds, width, stream, mr);
 }
 
+std::unique_ptr<cudf::column> minhash64_permuted(cudf::strings_column_view const& input,
+                                                 cudf::device_span<uint64_t const> parmA,
+                                                 cudf::device_span<uint64_t const> parmB,
+                                                 cudf::size_type width,
+                                                 rmm::cuda_stream_view stream,
+                                                 rmm::device_async_resource_ref mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::minhash64(input, parmA, parmB, width, stream, mr);
+}
+
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
                                            cudf::device_span<uint32_t const> seeds,
                                            rmm::cuda_stream_view stream,
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index c568c162e81..3f6e6b6d666 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -169,6 +169,64 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
+TEST_F(MinHashTest, PermutedMultiSeed)
+{
+  auto input =
+    cudf::test::strings_column_wrapper({"doc 1",
+                                        "this is doc 2",
+                                        "doc 3",
+                                        "d",
+                                        "The quick brown fox jumpéd over the lazy brown dog.",
+                                        "line six",
+                                        "line seven",
+                                        "line eight",
+                                        "line nine",
+                                        "line ten"});
+
+  auto view = cudf::strings_column_view(input);
+
+  auto zero  = thrust::counting_iterator<uint32_t>(0);
+  auto seeds = cudf::test::fixed_width_column_wrapper<uint32_t>(zero, zero + 3);
+  auto results =
+    nvtext::minhash_permuted(view, cudf::column_view(seeds), cudf::column_view(seeds), 4);
+
+  using LCW = cudf::test::lists_column_wrapper<uint32_t>;
+  // clang-format off
+  LCW expected({LCW{1207251914u, 1207251915u, 2414503830u},
+                LCW{  21141582u,   21141583u,   42283166u},
+                LCW{1207251914u, 1207251915u, 2414503830u},
+                LCW{ 655955059u,  655955060u, 1311910120u},
+                LCW{  86520422u,   86520423u,  173040846u},
+                LCW{ 640477688u,  640477689u, 1280955378u},
+                LCW{ 640477688u,  640477689u, 1119757432u},
+                LCW{ 304329233u,  304329234u,  608658468u},
+                LCW{ 640477688u,  640477689u, 1253170324u},
+                LCW{ 640477688u,  640477689u,  109416974u}});
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  auto seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t>({0, 1, 2});
+  auto results64 =
+    nvtext::minhash64_permuted(view, cudf::column_view(seeds64), cudf::column_view(seeds64), 4);
+
+  using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
+  // clang-format off
+  LCW64 expected64({
+    LCW64{  774489391575805754ul,  774489391575805755ul, 1355379376230368507ul},
+    LCW64{ 3232308021562742685ul,  536921680321302593ul,    3662418748204373ul},
+    LCW64{13145552576991307582ul, 1616337530922837828ul,  926832052631981697ul},
+    LCW64{14660046701545912182ul,  824988646263748477ul, 1649977292527496946ul},
+    LCW64{  398062025280761388ul,   44496840736841087ul,   88993681473682166ul},
+    LCW64{ 2837259098848821044ul,  531416089635127094ul, 1062832179270254188ul},
+    LCW64{ 2105419906076957667ul,  291144650667480443ul,  582289301334960878ul},
+    LCW64{ 1273320923074904938ul,  584582941553763022ul,   77252561447143819ul},
+    LCW64{ 3456065052701055601ul,  584582941553763022ul, 1169165883107526036ul},
+    LCW64{10664519708968191209ul,  304692340605188099ul,  152114268187701233ul}
+  });
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
+}
+
 TEST_F(MinHashTest, WordsMinHash)
 {
   using LCWS    = cudf::test::lists_column_wrapper<cudf::string_view>;
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index 59cb8d51440..3775548ff42 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -10,6 +10,8 @@ from pylibcudf.libcudf.column.column_view cimport column_view
 from pylibcudf.libcudf.nvtext.minhash cimport (
     minhash as cpp_minhash,
     minhash64 as cpp_minhash64,
+    minhash64_permuted as cpp_minhash64_permuted,
+    minhash_permuted as cpp_minhash_permuted,
     word_minhash as cpp_word_minhash,
     word_minhash64 as cpp_word_minhash64,
 )
@@ -38,6 +40,28 @@ def minhash(Column strings, Column seeds, int width):
     return Column.from_unique_ptr(move(c_result))
 
 
+@acquire_spill_lock()
+def minhash_permuted(Column strings, Column a, Column b, int width):
+
+    cdef column_view c_strings = strings.view()
+    cdef size_type c_width = width
+    cdef column_view c_a = a.view()
+    cdef column_view c_b = b.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_minhash_permuted(
+                c_strings,
+                c_a,
+                c_b,
+                c_width
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
 @acquire_spill_lock()
 def minhash64(Column strings, Column seeds, int width):
 
@@ -58,6 +82,28 @@ def minhash64(Column strings, Column seeds, int width):
     return Column.from_unique_ptr(move(c_result))
 
 
+@acquire_spill_lock()
+def minhash64_permuted(Column strings, Column a, Column b, int width):
+
+    cdef column_view c_strings = strings.view()
+    cdef size_type c_width = width
+    cdef column_view c_a = a.view()
+    cdef column_view c_b = b.view()
+    cdef unique_ptr[column] c_result
+
+    with nogil:
+        c_result = move(
+            cpp_minhash64_permuted(
+                c_strings,
+                c_a,
+                c_b,
+                c_width
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
+
+
 @acquire_spill_lock()
 def word_minhash(Column input, Column seeds):
 
diff --git a/python/cudf/cudf/_lib/strings/__init__.py b/python/cudf/cudf/_lib/strings/__init__.py
index e712937f816..1aafc7df066 100644
--- a/python/cudf/cudf/_lib/strings/__init__.py
+++ b/python/cudf/cudf/_lib/strings/__init__.py
@@ -9,6 +9,8 @@
 from cudf._lib.nvtext.minhash import (
     minhash,
     minhash64,
+    minhash64_permuted,
+    minhash_permuted,
     word_minhash,
     word_minhash64,
 )
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index b50e23bd52e..616ff19cc53 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5349,6 +5349,25 @@ def minhash(
             libstrings.minhash(self._column, seeds_column, width)
         )
 
+    def minhash_permuted(
+        self, a: ColumnLike, b: ColumnLike, width: int = 4
+    ) -> SeriesOrIndex:
+        a_column = column.as_column(a)
+        if a_column.dtype != np.uint32:
+            raise ValueError(
+                f"Expecting a Series with dtype uint32, got {type(a)}"
+            )
+        b_column = column.as_column(b)
+        if b_column.dtype != np.uint32:
+            raise ValueError(
+                f"Expecting a Series with dtype uint32, got {type(b)}"
+            )
+        return self._return_or_inplace(
+            libstrings.minhash_permuted(
+                self._column, a_column, b_column, width
+            )
+        )
+
     def minhash64(
         self, seeds: ColumnLike | None = None, width: int = 4
     ) -> SeriesOrIndex:
@@ -5389,6 +5408,25 @@ def minhash64(
             libstrings.minhash64(self._column, seeds_column, width)
         )
 
+    def minhash64_permuted(
+        self, a: ColumnLike, b: ColumnLike, width: int = 4
+    ) -> SeriesOrIndex:
+        a_column = column.as_column(a)
+        if a_column.dtype != np.uint64:
+            raise ValueError(
+                f"Expecting a Series with dtype uint64, got {type(a)}"
+            )
+        b_column = column.as_column(b)
+        if b_column.dtype != np.uint64:
+            raise ValueError(
+                f"Expecting a Series with dtype uint64, got {type(b)}"
+            )
+        return self._return_or_inplace(
+            libstrings.minhash64_permuted(
+                self._column, a_column, b_column, width
+            )
+        )
+
     def word_minhash(self, seeds: ColumnLike | None = None) -> SeriesOrIndex:
         """
         Compute the minhash of a list column of strings.
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index f2dd22f43aa..54d97e0559f 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -14,12 +14,26 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
         const size_type width,
     ) except +
 
+    cdef unique_ptr[column] minhash_permuted(
+        const column_view &strings,
+        const column_view &a,
+        const column_view &b,
+        const size_type width,
+    ) except +
+
     cdef unique_ptr[column] minhash64(
         const column_view &strings,
         const column_view &seeds,
         const size_type width,
     ) except +
 
+    cdef unique_ptr[column] minhash64_permuted(
+        const column_view &strings,
+        const column_view &a,
+        const column_view &b,
+        const size_type width,
+    ) except +
+
     cdef unique_ptr[column] word_minhash(
         const column_view &input,
         const column_view &seeds

From 186477ec6a2640bcc3ec15f2053ca19063441e07 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 11 Oct 2024 16:10:01 -0400
Subject: [PATCH 09/25] revert benchmark API call

---
 cpp/benchmarks/text/minhash.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
index 06aeff3e644..a0f9196d05f 100644
--- a/cpp/benchmarks/text/minhash.cpp
+++ b/cpp/benchmarks/text/minhash.cpp
@@ -56,8 +56,8 @@ static void bench_minhash(nvbench::state& state)
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64 ? nvtext::minhash64_permuted(input, seeds.view(), seeds.view(), hash_width)
-                         : nvtext::minhash_permuted(input, seeds.view(), seeds.view(), hash_width);
+    auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width)
+                         : nvtext::minhash(input, seeds.view(), hash_width);
   });
 }
 

From a7583d0088b79f03d81b532b49bb45443b6026ea Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 14 Oct 2024 15:20:33 -0400
Subject: [PATCH 10/25] experimental single-hash permutation

---
 cpp/include/nvtext/minhash.hpp                |   4 +
 cpp/src/text/minhash.cu                       | 234 ++++++++++++++----
 cpp/tests/text/minhash_tests.cpp              |  52 ++--
 python/cudf/cudf/_lib/nvtext/minhash.pyx      |   8 +-
 python/cudf/cudf/core/column/string.py        |   8 +-
 .../pylibcudf/libcudf/nvtext/minhash.pxd      |   2 +
 python/pylibcudf/pylibcudf/nvtext/minhash.pxd |  16 +-
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx |  24 +-
 8 files changed, 258 insertions(+), 90 deletions(-)

diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index f06323e3f5d..7ebd945c66d 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -103,6 +103,7 @@ std::unique_ptr<cudf::column> minhash(
  * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
+ * @param seed  Seed value used for the hash algorithm
  * @param parmA Values used for the hash algorithm
  * @param parmB Values used for the hash algorithm
  * @param width The character width used for apply substrings
@@ -112,6 +113,7 @@ std::unique_ptr<cudf::column> minhash(
  */
 std::unique_ptr<cudf::column> minhash_permuted(
   cudf::strings_column_view const& input,
+  uint32_t seed,
   cudf::device_span<uint32_t const> parmA,
   cudf::device_span<uint32_t const> parmB,
   cudf::size_type width,
@@ -192,6 +194,7 @@ std::unique_ptr<cudf::column> minhash64(
  * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
+ * @param seed  Seed value used for the hash algorithm
  * @param parmA Values used for the hash algorithm
  * @param parmB Values used for the hash algorithm
  * @param width The character width used for apply substrings
@@ -201,6 +204,7 @@ std::unique_ptr<cudf::column> minhash64(
  */
 std::unique_ptr<cudf::column> minhash64_permuted(
   cudf::strings_column_view const& input,
+  uint64_t seed,
   cudf::device_span<uint64_t const> parmA,
   cudf::device_span<uint64_t const> parmB,
   cudf::size_type width,
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 1aa6bbb5613..e067758aed9 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -65,10 +65,7 @@ constexpr cudf::thread_index_type tile_size = block_size;
  * @param working_memory Memory used to hold intermediate hash values
  * @param d_hashes Minhash output values for each string
  */
-template <
-  typename HashFunction,
-  typename hash_value_type = std::
-    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
+template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
                                 cudf::device_span<hash_value_type const> seeds,
                                 cudf::size_type width,
@@ -129,11 +126,49 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
   }
 }
 
-template <
-  typename HashFunction,
-  typename hash_value_type = std::
-    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
+template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
+std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
+                                         cudf::device_span<hash_value_type const> seeds,
+                                         cudf::size_type width,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::device_async_resource_ref mr)
+{
+  CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument);
+  CUDF_EXPECTS(width >= 2,
+               "Parameter width should be an integer value of 2 or greater",
+               std::invalid_argument);
+  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * seeds.size()) <
+                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+               "The number of seeds times the number of input rows exceeds the column size limit",
+               std::overflow_error);
+
+  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
+  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
+
+  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+
+  auto hashes   = cudf::make_numeric_column(output_type,
+                                          input.size() * static_cast<cudf::size_type>(seeds.size()),
+                                          cudf::mask_state::UNALLOCATED,
+                                          stream,
+                                          mr);
+  auto d_hashes = hashes->mutable_view().data<hash_value_type>();
+
+  auto const wm_size  = cudf::util::round_up_safe(seeds.size() * tile_size * input.size(),
+                                                 static_cast<std::size_t>(block_size));
+  auto working_memory = rmm::device_uvector<hash_value_type>(wm_size, stream);
+
+  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
+                             block_size};
+  minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+    *d_strings, seeds, width, working_memory.data(), d_hashes);
+
+  return hashes;
+}
+
+template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
+                                         hash_value_type seed,
                                          cudf::device_span<hash_value_type const> parmA,
                                          cudf::device_span<hash_value_type const> parmB,
                                          cudf::size_type width,
@@ -160,6 +195,8 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   extern __shared__ char shmem[];
   auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
 
+  auto const hasher = HashFunction(seed);
+
   for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) {
     // initialize working memory
     auto const tile_hashes = block_hashes + (lane_idx * seed_chunk);
@@ -178,7 +215,6 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
       if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
 
       auto const hash_str = cudf::string_view(itr, bytes);
-      auto const hasher   = HashFunction(parmA[0]);
       hash_value_type hv1;
       if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
         hv1 = hasher(hash_str);
@@ -187,9 +223,8 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
       }
 
       for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) {
-        hash_value_type const hv =
-          seed_idx == 0 ? hv1
-                        : ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
+        hash_value_type const hv =  // seed_idx ==  0 ? hv1 :
+          ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
         auto const block_idx    = ((seed_idx % seed_chunk) * tile_size) + lane_idx;
         block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]);
       }
@@ -205,54 +240,127 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   }
 }
 
-template <
-  typename HashFunction,
-  typename hash_value_type = std::
-    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
-std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
-                                         cudf::device_span<hash_value_type const> seeds,
+template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
+CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
+                                     hash_value_type seed,
+                                     cudf::size_type width,
+                                     hash_value_type* working_memory)
+{
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / tile_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
+
+  auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr =
+    cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
+  auto const offset0    = offsets_itr[0];  // pass this in
+  auto const offset     = offsets_itr[str_idx];
+  auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+  if (size_bytes == 0) { return; }
+  auto const seed_hashes = working_memory + offset - offset0;  // hashes_offset
+
+  auto const d_str    = cudf::string_view(d_strings.head<char>() + offset, size_bytes);
+  auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
+  auto const lane_idx = idx % tile_size;
+
+  auto const begin  = d_str.data() + (lane_idx);
+  auto const end    = d_str.data() + d_str.size_bytes();
+  auto const hasher = HashFunction(seed);
+
+  auto d_output = seed_hashes + lane_idx;
+  for (auto itr = begin; itr < end; itr += tile_size, d_output += tile_size) {
+    if (cudf::strings::detail::is_utf8_continuation_char(*itr)) {
+      *d_output = 0;
+      continue;
+    }
+    auto const check_str =  // used for counting 'width' characters
+      cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
+    auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width);
+    if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
+
+    auto const hash_str = cudf::string_view(itr, bytes);
+    hash_value_type hv;
+    if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
+      hv = hasher(hash_str);
+    } else {
+      hv = thrust::get<0>(hasher(hash_str));
+    }
+    *d_output = hv;
+  }
+}
+
+template <typename hash_value_type>
+CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
+                                         cudf::device_span<hash_value_type const> parmA,
+                                         cudf::device_span<hash_value_type const> parmB,
                                          cudf::size_type width,
-                                         rmm::cuda_stream_view stream,
-                                         rmm::device_async_resource_ref mr)
+                                         hash_value_type* working_memory,
+                                         hash_value_type* d_hashes)
 {
-  CUDF_EXPECTS(!seeds.empty(), "Parameter seeds cannot be empty", std::invalid_argument);
-  CUDF_EXPECTS(width >= 2,
-               "Parameter width should be an integer value of 2 or greater",
-               std::invalid_argument);
-  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * seeds.size()) <
-                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-               "The number of seeds times the number of input rows exceeds the column size limit",
-               std::overflow_error);
+  auto const idx     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = idx / tile_size;
+  if (str_idx >= d_strings.size()) { return; }
+  if (d_strings.is_null(str_idx)) { return; }
 
-  auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
-  if (input.is_empty()) { return cudf::make_empty_column(output_type); }
+  auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
+  auto const offsets_itr =
+    cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
+  auto const offset0     = offsets_itr[0];  // pass this in
+  auto const offset      = offsets_itr[str_idx];
+  auto const size_bytes  = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+  auto const seed_hashes = working_memory + offset - offset0;  // hashes_offset;
+  auto const hashes_size =
+    cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), size_bytes - width + 1);
+
+  auto const init     = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
+  auto const lane_idx = idx % tile_size;
+  auto const d_output = d_hashes + (str_idx * parmA.size());
 
-  auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
+  auto const begin = seed_hashes + (lane_idx);
+  auto const end   = seed_hashes + hashes_size;
 
-  auto hashes   = cudf::make_numeric_column(output_type,
-                                          input.size() * static_cast<cudf::size_type>(seeds.size()),
-                                          cudf::mask_state::UNALLOCATED,
-                                          stream,
-                                          mr);
-  auto d_hashes = hashes->mutable_view().data<hash_value_type>();
+  constexpr std::size_t seed_chunk   = 16;  // based on block-size==256
+  constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
+  constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
-  auto const wm_size  = cudf::util::round_up_safe(seeds.size() * tile_size * input.size(),
-                                                 static_cast<std::size_t>(block_size));
-  auto working_memory = rmm::device_uvector<hash_value_type>(wm_size, stream);
+  extern __shared__ char shmem[];
+  auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
 
-  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
-                             block_size};
-  minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    *d_strings, seeds, width, working_memory.data(), d_hashes);
+  for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) {
+    // initialize working memory
+    auto const tile_hashes = block_hashes + (lane_idx * seed_chunk);
+    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seed_chunk, init);
+    __syncthreads();
 
-  return hashes;
+    auto const seed_count = cuda::std::min(seed_chunk, parmA.size() - i);
+
+    // each lane hashes 'width' substrings of d_str
+    for (auto itr = begin; itr < end; itr += tile_size) {
+      auto const hv1 = *itr;
+      if (hv1 == 0) { continue; }  // skips intermediate UTF-8 bytes
+
+      for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) {
+        hash_value_type const hv =  // seed_idx == 0 ? hv1 :
+          ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
+        auto const block_idx    = ((seed_idx % seed_chunk) * tile_size) + lane_idx;
+        block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]);
+      }
+    }
+    __syncthreads();
+
+    if (lane_idx < seed_count) {
+      auto const hvs = block_hashes + (lane_idx * tile_size);
+      auto const hv  = thrust::reduce(thrust::seq, hvs, hvs + tile_size, init, thrust::minimum{});
+      d_output[lane_idx + i] = hv;
+    }
+    __syncthreads();
+  }
 }
 
-template <
-  typename HashFunction,
-  typename hash_value_type = std::
-    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
+template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
+                                         hash_value_type seed,
                                          cudf::device_span<hash_value_type const> parmA,
                                          cudf::device_span<hash_value_type const> parmB,
                                          cudf::size_type width,
@@ -288,9 +396,23 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
 
   cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
                              block_size};
+#if 1
   minhash_permuted_kernel<HashFunction>
     <<<grid.num_blocks, grid.num_threads_per_block, shmem_size, stream.value()>>>(
-      *d_strings, parmA, parmB, width, d_hashes);
+      *d_strings, seed, parmA, parmB, width, d_hashes);
+
+#else
+  auto const wm_size  = input.chars_size(stream);
+  auto working_memory = rmm::device_uvector<hash_value_type>(wm_size, stream);
+
+  minhash_seed_kernel<HashFunction>
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+      *d_strings, seed, width, working_memory.data());
+
+  minhash_permuted_kernel<hash_value_type>
+    <<<grid.num_blocks, grid.num_threads_per_block, shmem_size, stream.value()>>>(
+      *d_strings, parmA, parmB, width, working_memory.data(), d_hashes);
+#endif
 
   return hashes;
 }
@@ -443,6 +565,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
 }
 
 std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
+                                      uint32_t seed,
                                       cudf::device_span<uint32_t const> parmA,
                                       cudf::device_span<uint32_t const> parmB,
                                       cudf::size_type width,
@@ -450,7 +573,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
                                       rmm::device_async_resource_ref mr)
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
-  auto hashes        = detail::minhash_fn<HashFunction>(input, parmA, parmB, width, stream, mr);
+  auto hashes = detail::minhash_fn<HashFunction>(input, seed, parmA, parmB, width, stream, mr);
   return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr);
 }
 
@@ -479,6 +602,7 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
 }
 
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
+                                        uint64_t seed,
                                         cudf::device_span<uint64_t const> parmA,
                                         cudf::device_span<uint64_t const> parmB,
                                         cudf::size_type width,
@@ -486,7 +610,7 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         rmm::device_async_resource_ref mr)
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
-  auto hashes        = detail::minhash_fn<HashFunction>(input, parmA, parmB, width, stream, mr);
+  auto hashes = detail::minhash_fn<HashFunction>(input, seed, parmA, parmB, width, stream, mr);
   return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr);
 }
 
@@ -532,6 +656,7 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
 }
 
 std::unique_ptr<cudf::column> minhash_permuted(cudf::strings_column_view const& input,
+                                               uint32_t seed,
                                                cudf::device_span<uint32_t const> parmA,
                                                cudf::device_span<uint32_t const> parmB,
                                                cudf::size_type width,
@@ -539,7 +664,7 @@ std::unique_ptr<cudf::column> minhash_permuted(cudf::strings_column_view const&
                                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::minhash(input, parmA, parmB, width, stream, mr);
+  return detail::minhash(input, seed, parmA, parmB, width, stream, mr);
 }
 
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
@@ -563,6 +688,7 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
 }
 
 std::unique_ptr<cudf::column> minhash64_permuted(cudf::strings_column_view const& input,
+                                                 uint64_t seed,
                                                  cudf::device_span<uint64_t const> parmA,
                                                  cudf::device_span<uint64_t const> parmB,
                                                  cudf::size_type width,
@@ -570,7 +696,7 @@ std::unique_ptr<cudf::column> minhash64_permuted(cudf::strings_column_view const
                                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::minhash64(input, parmA, parmB, width, stream, mr);
+  return detail::minhash64(input, seed, parmA, parmB, width, stream, mr);
 }
 
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 3f6e6b6d666..4ca0607ede8 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -185,43 +185,45 @@ TEST_F(MinHashTest, PermutedMultiSeed)
 
   auto view = cudf::strings_column_view(input);
 
-  auto zero  = thrust::counting_iterator<uint32_t>(0);
-  auto seeds = cudf::test::fixed_width_column_wrapper<uint32_t>(zero, zero + 3);
+  auto first = thrust::counting_iterator<uint32_t>(10);
+  auto seeds = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
   auto results =
-    nvtext::minhash_permuted(view, cudf::column_view(seeds), cudf::column_view(seeds), 4);
+    nvtext::minhash_permuted(view, 0, cudf::column_view(seeds), cudf::column_view(seeds), 4);
 
   using LCW = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
-  LCW expected({LCW{1207251914u, 1207251915u, 2414503830u},
-                LCW{  21141582u,   21141583u,   42283166u},
-                LCW{1207251914u, 1207251915u, 2414503830u},
-                LCW{ 655955059u,  655955060u, 1311910120u},
-                LCW{  86520422u,   86520423u,  173040846u},
-                LCW{ 640477688u,  640477689u, 1280955378u},
-                LCW{ 640477688u,  640477689u, 1119757432u},
-                LCW{ 304329233u,  304329234u,  608658468u},
-                LCW{ 640477688u,  640477689u, 1253170324u},
-                LCW{ 640477688u,  640477689u,  109416974u}});
+  LCW expected({
+    LCW{1392101586u,  394869177u,  811528444u},
+    LCW{ 211415830u,  187088503u,  130291444u},
+    LCW{2098117052u,  394869177u,  799753544u},
+    LCW{2264583304u, 2920538364u, 3576493424u},
+    LCW{ 253327882u,   41747273u,  302030804u},
+    LCW{2109809594u, 1017470651u,  326988172u},
+    LCW{1303819864u,  850676747u,  147107852u},
+    LCW{ 736021564u,  720812292u, 1405158760u},
+    LCW{ 902780242u,  134064807u, 1613944636u},
+    LCW{ 547084870u, 1748895564u,  656501844u}
+  });
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t>({0, 1, 2});
+  auto seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
   auto results64 =
-    nvtext::minhash64_permuted(view, cudf::column_view(seeds64), cudf::column_view(seeds64), 4);
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(seeds64), cudf::column_view(seeds64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
   LCW64 expected64({
-    LCW64{  774489391575805754ul,  774489391575805755ul, 1355379376230368507ul},
-    LCW64{ 3232308021562742685ul,  536921680321302593ul,    3662418748204373ul},
-    LCW64{13145552576991307582ul, 1616337530922837828ul,  926832052631981697ul},
-    LCW64{14660046701545912182ul,  824988646263748477ul, 1649977292527496946ul},
-    LCW64{  398062025280761388ul,   44496840736841087ul,   88993681473682166ul},
-    LCW64{ 2837259098848821044ul,  531416089635127094ul, 1062832179270254188ul},
-    LCW64{ 2105419906076957667ul,  291144650667480443ul,  582289301334960878ul},
-    LCW64{ 1273320923074904938ul,  584582941553763022ul,   77252561447143819ul},
-    LCW64{ 3456065052701055601ul,  584582941553763022ul, 1169165883107526036ul},
-    LCW64{10664519708968191209ul,  304692340605188099ul,  152114268187701233ul}
+    LCW64{ 827364888116975697ul, 1601854279692781452ul,  70500662054893256ul},
+    LCW64{  18312093741021833ul,  133793446674258329ul,  21974512489226198ul},
+    LCW64{  22474244732520567ul, 1638811775655358395ul, 949306297364502264ul},
+    LCW64{1332357434996402861ul, 2157346081260151330ul, 676491718310205848ul},
+    LCW64{  65816830624808020ul,   43323600380520789ul,  63511816333816345ul},
+    LCW64{ 629657184954525200ul,   49741036507643002ul,  97466271004074331ul},
+    LCW64{ 301611977846331113ul,  101188874709594830ul,  97466271004074331ul},
+    LCW64{ 121498891461700668ul,  171065800427907402ul,  97466271004074331ul},
+    LCW64{  54617739511834072ul,  231454301607238929ul,  97466271004074331ul},
+    LCW64{ 576418665851990314ul,  231454301607238929ul,  97466271004074331ul}
   });
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index a0ab7747d73..a95f8edb3c8 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -1,5 +1,7 @@
 # Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
+from libc.stdint cimport uint32_t, uint64_t
+
 from cudf.core.buffer import acquire_spill_lock
 
 from cudf._lib.column cimport Column
@@ -18,9 +20,10 @@ def minhash(Column input, Column seeds, int width=4):
 
 
 @acquire_spill_lock()
-def minhash_permuted(Column input, Column a, Column b, int width):
+def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width):
     result = nvtext.minhash.minhash_permuted(
         input.to_pylibcudf(mode="read"),
+        seed,
         a.to_pylibcudf(mode="read"),
         b.to_pylibcudf(mode="read"),
         width,
@@ -39,9 +42,10 @@ def minhash64(Column input, Column seeds, int width=4):
 
 
 @acquire_spill_lock()
-def minhash64_permuted(Column input, Column a, Column b, int width):
+def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width):
     result = nvtext.minhash.minhash64_permuted(
         input.to_pylibcudf(mode="read"),
+        seed,
         a.to_pylibcudf(mode="read"),
         b.to_pylibcudf(mode="read"),
         width,
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index fe50c228248..565806dd271 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5349,7 +5349,7 @@ def minhash(
         )
 
     def minhash_permuted(
-        self, a: ColumnLike, b: ColumnLike, width: int = 4
+        self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int = 4
     ) -> SeriesOrIndex:
         a_column = column.as_column(a)
         if a_column.dtype != np.uint32:
@@ -5363,7 +5363,7 @@ def minhash_permuted(
             )
         return self._return_or_inplace(
             libstrings.minhash_permuted(
-                self._column, a_column, b_column, width
+                self._column, seed, a_column, b_column, width
             )
         )
 
@@ -5408,7 +5408,7 @@ def minhash64(
         )
 
     def minhash64_permuted(
-        self, a: ColumnLike, b: ColumnLike, width: int = 4
+        self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int = 4
     ) -> SeriesOrIndex:
         a_column = column.as_column(a)
         if a_column.dtype != np.uint64:
@@ -5422,7 +5422,7 @@ def minhash64_permuted(
             )
         return self._return_or_inplace(
             libstrings.minhash64_permuted(
-                self._column, a_column, b_column, width
+                self._column, seed, a_column, b_column, width
             )
         )
 
diff --git a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
index 00c65948a3e..ebf8eda1ce3 100644
--- a/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/libcudf/nvtext/minhash.pxd
@@ -24,6 +24,7 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] minhash_permuted(
         const column_view &strings,
+        const uint32_t seed,
         const column_view &a,
         const column_view &b,
         const size_type width,
@@ -43,6 +44,7 @@ cdef extern from "nvtext/minhash.hpp" namespace "nvtext" nogil:
 
     cdef unique_ptr[column] minhash64_permuted(
         const column_view &strings,
+        const uint64_t seed,
         const column_view &a,
         const column_view &b,
         const size_type width,
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
index 1c639934273..6b544282f44 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pxd
@@ -11,11 +11,23 @@ ctypedef fused ColumnOrScalar:
 
 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=*)
 
-cpdef Column minhash_permuted(Column input, Column a, Column b, size_type width)
+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
 
 cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=*)
 
-cpdef Column minhash64_permuted(Column input, Column a, Column b, size_type width)
+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+)
 
 cpdef Column word_minhash(Column input, Column seeds)
 
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index bc7374644fa..c81abce6649 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -59,7 +59,13 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
 
     return Column.from_libcudf(move(c_result))
 
-cpdef Column minhash_permuted(Column input, Column a, Column b, size_type width):
+cpdef Column minhash_permuted(
+    Column input,
+    uint32_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
     """
     Returns the minhash values for each string.
     This function uses MurmurHash3_x86_32 for the hash algorithm.
@@ -70,6 +76,8 @@ cpdef Column minhash_permuted(Column input, Column a, Column b, size_type width)
     ----------
     input : Column
         Strings column to compute minhash
+    seed : uint32_t
+        Seed used for the hash function
     a : Column
         Seed value(s) used for the hash algorithm.
     b : Column
@@ -88,6 +96,7 @@ cpdef Column minhash_permuted(Column input, Column a, Column b, size_type width)
         c_result = move(
             cpp_minhash_permuted(
                 input.view(),
+                seed,
                 a.view(),
                 b.view(),
                 width
@@ -135,10 +144,16 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
 
     return Column.from_libcudf(move(c_result))
 
-cpdef Column minhash64_permuted(Column input, Column a, Column b, size_type width):
+cpdef Column minhash64_permuted(
+    Column input,
+    uint64_t seed,
+    Column a,
+    Column b,
+    size_type width
+):
     """
     Returns the minhash values for each string.
-    This function uses MurmurHash3_x128_65 for the hash algorithm.
+    This function uses MurmurHash3_x64_128 for the hash algorithm.
 
     For details, see :cpp:func:`minhash`.
 
@@ -146,6 +161,8 @@ cpdef Column minhash64_permuted(Column input, Column a, Column b, size_type widt
     ----------
     input : Column
         Strings column to compute minhash
+    seed : uint64_t
+        Seed used for the hash function
     a : Column
         Seed value(s) used for the hash algorithm.
     b : Column
@@ -164,6 +181,7 @@ cpdef Column minhash64_permuted(Column input, Column a, Column b, size_type widt
         c_result = move(
             cpp_minhash64_permuted(
                 input.view(),
+                seed,
                 a.view(),
                 b.view(),
                 width

From 42e7d7bcce484ead47b56f8ee580514e8f668446 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 15 Oct 2024 17:35:39 -0400
Subject: [PATCH 11/25] enable seed-hash temporary memory

---
 cpp/src/text/minhash.cu | 41 +++++++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index e067758aed9..7f77125bf68 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -166,6 +166,12 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
   return hashes;
 }
 
+// Number of seeds to process per thread.
+// The intermediate values are stored in shared-memory and therefore limits the count.
+// Regardless, this value was found to be the most efficient size for both uint32 and uint64
+// hash types based on benchmarks
+CUDF_HOST_DEVICE constexpr std::size_t seeds_chunk_size = 16;
+
 template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
                                          hash_value_type seed,
@@ -188,7 +194,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   auto const begin = d_str.data() + (lane_idx);
   auto const end   = d_str.data() + d_str.size_bytes();
 
-  constexpr std::size_t seed_chunk   = 16;  // based on block-size==256
+  // constants used for the permutation calculations
   constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
@@ -197,13 +203,13 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
 
   auto const hasher = HashFunction(seed);
 
-  for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) {
+  for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) {
     // initialize working memory
-    auto const tile_hashes = block_hashes + (lane_idx * seed_chunk);
-    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seed_chunk, init);
+    auto const tile_hashes = block_hashes + (lane_idx * seeds_chunk_size);
+    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init);
     __syncthreads();
 
-    auto const seed_count = cuda::std::min(seed_chunk, parmA.size() - i);
+    auto const seed_count = cuda::std::min(seeds_chunk_size, parmA.size() - i);
 
     // each lane hashes 'width' substrings of d_str
     for (auto itr = begin; itr < end; itr += tile_size) {
@@ -223,9 +229,9 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
       }
 
       for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) {
-        hash_value_type const hv =  // seed_idx ==  0 ? hv1 :
+        hash_value_type const hv =
           ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
-        auto const block_idx    = ((seed_idx % seed_chunk) * tile_size) + lane_idx;
+        auto const block_idx    = ((seed_idx % seeds_chunk_size) * tile_size) + lane_idx;
         block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]);
       }
     }
@@ -261,7 +267,6 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
   auto const seed_hashes = working_memory + offset - offset0;  // hashes_offset
 
   auto const d_str    = cudf::string_view(d_strings.head<char>() + offset, size_bytes);
-  auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
   auto const lane_idx = idx % tile_size;
 
   auto const begin  = d_str.data() + (lane_idx);
@@ -320,20 +325,20 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   auto const begin = seed_hashes + (lane_idx);
   auto const end   = seed_hashes + hashes_size;
 
-  constexpr std::size_t seed_chunk   = 16;  // based on block-size==256
+  // constants used in the permutation calculations
   constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
   extern __shared__ char shmem[];
   auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
 
-  for (std::size_t i = 0; i < parmA.size(); i += seed_chunk) {
+  for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) {
     // initialize working memory
-    auto const tile_hashes = block_hashes + (lane_idx * seed_chunk);
-    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seed_chunk, init);
+    auto const tile_hashes = block_hashes + (lane_idx * seeds_chunk_size);
+    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init);
     __syncthreads();
 
-    auto const seed_count = cuda::std::min(seed_chunk, parmA.size() - i);
+    auto const seed_count = cuda::std::min(seeds_chunk_size, parmA.size() - i);
 
     // each lane hashes 'width' substrings of d_str
     for (auto itr = begin; itr < end; itr += tile_size) {
@@ -341,9 +346,9 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
       if (hv1 == 0) { continue; }  // skips intermediate UTF-8 bytes
 
       for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) {
-        hash_value_type const hv =  // seed_idx == 0 ? hv1 :
+        hash_value_type const hv =
           ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
-        auto const block_idx    = ((seed_idx % seed_chunk) * tile_size) + lane_idx;
+        auto const block_idx    = ((seed_idx % seeds_chunk_size) * tile_size) + lane_idx;
         block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]);
       }
     }
@@ -391,12 +396,12 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                           mr);
   auto d_hashes = hashes->mutable_view().data<hash_value_type>();
 
-  // 16 seeds can be held in shared-memory: 32K/block_size(256)/sizeof(hash_value_type) = ~16
-  auto const shmem_size = block_size * 16 * sizeof(hash_value_type);
+  // Found to be the most efficient shared memory size for both hash types
+  constexpr auto shmem_size = block_size * seeds_chunk_size * sizeof(hash_value_type);
 
   cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
                              block_size};
-#if 1
+#if 0
   minhash_permuted_kernel<HashFunction>
     <<<grid.num_blocks, grid.num_threads_per_block, shmem_size, stream.value()>>>(
       *d_strings, seed, parmA, parmB, width, d_hashes);

From f79631a79b6449d8abbed4288a51b7183eec6c75 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 16 Oct 2024 16:02:50 -0400
Subject: [PATCH 12/25] dynamic shared memory to static

---
 cpp/src/text/minhash.cu | 59 ++++++++++++++++++++++-------------------
 1 file changed, 32 insertions(+), 27 deletions(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 7f77125bf68..2c4e08e002c 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -170,7 +170,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
 // The intermediate values are stored in shared-memory and therefore limits the count.
 // Regardless, this value was found to be the most efficient size for both uint32 and uint64
 // hash types based on benchmarks
-CUDF_HOST_DEVICE constexpr std::size_t seeds_chunk_size = 16;
+constexpr cuda::std::size_t seeds_chunk_size = 16;
 
 template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
@@ -181,24 +181,25 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
                                          hash_value_type* d_hashes)
 {
   auto const idx     = cudf::detail::grid_1d::global_thread_id();
-  auto const str_idx = idx / tile_size;
+  auto const str_idx = idx / block_size;
   if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
 
   auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
   auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
-  auto const lane_idx = idx % tile_size;
+  auto const lane_idx = idx % block_size;
 
   auto const d_output = d_hashes + (str_idx * parmA.size());
 
-  auto const begin = d_str.data() + (lane_idx);
+  auto const begin = d_str.data() + lane_idx;
   auto const end   = d_str.data() + d_str.size_bytes();
 
   // constants used for the permutation calculations
   constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
-  extern __shared__ char shmem[];
+  // Found to be the most efficient shared memory size for both hash types
+  __shared__ char shmem[block_size * seeds_chunk_size * sizeof(hash_value_type)];
   auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
 
   auto const hasher = HashFunction(seed);
@@ -209,10 +210,11 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
     thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init);
     __syncthreads();
 
-    auto const seed_count = cuda::std::min(seeds_chunk_size, parmA.size() - i);
+    auto const seed_count =
+      cuda::std::min(static_cast<cuda::std::size_t>(seeds_chunk_size), parmA.size() - i);
 
     // each lane hashes 'width' substrings of d_str
-    for (auto itr = begin; itr < end; itr += tile_size) {
+    for (auto itr = begin; itr < end; itr += block_size) {
       if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
       auto const check_str =  // used for counting 'width' characters
         cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
@@ -229,17 +231,19 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
       }
 
       for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) {
+        // permutation formula used by datatrove
         hash_value_type const hv =
           ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
-        auto const block_idx    = ((seed_idx % seeds_chunk_size) * tile_size) + lane_idx;
+        auto const block_idx    = ((seed_idx % seeds_chunk_size) * block_size) + lane_idx;
         block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]);
       }
     }
     __syncthreads();
 
+    // reduce each seed to a single min value
     if (lane_idx < seed_count) {
-      auto const hvs = block_hashes + (lane_idx * tile_size);
-      auto const hv  = thrust::reduce(thrust::seq, hvs, hvs + tile_size, init, thrust::minimum{});
+      auto const hvs = block_hashes + (lane_idx * block_size);
+      auto const hv  = thrust::reduce(thrust::seq, hvs, hvs + block_size, init, thrust::minimum{});
       d_output[lane_idx + i] = hv;
     }
     __syncthreads();
@@ -304,7 +308,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
                                          hash_value_type* d_hashes)
 {
   auto const idx     = cudf::detail::grid_1d::global_thread_id();
-  auto const str_idx = idx / tile_size;
+  auto const str_idx = idx / block_size;
   if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
 
@@ -319,17 +323,18 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
     cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), size_bytes - width + 1);
 
   auto const init     = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
-  auto const lane_idx = idx % tile_size;
+  auto const lane_idx = idx % block_size;
   auto const d_output = d_hashes + (str_idx * parmA.size());
 
-  auto const begin = seed_hashes + (lane_idx);
+  auto const begin = seed_hashes + lane_idx;
   auto const end   = seed_hashes + hashes_size;
 
   // constants used in the permutation calculations
   constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
-  extern __shared__ char shmem[];
+  // Found to be the most efficient shared memory size for both hash types
+  __shared__ char shmem[block_size * seeds_chunk_size * sizeof(hash_value_type)];
   auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
 
   for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) {
@@ -338,25 +343,28 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
     thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init);
     __syncthreads();
 
-    auto const seed_count = cuda::std::min(seeds_chunk_size, parmA.size() - i);
+    auto const seed_count =
+      cuda::std::min(static_cast<cuda::std::size_t>(seeds_chunk_size), parmA.size() - i);
 
-    // each lane hashes 'width' substrings of d_str
-    for (auto itr = begin; itr < end; itr += tile_size) {
+    // each lane accumulates min hashes in its spot in shared memory
+    for (auto itr = begin; itr < end; itr += block_size) {
       auto const hv1 = *itr;
-      if (hv1 == 0) { continue; }  // skips intermediate UTF-8 bytes
+      if (hv1 == 0) { continue; }  // skip intermediate UTF-8 bytes
 
       for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) {
+        // permutation formula used by datatrove
         hash_value_type const hv =
           ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
-        auto const block_idx    = ((seed_idx % seeds_chunk_size) * tile_size) + lane_idx;
+        auto const block_idx    = ((seed_idx % seeds_chunk_size) * block_size) + lane_idx;
         block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]);
       }
     }
     __syncthreads();
 
+    // reduce each seed to a single min value
     if (lane_idx < seed_count) {
-      auto const hvs = block_hashes + (lane_idx * tile_size);
-      auto const hv  = thrust::reduce(thrust::seq, hvs, hvs + tile_size, init, thrust::minimum{});
+      auto const hvs = block_hashes + (lane_idx * block_size);
+      auto const hv  = thrust::reduce(thrust::seq, hvs, hvs + block_size, init, thrust::minimum{});
       d_output[lane_idx + i] = hv;
     }
     __syncthreads();
@@ -396,14 +404,11 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                           mr);
   auto d_hashes = hashes->mutable_view().data<hash_value_type>();
 
-  // Found to be the most efficient shared memory size for both hash types
-  constexpr auto shmem_size = block_size * seeds_chunk_size * sizeof(hash_value_type);
-
-  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
+  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * block_size,
                              block_size};
 #if 0
   minhash_permuted_kernel<HashFunction>
-    <<<grid.num_blocks, grid.num_threads_per_block, shmem_size, stream.value()>>>(
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
       *d_strings, seed, parmA, parmB, width, d_hashes);
 
 #else
@@ -415,7 +420,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
       *d_strings, seed, width, working_memory.data());
 
   minhash_permuted_kernel<hash_value_type>
-    <<<grid.num_blocks, grid.num_threads_per_block, shmem_size, stream.value()>>>(
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
       *d_strings, parmA, parmB, width, working_memory.data(), d_hashes);
 #endif
 

From fda43cc721176aaee3ee6881d8c10bac1f97e66b Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 17 Oct 2024 19:23:45 -0400
Subject: [PATCH 13/25] cleanup variable names, doxygen

---
 cpp/include/nvtext/minhash.hpp |  68 +++++++++---
 cpp/src/text/minhash.cu        | 185 +++++++++++++++++----------------
 2 files changed, 146 insertions(+), 107 deletions(-)

diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 7ebd945c66d..0e352cf9b74 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -93,20 +93,37 @@ std::unique_ptr<cudf::column> minhash(
 /**
  * @brief Returns the minhash values for each string
  *
- * This function uses MurmurHash3_x64_128 for the hash algorithm.
+ * This function uses MurmurHash3_x86_32 for the hash algorithm.
+ *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` values using the following formula:
+ * ```
+ *   max_hash = max of uint32
+ *   mp = (1 << 61) - 1
+ *   hv = hash value of a substring
+ *   pv[i] = ((hv * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ *   mh[j,i] = min(pv[i]) for all substrings in row j
+ *                        and where i=[0,a.size())
+ * ```
  *
  * Any null row entries result in corresponding null output rows.
  *
  * @throw std::invalid_argument if the width < 2
- * @throw std::invalid_argument if parmA is empty
- * @throw std::invalid_argument if `parmB.size() != parmA.size()`
- * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
  * @param seed  Seed value used for the hash algorithm
- * @param parmA Values used for the hash algorithm
- * @param parmB Values used for the hash algorithm
- * @param width The character width used for apply substrings
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
@@ -114,8 +131,8 @@ std::unique_ptr<cudf::column> minhash(
 std::unique_ptr<cudf::column> minhash_permuted(
   cudf::strings_column_view const& input,
   uint32_t seed,
-  cudf::device_span<uint32_t const> parmA,
-  cudf::device_span<uint32_t const> parmB,
+  cudf::device_span<uint32_t const> parameter_a,
+  cudf::device_span<uint32_t const> parameter_b,
   cudf::size_type width,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
@@ -186,18 +203,35 @@ std::unique_ptr<cudf::column> minhash64(
  *
  * This function uses MurmurHash3_x64_128 for the hash algorithm.
  *
+ * The input strings are first hashed using the given `seed` over substrings
+ * of `width` characters. These hash values are then combined with the `a`
+ * and `b` values using the following formula:
+ * ```
+ *   max_hash = max of uint64
+ *   mp = (1 << 61) - 1
+ *   hv = hash value of a substring
+ *   pv[i] = ((hv * a[i] + b[i]) % mp) & max_hash
+ * ```
+ *
+ * This calculation is performed on each substring and the minimum value is computed
+ * as follows:
+ * ```
+ *   mh[j,i] = min(pv[i]) for all substrings in row j
+ *                        and where i=[0,a.size())
+ * ```
+ *
  * Any null row entries result in corresponding null output rows.
  *
  * @throw std::invalid_argument if the width < 2
- * @throw std::invalid_argument if parmA is empty
- * @throw std::invalid_argument if `parmB.size() != parmA.size()`
- * @throw std::overflow_error if `parmA.size() * input.size()` exceeds the column size limit
+ * @throw std::invalid_argument if parameter_a is empty
+ * @throw std::invalid_argument if `parameter_b.size() != parameter_a.size()`
+ * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
  * @param seed  Seed value used for the hash algorithm
- * @param parmA Values used for the hash algorithm
- * @param parmB Values used for the hash algorithm
- * @param width The character width used for apply substrings
+ * @param parameter_a Values used for the permuted calculation
+ * @param parameter_b Values used for the permuted calculation
+ * @param width The character width of substrings to hash for each row
  * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return List column of minhash values for each string per seed
@@ -205,8 +239,8 @@ std::unique_ptr<cudf::column> minhash64(
 std::unique_ptr<cudf::column> minhash64_permuted(
   cudf::strings_column_view const& input,
   uint64_t seed,
-  cudf::device_span<uint64_t const> parmA,
-  cudf::device_span<uint64_t const> parmB,
+  cudf::device_span<uint64_t const> parameter_a,
+  cudf::device_span<uint64_t const> parameter_b,
   cudf::size_type width,
   rmm::cuda_stream_view stream      = cudf::get_default_stream(),
   rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref());
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 2c4e08e002c..3b08088ffc2 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -73,20 +73,20 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
                                 hash_value_type* d_hashes)
 {
   auto const idx     = cudf::detail::grid_1d::global_thread_id();
-  auto const str_idx = idx / tile_size;
+  auto const str_idx = idx / block_size;
   if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
 
   auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
   auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
-  auto const lane_idx = idx % tile_size;
+  auto const lane_idx = idx % block_size;
 
-  auto tile_hashes = working_memory + (str_idx * tile_size * seeds.size());
+  auto tile_hashes = working_memory + (str_idx * block_size * seeds.size());
 
   // initialize working memory
-  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) {
-    auto begin = tile_hashes + (seed_idx * tile_size);
-    thrust::uninitialized_fill(thrust::seq, begin, begin + tile_size, init);
+  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += block_size) {
+    auto begin = tile_hashes + (seed_idx * block_size);
+    thrust::uninitialized_fill(thrust::seq, begin, begin + block_size, init);
   }
   __syncthreads();
 
@@ -96,7 +96,7 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
   auto const end   = d_str.data() + d_str.size_bytes();
 
   // each lane hashes 'width' substrings of d_str
-  for (auto itr = begin; itr < end; itr += tile_size) {
+  for (auto itr = begin; itr < end; itr += block_size) {
     if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
     auto const check_str =  // used for counting 'width' characters
       cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
@@ -112,16 +112,16 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
       } else {
         hv = thrust::get<0>(hasher(hash_str));
       }
-      tile_hashes[(seed_idx * tile_size) + lane_idx] =
-        cuda::std::min(hv, tile_hashes[(seed_idx * tile_size) + lane_idx]);
+      tile_hashes[(seed_idx * block_size) + lane_idx] =
+        cuda::std::min(hv, tile_hashes[(seed_idx * block_size) + lane_idx]);
     }
   }
   __syncthreads();
 
   // compute final result
-  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += tile_size) {
-    auto begin = tile_hashes + (seed_idx * tile_size);
-    auto hv    = thrust::reduce(thrust::seq, begin, begin + tile_size, init, thrust::minimum{});
+  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += block_size) {
+    auto begin = tile_hashes + (seed_idx * block_size);
+    auto hv    = thrust::reduce(thrust::seq, begin, begin + block_size, init, thrust::minimum{});
     d_output[seed_idx] = hv;
   }
 }
@@ -166,17 +166,17 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
   return hashes;
 }
 
-// Number of seeds to process per thread.
+// Number of parameter a/b values to process per thread.
 // The intermediate values are stored in shared-memory and therefore limits the count.
 // Regardless, this value was found to be the most efficient size for both uint32 and uint64
 // hash types based on benchmarks
-constexpr cuda::std::size_t seeds_chunk_size = 16;
+constexpr cuda::std::size_t calc_chunk_size = 16;
 
 template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
                                          hash_value_type seed,
-                                         cudf::device_span<hash_value_type const> parmA,
-                                         cudf::device_span<hash_value_type const> parmB,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
                                          cudf::size_type width,
                                          hash_value_type* d_hashes)
 {
@@ -189,7 +189,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
   auto const lane_idx = idx % block_size;
 
-  auto const d_output = d_hashes + (str_idx * parmA.size());
+  auto const d_output = d_hashes + (str_idx * parameter_a.size());
 
   auto const begin = d_str.data() + lane_idx;
   auto const end   = d_str.data() + d_str.size_bytes();
@@ -199,19 +199,19 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
   // Found to be the most efficient shared memory size for both hash types
-  __shared__ char shmem[block_size * seeds_chunk_size * sizeof(hash_value_type)];
+  __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)];
   auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
 
   auto const hasher = HashFunction(seed);
 
-  for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) {
+  for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) {
     // initialize working memory
-    auto const tile_hashes = block_hashes + (lane_idx * seeds_chunk_size);
-    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init);
+    auto const tile_hashes = block_hashes + (lane_idx * calc_chunk_size);
+    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + calc_chunk_size, init);
     __syncthreads();
 
-    auto const seed_count =
-      cuda::std::min(static_cast<cuda::std::size_t>(seeds_chunk_size), parmA.size() - i);
+    auto const param_count =
+      cuda::std::min(static_cast<cuda::std::size_t>(calc_chunk_size), parameter_a.size() - i);
 
     // each lane hashes 'width' substrings of d_str
     for (auto itr = begin; itr < end; itr += block_size) {
@@ -223,28 +223,29 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
       if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
 
       auto const hash_str = cudf::string_view(itr, bytes);
-      hash_value_type hv1;
+      hash_value_type hv;
       if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
-        hv1 = hasher(hash_str);
+        hv = hasher(hash_str);
       } else {
-        hv1 = thrust::get<0>(hasher(hash_str));
+        hv = thrust::get<0>(hasher(hash_str));
       }
 
-      for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) {
+      for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) {
         // permutation formula used by datatrove
-        hash_value_type const hv =
-          ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
-        auto const block_idx    = ((seed_idx % seeds_chunk_size) * block_size) + lane_idx;
-        block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]);
+        hash_value_type const v =
+          ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
+        auto const block_idx    = ((param_idx % calc_chunk_size) * block_size) + lane_idx;
+        block_hashes[block_idx] = cuda::std::min(v, block_hashes[block_idx]);
       }
     }
     __syncthreads();
 
-    // reduce each seed to a single min value
-    if (lane_idx < seed_count) {
-      auto const hvs = block_hashes + (lane_idx * block_size);
-      auto const hv  = thrust::reduce(thrust::seq, hvs, hvs + block_size, init, thrust::minimum{});
-      d_output[lane_idx + i] = hv;
+    // reduce each parameter values vector to a single min value
+    if (lane_idx < param_count) {
+      auto const values = block_hashes + (lane_idx * block_size);
+      auto const minv =
+        thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{});
+      d_output[lane_idx + i] = minv;
     }
     __syncthreads();
   }
@@ -264,16 +265,16 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
   auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
   auto const offsets_itr =
     cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
-  auto const offset0    = offsets_itr[0];  // pass this in
+  auto const offset0    = offsets_itr[0];
   auto const offset     = offsets_itr[str_idx];
   auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
   if (size_bytes == 0) { return; }
-  auto const seed_hashes = working_memory + offset - offset0;  // hashes_offset
+  auto const seed_hashes = working_memory + offset - offset0;
 
   auto const d_str    = cudf::string_view(d_strings.head<char>() + offset, size_bytes);
   auto const lane_idx = idx % tile_size;
 
-  auto const begin  = d_str.data() + (lane_idx);
+  auto const begin  = d_str.data() + lane_idx;
   auto const end    = d_str.data() + d_str.size_bytes();
   auto const hasher = HashFunction(seed);
 
@@ -301,8 +302,8 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
 
 template <typename hash_value_type>
 CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
-                                         cudf::device_span<hash_value_type const> parmA,
-                                         cudf::device_span<hash_value_type const> parmB,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
                                          cudf::size_type width,
                                          hash_value_type* working_memory,
                                          hash_value_type* d_hashes)
@@ -315,16 +316,16 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
   auto const offsets_itr =
     cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
-  auto const offset0     = offsets_itr[0];  // pass this in
+  auto const offset0     = offsets_itr[0];
   auto const offset      = offsets_itr[str_idx];
   auto const size_bytes  = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
-  auto const seed_hashes = working_memory + offset - offset0;  // hashes_offset;
+  auto const seed_hashes = working_memory + offset - offset0;
   auto const hashes_size =
     cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), size_bytes - width + 1);
 
   auto const init     = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
   auto const lane_idx = idx % block_size;
-  auto const d_output = d_hashes + (str_idx * parmA.size());
+  auto const d_output = d_hashes + (str_idx * parameter_a.size());
 
   auto const begin = seed_hashes + lane_idx;
   auto const end   = seed_hashes + hashes_size;
@@ -334,38 +335,39 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
   // Found to be the most efficient shared memory size for both hash types
-  __shared__ char shmem[block_size * seeds_chunk_size * sizeof(hash_value_type)];
+  __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)];
   auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
 
-  for (std::size_t i = 0; i < parmA.size(); i += seeds_chunk_size) {
+  for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) {
     // initialize working memory
-    auto const tile_hashes = block_hashes + (lane_idx * seeds_chunk_size);
-    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + seeds_chunk_size, init);
+    auto const tiles = block_hashes + (lane_idx * calc_chunk_size);
+    thrust::uninitialized_fill(thrust::seq, tiles, tiles + calc_chunk_size, init);
     __syncthreads();
 
-    auto const seed_count =
-      cuda::std::min(static_cast<cuda::std::size_t>(seeds_chunk_size), parmA.size() - i);
+    auto const param_count =
+      cuda::std::min(static_cast<cuda::std::size_t>(calc_chunk_size), parameter_a.size() - i);
 
     // each lane accumulates min hashes in its spot in shared memory
     for (auto itr = begin; itr < end; itr += block_size) {
-      auto const hv1 = *itr;
-      if (hv1 == 0) { continue; }  // skip intermediate UTF-8 bytes
+      auto const hv = *itr;
+      if (hv == 0) { continue; }  // skip intermediate UTF-8 bytes
 
-      for (std::size_t seed_idx = i; seed_idx < (i + seed_count); ++seed_idx) {
+      for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) {
         // permutation formula used by datatrove
-        hash_value_type const hv =
-          ((hv1 * parmA[seed_idx] + parmB[seed_idx]) % mersenne_prime) & hash_max;
-        auto const block_idx    = ((seed_idx % seeds_chunk_size) * block_size) + lane_idx;
-        block_hashes[block_idx] = cuda::std::min(hv, block_hashes[block_idx]);
+        hash_value_type const v =
+          ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
+        auto const block_idx    = ((param_idx % calc_chunk_size) * block_size) + lane_idx;
+        block_hashes[block_idx] = cuda::std::min(v, block_hashes[block_idx]);
       }
     }
     __syncthreads();
 
-    // reduce each seed to a single min value
-    if (lane_idx < seed_count) {
-      auto const hvs = block_hashes + (lane_idx * block_size);
-      auto const hv  = thrust::reduce(thrust::seq, hvs, hvs + block_size, init, thrust::minimum{});
-      d_output[lane_idx + i] = hv;
+    // reduce each parameter values vector to a single min value
+    if (lane_idx < param_count) {
+      auto const values = block_hashes + (lane_idx * block_size);
+      auto const minv =
+        thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{});
+      d_output[lane_idx + i] = minv;
     }
     __syncthreads();
   }
@@ -374,34 +376,35 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
 template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                          hash_value_type seed,
-                                         cudf::device_span<hash_value_type const> parmA,
-                                         cudf::device_span<hash_value_type const> parmB,
+                                         cudf::device_span<hash_value_type const> parameter_a,
+                                         cudf::device_span<hash_value_type const> parameter_b,
                                          cudf::size_type width,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(!parmA.empty(), "Parameters A and B cannot be empty", std::invalid_argument);
+  CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument);
   CUDF_EXPECTS(width >= 2,
                "Parameter width should be an integer value of 2 or greater",
                std::invalid_argument);
-  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * parmA.size()) <
+  CUDF_EXPECTS(parameter_a.size() == parameter_b.size(),
+               "Parameters A and B should have the same number of elements",
+               std::invalid_argument);
+  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * parameter_a.size()) <
                  static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
                "The number of seeds times the number of input rows exceeds the column size limit",
                std::overflow_error);
-  CUDF_EXPECTS(parmA.size() == parmB.size(),
-               "Parameters A and B should have the same number of elements",
-               std::invalid_argument);
 
   auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
   if (input.is_empty()) { return cudf::make_empty_column(output_type); }
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
-  auto hashes   = cudf::make_numeric_column(output_type,
-                                          input.size() * static_cast<cudf::size_type>(parmA.size()),
-                                          cudf::mask_state::UNALLOCATED,
-                                          stream,
-                                          mr);
+  auto hashes =
+    cudf::make_numeric_column(output_type,
+                              input.size() * static_cast<cudf::size_type>(parameter_a.size()),
+                              cudf::mask_state::UNALLOCATED,
+                              stream,
+                              mr);
   auto d_hashes = hashes->mutable_view().data<hash_value_type>();
 
   cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * block_size,
@@ -409,7 +412,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
 #if 0
   minhash_permuted_kernel<HashFunction>
     <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-      *d_strings, seed, parmA, parmB, width, d_hashes);
+      *d_strings, seed, parameter_a, parameter_b, width, d_hashes);
 
 #else
   auto const wm_size  = input.chars_size(stream);
@@ -421,7 +424,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
 
   minhash_permuted_kernel<hash_value_type>
     <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-      *d_strings, parmA, parmB, width, working_memory.data(), d_hashes);
+      *d_strings, parameter_a, parameter_b, width, working_memory.data(), d_hashes);
 #endif
 
   return hashes;
@@ -576,15 +579,16 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
 
 std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
                                       uint32_t seed,
-                                      cudf::device_span<uint32_t const> parmA,
-                                      cudf::device_span<uint32_t const> parmB,
+                                      cudf::device_span<uint32_t const> parameter_a,
+                                      cudf::device_span<uint32_t const> parameter_b,
                                       cudf::size_type width,
                                       rmm::cuda_stream_view stream,
                                       rmm::device_async_resource_ref mr)
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
-  auto hashes = detail::minhash_fn<HashFunction>(input, seed, parmA, parmB, width, stream, mr);
-  return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr);
+  auto hashes =
+    detail::minhash_fn<HashFunction>(input, seed, parameter_a, parameter_b, width, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr);
 }
 
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
@@ -613,15 +617,16 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
 
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
                                         uint64_t seed,
-                                        cudf::device_span<uint64_t const> parmA,
-                                        cudf::device_span<uint64_t const> parmB,
+                                        cudf::device_span<uint64_t const> parameter_a,
+                                        cudf::device_span<uint64_t const> parameter_b,
                                         cudf::size_type width,
                                         rmm::cuda_stream_view stream,
                                         rmm::device_async_resource_ref mr)
 {
   using HashFunction = cudf::hashing::detail::MurmurHash3_x64_128<cudf::string_view>;
-  auto hashes = detail::minhash_fn<HashFunction>(input, seed, parmA, parmB, width, stream, mr);
-  return build_list_result(input.parent(), std::move(hashes), parmA.size(), stream, mr);
+  auto hashes =
+    detail::minhash_fn<HashFunction>(input, seed, parameter_a, parameter_b, width, stream, mr);
+  return build_list_result(input.parent(), std::move(hashes), parameter_a.size(), stream, mr);
 }
 
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,
@@ -667,14 +672,14 @@ std::unique_ptr<cudf::column> minhash(cudf::strings_column_view const& input,
 
 std::unique_ptr<cudf::column> minhash_permuted(cudf::strings_column_view const& input,
                                                uint32_t seed,
-                                               cudf::device_span<uint32_t const> parmA,
-                                               cudf::device_span<uint32_t const> parmB,
+                                               cudf::device_span<uint32_t const> parameter_a,
+                                               cudf::device_span<uint32_t const> parameter_b,
                                                cudf::size_type width,
                                                rmm::cuda_stream_view stream,
                                                rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::minhash(input, seed, parmA, parmB, width, stream, mr);
+  return detail::minhash(input, seed, parameter_a, parameter_b, width, stream, mr);
 }
 
 std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
@@ -699,14 +704,14 @@ std::unique_ptr<cudf::column> minhash64(cudf::strings_column_view const& input,
 
 std::unique_ptr<cudf::column> minhash64_permuted(cudf::strings_column_view const& input,
                                                  uint64_t seed,
-                                                 cudf::device_span<uint64_t const> parmA,
-                                                 cudf::device_span<uint64_t const> parmB,
+                                                 cudf::device_span<uint64_t const> parameter_a,
+                                                 cudf::device_span<uint64_t const> parameter_b,
                                                  cudf::size_type width,
                                                  rmm::cuda_stream_view stream,
                                                  rmm::device_async_resource_ref mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::minhash64(input, seed, parmA, parmB, width, stream, mr);
+  return detail::minhash64(input, seed, parameter_a, parameter_b, width, stream, mr);
 }
 
 std::unique_ptr<cudf::column> word_minhash(cudf::lists_column_view const& input,

From 1f4d441f73399cf211c0ee22c09bbcc142683ef9 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 22 Oct 2024 18:48:05 -0400
Subject: [PATCH 14/25] support for super-wide strings

---
 cpp/src/text/minhash.cu | 153 ++++++++++++++++++++++++++++++----------
 1 file changed, 117 insertions(+), 36 deletions(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 3b08088ffc2..549b9faab32 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -38,8 +38,11 @@
 #include <rmm/exec_policy.hpp>
 
 #include <cuda/atomic>
+#include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
 #include <thrust/fill.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
 
 #include <limits>
 
@@ -229,6 +232,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
       } else {
         hv = thrust::get<0>(hasher(hash_str));
       }
+      hv = cuda::std::max(hv, hash_value_type{1});
 
       for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) {
         // permutation formula used by datatrove
@@ -251,43 +255,53 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   }
 }
 
+// values determined using redpajama and books_sample datasets
+constexpr cudf::size_type wide_string_threshold = 1 << 18;  // 256K
+constexpr cudf::size_type blocks_per_string     = 64;
+
 template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
                                      hash_value_type seed,
                                      cudf::size_type width,
-                                     hash_value_type* working_memory)
+                                     hash_value_type* working_memory,
+                                     cudf::size_type* count,
+                                     cudf::size_type param_count,
+                                     hash_value_type* d_results)
 {
-  auto const idx     = cudf::detail::grid_1d::global_thread_id();
-  auto const str_idx = idx / tile_size;
+  auto const tid     = cudf::detail::grid_1d::global_thread_id();
+  auto const str_idx = tid / tile_size;
   if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
 
   auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
   auto const offsets_itr =
     cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
-  auto const offset0    = offsets_itr[0];
   auto const offset     = offsets_itr[str_idx];
   auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
   if (size_bytes == 0) { return; }
-  auto const seed_hashes = working_memory + offset - offset0;
+
+  auto const seed_hashes = working_memory + offset - offsets_itr[0];
 
   auto const d_str    = cudf::string_view(d_strings.head<char>() + offset, size_bytes);
-  auto const lane_idx = idx % tile_size;
+  auto const lane_idx = tid % tile_size;
 
   auto const begin  = d_str.data() + lane_idx;
   auto const end    = d_str.data() + d_str.size_bytes();
   auto const hasher = HashFunction(seed);
 
-  auto d_output = seed_hashes + lane_idx;
-  for (auto itr = begin; itr < end; itr += tile_size, d_output += tile_size) {
+  auto d_hashes = seed_hashes + lane_idx;
+  for (auto itr = begin; itr < end; itr += tile_size, d_hashes += tile_size) {
     if (cudf::strings::detail::is_utf8_continuation_char(*itr)) {
-      *d_output = 0;
+      *d_hashes = 0;
       continue;
     }
     auto const check_str =  // used for counting 'width' characters
       cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
     auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width);
-    if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
+    if ((itr != d_str.data()) && (left > 0)) {  // true if past the end of the string
+      *d_hashes = 0;
+      continue;
+    }
 
     auto const hash_str = cudf::string_view(itr, bytes);
     hash_value_type hv;
@@ -296,35 +310,59 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
     } else {
       hv = thrust::get<0>(hasher(hash_str));
     }
-    *d_output = hv;
+    *d_hashes = cuda::std::max(hv, hash_value_type{1});
+  }
+
+  // logic appended here so an extra kernel is not required
+  if (size_bytes >= wide_string_threshold) {
+    if (lane_idx == 0) {
+      // count the number of wider strings
+      cuda::atomic_ref<cudf::size_type, cuda::thread_scope_block> ref{*count};
+      ref.fetch_add(1, cuda::std::memory_order_relaxed);
+    }
+    // initialize the output for wider strings
+    auto d_output = d_results + (str_idx * param_count);
+    for (auto i = lane_idx; i < param_count; i += tile_size) {
+      d_output[i] = std::numeric_limits<hash_value_type>::max();
+    }
   }
 }
 
-template <typename hash_value_type>
+template <typename hash_value_type, int blocks_per_string>
 CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
+                                         cudf::device_span<cudf::size_type const> indices,
                                          cudf::device_span<hash_value_type const> parameter_a,
                                          cudf::device_span<hash_value_type const> parameter_b,
                                          cudf::size_type width,
                                          hash_value_type* working_memory,
                                          hash_value_type* d_hashes)
 {
-  auto const idx     = cudf::detail::grid_1d::global_thread_id();
-  auto const str_idx = idx / block_size;
-  if (str_idx >= d_strings.size()) { return; }
+  auto const tid = cudf::detail::grid_1d::global_thread_id();
+  auto const idx = (tid / blocks_per_string) / block_size;
+  if (idx >= indices.size()) { return; }
+  auto const str_idx = indices[idx];
   if (d_strings.is_null(str_idx)) { return; }
+  int const section_idx = blockIdx.x % blocks_per_string;
 
   auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
   auto const offsets_itr =
     cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
-  auto const offset0     = offsets_itr[0];
-  auto const offset      = offsets_itr[str_idx];
-  auto const size_bytes  = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
-  auto const seed_hashes = working_memory + offset - offset0;
+  auto const offset     = offsets_itr[str_idx];
+  auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
+
+  auto const section_size =
+    (size_bytes / blocks_per_string) +
+    (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string);
+  auto const section_offset = section_idx * (size_bytes / blocks_per_string);
+
+  auto const seed_hashes = working_memory + offset - offsets_itr[0] + section_offset;
   auto const hashes_size =
-    cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), size_bytes - width + 1);
+    section_idx < (blocks_per_string - 1)
+      ? section_size
+      : cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);
 
   auto const init     = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
-  auto const lane_idx = idx % block_size;
+  auto const lane_idx = tid % block_size;
   auto const d_output = d_hashes + (str_idx * parameter_a.size());
 
   auto const begin = seed_hashes + lane_idx;
@@ -367,7 +405,12 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
       auto const values = block_hashes + (lane_idx * block_size);
       auto const minv =
         thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{});
-      d_output[lane_idx + i] = minv;
+      if constexpr (blocks_per_string > 1) {
+        cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{d_output[lane_idx + i]};
+        ref.fetch_min(minv, cuda::std::memory_order_relaxed);
+      } else {
+        d_output[lane_idx + i] = minv;
+      }
     }
     __syncthreads();
   }
@@ -409,23 +452,61 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
 
   cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * block_size,
                              block_size};
-#if 0
-  minhash_permuted_kernel<HashFunction>
-    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-      *d_strings, seed, parameter_a, parameter_b, width, d_hashes);
-
-#else
-  auto const wm_size  = input.chars_size(stream);
-  auto working_memory = rmm::device_uvector<hash_value_type>(wm_size, stream);
+  auto const wm_size     = input.chars_size(stream);
+  auto working_memory    = rmm::device_uvector<hash_value_type>(wm_size, stream);
+  auto d_threshold_count = rmm::device_scalar<cudf::size_type>(0, stream);
 
   minhash_seed_kernel<HashFunction>
-    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-      *d_strings, seed, width, working_memory.data());
+    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(*d_strings,
+                                                                         seed,
+                                                                         width,
+                                                                         working_memory.data(),
+                                                                         d_threshold_count.data(),
+                                                                         parameter_a.size(),
+                                                                         d_hashes);
+  auto const threshold_count = d_threshold_count.value(stream);
+
+  auto indices = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+  thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end());
+  cudf::size_type threshold_index = input.size();
+  if (threshold_count < input.size()) {
+    auto sizes = rmm::device_uvector<cudf::size_type>(input.size(), stream);
+    thrust::transform(rmm::exec_policy_nosync(stream),
+                      thrust::counting_iterator<cudf::size_type>(0),
+                      thrust::counting_iterator<cudf::size_type>(input.size()),
+                      sizes.data(),
+                      cuda::proclaim_return_type<cudf::size_type>(
+                        [d_strings = *d_strings] __device__(auto idx) -> cudf::size_type {
+                          if (d_strings.is_null(idx)) { return 0; }
+                          return d_strings.element<cudf::string_view>(idx).size_bytes();
+                        }));
+    thrust::sort_by_key(
+      rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin());
+    auto bp = thrust::lower_bound(
+      rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold);
+    threshold_index = static_cast<cudf::size_type>(thrust::distance(sizes.begin(), bp));
+  }
 
-  minhash_permuted_kernel<hash_value_type>
-    <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-      *d_strings, parameter_a, parameter_b, width, working_memory.data(), d_hashes);
-#endif
+  // handle the strings below the threshold width
+  if (threshold_index > 0) {
+    auto d_indices = cudf::device_span<cudf::size_type const>(indices.data(), threshold_index);
+    cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(d_indices.size()) * block_size,
+                               block_size};
+    minhash_permuted_kernel<hash_value_type, 1>
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+        *d_strings, d_indices, parameter_a, parameter_b, width, working_memory.data(), d_hashes);
+  }
+
+  // handle the strings above the threshold width
+  if (threshold_index < input.size()) {
+    auto count = static_cast<cudf::thread_index_type>(input.size() - threshold_index);
+    auto d_indices =
+      cudf::device_span<cudf::size_type const>(indices.data() + threshold_index, count);
+    cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size};
+    minhash_permuted_kernel<hash_value_type, blocks_per_string>
+      <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
+        *d_strings, d_indices, parameter_a, parameter_b, width, working_memory.data(), d_hashes);
+  }
 
   return hashes;
 }

From 58f9206dc8903ea8aac76631073a4cd577752b5b Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 23 Oct 2024 13:24:32 -0400
Subject: [PATCH 15/25] fix threshold-index init logic

---
 cpp/src/text/minhash.cu | 186 +++++++++++++++++++++++++++-------------
 1 file changed, 127 insertions(+), 59 deletions(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 549b9faab32..ddfd81de310 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -37,6 +37,7 @@
 #include <rmm/cuda_stream_view.hpp>
 #include <rmm/exec_policy.hpp>
 
+#include <cooperative_groups.h>
 #include <cuda/atomic>
 #include <thrust/binary_search.h>
 #include <thrust/execution_policy.h>
@@ -170,11 +171,13 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
 }
 
 // Number of parameter a/b values to process per thread.
-// The intermediate values are stored in shared-memory and therefore limits the count.
-// Regardless, this value was found to be the most efficient size for both uint32 and uint64
-// hash types based on benchmarks
+// The intermediate values are stored in shared-memory and therefore limits this count.
+// This value was found to be the most efficient size for both uint32 and uint64
+// hash types based on benchmarks.
 constexpr cuda::std::size_t calc_chunk_size = 16;
 
+// this is deprecated and to be removed in the future;
+// keeping it for now for verifying results from the faster kernels below
 template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
                                          hash_value_type seed,
@@ -188,9 +191,11 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
 
+  auto const block = cooperative_groups::this_thread_block();
+
   auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
   auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
-  auto const lane_idx = idx % block_size;
+  auto const lane_idx = block.thread_rank();  // idx % block_size;
 
   auto const d_output = d_hashes + (str_idx * parameter_a.size());
 
@@ -203,15 +208,15 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
 
   // Found to be the most efficient shared memory size for both hash types
   __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)];
-  auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
+  auto const block_values = reinterpret_cast<hash_value_type*>(shmem);
 
   auto const hasher = HashFunction(seed);
 
   for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) {
     // initialize working memory
-    auto const tile_hashes = block_hashes + (lane_idx * calc_chunk_size);
+    auto const tile_hashes = block_values + (lane_idx * calc_chunk_size);
     thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + calc_chunk_size, init);
-    __syncthreads();
+    block.sync();
 
     auto const param_count =
       cuda::std::min(static_cast<cuda::std::size_t>(calc_chunk_size), parameter_a.size() - i);
@@ -239,32 +244,55 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
         hash_value_type const v =
           ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
         auto const block_idx    = ((param_idx % calc_chunk_size) * block_size) + lane_idx;
-        block_hashes[block_idx] = cuda::std::min(v, block_hashes[block_idx]);
+        block_values[block_idx] = cuda::std::min(v, block_values[block_idx]);
       }
     }
-    __syncthreads();
+    block.sync();
 
     // reduce each parameter values vector to a single min value
     if (lane_idx < param_count) {
-      auto const values = block_hashes + (lane_idx * block_size);
+      auto const values = block_values + (lane_idx * block_size);
       auto const minv =
         thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{});
       d_output[lane_idx + i] = minv;
     }
-    __syncthreads();
+    block.sync();
   }
 }
 
-// values determined using redpajama and books_sample datasets
+// Separate kernels are used to process strings above and below this value (in bytes).
 constexpr cudf::size_type wide_string_threshold = 1 << 18;  // 256K
-constexpr cudf::size_type blocks_per_string     = 64;
+// The number of blocks per string for the above-threshold kernel processing.
+constexpr cudf::size_type blocks_per_string = 64;
+// These values determined using redpajama and books_sample datasets
 
+/**
+ * @brief Hashing kernel launched as a thread per tile-size (block or warp)
+ *
+ * This kernel computes the hashes for each string using the seed and the specified
+ * hash function. The width is used to compute rolling substrings to hash over.
+ * The hashes are stored in d_hashes to be used in the minhash_permuted_kernel.
+ *
+ * This kernel also counts the number of strings above the wide_string_threshold
+ * and proactively initializes the output values for those strings.
+ *
+ * @tparam HashFunction The hash function to use for this kernel
+ * @tparam hash_value_type Derived from HashFunction result_type
+ *
+ * @param d_strings The input strings to hash
+ * @param seed The seed used for the hash function
+ * @param width Width in characters used for determining substrings to hash
+ * @param d_hashes The resulting hash values are stored here
+ * @param count Stores the number of strings above wide_string_threshold
+ * @param param_count Number of parameters (used for proactive initialize)
+ * @param d_results Final results vector (used for proactive initialize)
+ */
 template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
                                      hash_value_type seed,
                                      cudf::size_type width,
-                                     hash_value_type* working_memory,
-                                     cudf::size_type* count,
+                                     hash_value_type* d_hashes,
+                                     cudf::size_type* threshold_count,
                                      cudf::size_type param_count,
                                      hash_value_type* d_results)
 {
@@ -273,6 +301,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
   if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
 
+  // retrieve this strings offset to locate the output position in d_hashes
   auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
   auto const offsets_itr =
     cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
@@ -280,26 +309,26 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
   auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
   if (size_bytes == 0) { return; }
 
-  auto const seed_hashes = working_memory + offset - offsets_itr[0];
-
   auto const d_str    = cudf::string_view(d_strings.head<char>() + offset, size_bytes);
   auto const lane_idx = tid % tile_size;
 
+  // hashes for this string/thread stored here
+  auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx;
+
   auto const begin  = d_str.data() + lane_idx;
   auto const end    = d_str.data() + d_str.size_bytes();
   auto const hasher = HashFunction(seed);
 
-  auto d_hashes = seed_hashes + lane_idx;
-  for (auto itr = begin; itr < end; itr += tile_size, d_hashes += tile_size) {
+  for (auto itr = begin; itr < end; itr += tile_size, seed_hashes += tile_size) {
     if (cudf::strings::detail::is_utf8_continuation_char(*itr)) {
-      *d_hashes = 0;
+      *seed_hashes = 0;
       continue;
     }
     auto const check_str =  // used for counting 'width' characters
       cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
     auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width);
     if ((itr != d_str.data()) && (left > 0)) {  // true if past the end of the string
-      *d_hashes = 0;
+      *seed_hashes = 0;
       continue;
     }
 
@@ -310,14 +339,15 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
     } else {
       hv = thrust::get<0>(hasher(hash_str));
     }
-    *d_hashes = cuda::std::max(hv, hash_value_type{1});
+    // disallowing hash to zero case
+    *seed_hashes = cuda::std::max(hv, hash_value_type{1});
   }
 
   // logic appended here so an extra kernel is not required
   if (size_bytes >= wide_string_threshold) {
     if (lane_idx == 0) {
-      // count the number of wider strings
-      cuda::atomic_ref<cudf::size_type, cuda::thread_scope_block> ref{*count};
+      // count the number of wide strings
+      cuda::atomic_ref<cudf::size_type, cuda::thread_scope_device> ref{*threshold_count};
       ref.fetch_add(1, cuda::std::memory_order_relaxed);
     }
     // initialize the output for wider strings
@@ -328,21 +358,46 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
   }
 }
 
+/**
+ * @brief Permutation calculation kernel
+ *
+ * This kernel computes uses the hashes from the minhash_seed_kernel to
+ * compute the results using the parameter_a and parameter_b values.
+ * The output is the number of input rows (N) by the number of parameter values (M).
+ * Each output[i] is the calculated result for parameter_a/b[0:M].
+ *
+ * This kernel is launched with either blocks per strings of 1 for strings
+ * below the wide_strings_threshold or blocks per string = blocks_per_strings
+ * for strings above wide_strings_threshold.
+ *
+ * @tparam hash_value_type Derived from HashFunction result_type
+ * @tparam blocks_per_string Number of blocks used to process each string
+ *
+ * @param d_strings The input strings to hash
+ * @param indices The indices of the strings in d_strings to process here
+ * @param parameter_a 1st set of parameters for the calculation result
+ * @param parameter_b 2nd set of parameters for the calculation result
+ * @param width Width used for calculating the number of available hashes in each string
+ * @param d_hashes The hash values computed in minhash_seed_kernel
+ * @param d_results Final results vector of calculate values
+ */
 template <typename hash_value_type, int blocks_per_string>
 CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
                                          cudf::device_span<cudf::size_type const> indices,
                                          cudf::device_span<hash_value_type const> parameter_a,
                                          cudf::device_span<hash_value_type const> parameter_b,
                                          cudf::size_type width,
-                                         hash_value_type* working_memory,
-                                         hash_value_type* d_hashes)
+                                         hash_value_type const* d_hashes,
+                                         hash_value_type* d_results)
 {
   auto const tid = cudf::detail::grid_1d::global_thread_id();
   auto const idx = (tid / blocks_per_string) / block_size;
   if (idx >= indices.size()) { return; }
   auto const str_idx = indices[idx];
   if (d_strings.is_null(str_idx)) { return; }
-  int const section_idx = blockIdx.x % blocks_per_string;
+
+  auto const block      = cooperative_groups::this_thread_block();
+  int const section_idx = block.group_index().x % blocks_per_string;
 
   auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
   auto const offsets_itr =
@@ -350,20 +405,24 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   auto const offset     = offsets_itr[str_idx];
   auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
 
+  // number of items to process in this block;
+  // last block includes any remainder values from the size_bytes/blocks_per_string truncation
   auto const section_size =
     (size_bytes / blocks_per_string) +
     (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string);
   auto const section_offset = section_idx * (size_bytes / blocks_per_string);
 
-  auto const seed_hashes = working_memory + offset - offsets_itr[0] + section_offset;
+  // hash values for this block
+  auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset;
+  // width used here as a max value since a string's char-count <= byte-count
   auto const hashes_size =
     section_idx < (blocks_per_string - 1)
       ? section_size
       : cuda::std::max(static_cast<cudf::size_type>(size_bytes > 0), section_size - width + 1);
 
   auto const init     = size_bytes == 0 ? 0 : std::numeric_limits<hash_value_type>::max();
-  auto const lane_idx = tid % block_size;
-  auto const d_output = d_hashes + (str_idx * parameter_a.size());
+  auto const lane_idx = block.thread_rank();
+  auto const d_output = d_results + (str_idx * parameter_a.size());
 
   auto const begin = seed_hashes + lane_idx;
   auto const end   = seed_hashes + hashes_size;
@@ -372,15 +431,16 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
-  // Found to be the most efficient shared memory size for both hash types
+  // found to be the most efficient shared memory size for both hash types
   __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)];
-  auto const block_hashes = reinterpret_cast<hash_value_type*>(shmem);
+  auto const block_values = reinterpret_cast<hash_value_type*>(shmem);
 
   for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) {
-    // initialize working memory
-    auto const tiles = block_hashes + (lane_idx * calc_chunk_size);
-    thrust::uninitialized_fill(thrust::seq, tiles, tiles + calc_chunk_size, init);
-    __syncthreads();
+    // initialize our chunk of shared memory
+    // each thread handles calc_chunk_size of values
+    auto const chunk_values = block_values + (lane_idx * calc_chunk_size);
+    thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + calc_chunk_size, init);
+    block.sync();
 
     auto const param_count =
       cuda::std::min(static_cast<cuda::std::size_t>(calc_chunk_size), parameter_a.size() - i);
@@ -388,31 +448,35 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
     // each lane accumulates min hashes in its spot in shared memory
     for (auto itr = begin; itr < end; itr += block_size) {
       auto const hv = *itr;
-      if (hv == 0) { continue; }  // skip intermediate UTF-8 bytes
+      if (hv == 0) { continue; }  // 0 is used as a skip sentinel
 
       for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) {
         // permutation formula used by datatrove
         hash_value_type const v =
           ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
         auto const block_idx    = ((param_idx % calc_chunk_size) * block_size) + lane_idx;
-        block_hashes[block_idx] = cuda::std::min(v, block_hashes[block_idx]);
+        block_values[block_idx] = cuda::std::min(v, block_values[block_idx]);
       }
     }
-    __syncthreads();
+    block.sync();
 
     // reduce each parameter values vector to a single min value
+    // assumes that the block_size > calc_chunk_size
+    // each thread reduces a block_size of parameter values (thread per parameter)
     if (lane_idx < param_count) {
-      auto const values = block_hashes + (lane_idx * block_size);
+      auto const values = block_values + (lane_idx * block_size);
+      // cooperative groups does not have a min function and cub::BlockReduce was slower
       auto const minv =
         thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{});
       if constexpr (blocks_per_string > 1) {
+        // accumulates mins for each block into d_output
         cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{d_output[lane_idx + i]};
         ref.fetch_min(minv, cuda::std::memory_order_relaxed);
       } else {
         d_output[lane_idx + i] = minv;
       }
     }
-    __syncthreads();
+    block.sync();
   }
 }
 
@@ -425,51 +489,55 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                          rmm::cuda_stream_view stream,
                                          rmm::device_async_resource_ref mr)
 {
-  CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument);
   CUDF_EXPECTS(width >= 2,
                "Parameter width should be an integer value of 2 or greater",
                std::invalid_argument);
+  CUDF_EXPECTS(!parameter_a.empty(), "Parameters A and B cannot be empty", std::invalid_argument);
   CUDF_EXPECTS(parameter_a.size() == parameter_b.size(),
                "Parameters A and B should have the same number of elements",
                std::invalid_argument);
-  CUDF_EXPECTS((static_cast<std::size_t>(input.size()) * parameter_a.size()) <
-                 static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
-               "The number of seeds times the number of input rows exceeds the column size limit",
-               std::overflow_error);
+  CUDF_EXPECTS(
+    (static_cast<std::size_t>(input.size()) * parameter_a.size()) <
+      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
+    "The number of parameters times the number of input rows exceeds the column size limit",
+    std::overflow_error);
 
   auto const output_type = cudf::data_type{cudf::type_to_id<hash_value_type>()};
   if (input.is_empty()) { return cudf::make_empty_column(output_type); }
 
   auto const d_strings = cudf::column_device_view::create(input.parent(), stream);
 
-  auto hashes =
+  auto results =
     cudf::make_numeric_column(output_type,
                               input.size() * static_cast<cudf::size_type>(parameter_a.size()),
                               cudf::mask_state::UNALLOCATED,
                               stream,
                               mr);
-  auto d_hashes = hashes->mutable_view().data<hash_value_type>();
+  auto d_results = results->mutable_view().data<hash_value_type>();
 
   cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * block_size,
                              block_size};
-  auto const wm_size     = input.chars_size(stream);
-  auto working_memory    = rmm::device_uvector<hash_value_type>(wm_size, stream);
+  auto const hashes_size = input.chars_size(stream);
+  auto d_hashes          = rmm::device_uvector<hash_value_type>(hashes_size, stream);
   auto d_threshold_count = rmm::device_scalar<cudf::size_type>(0, stream);
 
   minhash_seed_kernel<HashFunction>
     <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(*d_strings,
                                                                          seed,
                                                                          width,
-                                                                         working_memory.data(),
+                                                                         d_hashes.data(),
                                                                          d_threshold_count.data(),
                                                                          parameter_a.size(),
-                                                                         d_hashes);
+                                                                         d_results);
   auto const threshold_count = d_threshold_count.value(stream);
 
   auto indices = rmm::device_uvector<cudf::size_type>(input.size(), stream);
   thrust::sequence(rmm::exec_policy(stream), indices.begin(), indices.end());
-  cudf::size_type threshold_index = input.size();
-  if (threshold_count < input.size()) {
+  cudf::size_type threshold_index = threshold_count < input.size() ? input.size() : 0;
+
+  // if we counted a split of above/below threshold then
+  // compute partitions based on the size of each string
+  if ((threshold_count > 0) && (threshold_count < input.size())) {
     auto sizes = rmm::device_uvector<cudf::size_type>(input.size(), stream);
     thrust::transform(rmm::exec_policy_nosync(stream),
                       thrust::counting_iterator<cudf::size_type>(0),
@@ -482,9 +550,9 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                         }));
     thrust::sort_by_key(
       rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), indices.begin());
-    auto bp = thrust::lower_bound(
+    auto const lb = thrust::lower_bound(
       rmm::exec_policy_nosync(stream), sizes.begin(), sizes.end(), wide_string_threshold);
-    threshold_index = static_cast<cudf::size_type>(thrust::distance(sizes.begin(), bp));
+    threshold_index = static_cast<cudf::size_type>(thrust::distance(sizes.begin(), lb));
   }
 
   // handle the strings below the threshold width
@@ -494,7 +562,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                block_size};
     minhash_permuted_kernel<hash_value_type, 1>
       <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-        *d_strings, d_indices, parameter_a, parameter_b, width, working_memory.data(), d_hashes);
+        *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
   }
 
   // handle the strings above the threshold width
@@ -505,10 +573,10 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
     cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size};
     minhash_permuted_kernel<hash_value_type, blocks_per_string>
       <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-        *d_strings, d_indices, parameter_a, parameter_b, width, working_memory.data(), d_hashes);
+        *d_strings, d_indices, parameter_a, parameter_b, width, d_hashes.data(), d_results);
   }
 
-  return hashes;
+  return results;
 }
 
 /**

From fef6e0e6bb38b66fbbf458f5e9f5874f21ef86b6 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 23 Oct 2024 15:56:02 -0400
Subject: [PATCH 16/25] use cudf::detail::device_scalar

---
 cpp/src/text/minhash.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index ddfd81de310..fa6e7a30a62 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -18,6 +18,7 @@
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/copy.hpp>
+#include <cudf/detail/device_scalar.hpp>
 #include <cudf/detail/null_mask.hpp>
 #include <cudf/detail/nvtx/ranges.hpp>
 #include <cudf/detail/sequence.hpp>
@@ -519,7 +520,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                              block_size};
   auto const hashes_size = input.chars_size(stream);
   auto d_hashes          = rmm::device_uvector<hash_value_type>(hashes_size, stream);
-  auto d_threshold_count = rmm::device_scalar<cudf::size_type>(0, stream);
+  auto d_threshold_count = cudf::detail::device_scalar<cudf::size_type>(0, stream);
 
   minhash_seed_kernel<HashFunction>
     <<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(*d_strings,

From e1067b354922cfcdc6a98a122ed2d410816094ae Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 24 Oct 2024 12:58:25 -0400
Subject: [PATCH 17/25] fix benchmarks; add gtest

---
 cpp/benchmarks/text/minhash.cpp  | 31 +++++++-------
 cpp/include/nvtext/minhash.hpp   | 12 +++---
 cpp/src/text/minhash.cu          | 44 +++++++++----------
 cpp/tests/text/minhash_tests.cpp | 73 +++++++++++++++++++++++---------
 4 files changed, 97 insertions(+), 63 deletions(-)

diff --git a/cpp/benchmarks/text/minhash.cpp b/cpp/benchmarks/text/minhash.cpp
index a0f9196d05f..a80d0dcbdb8 100644
--- a/cpp/benchmarks/text/minhash.cpp
+++ b/cpp/benchmarks/text/minhash.cpp
@@ -20,8 +20,6 @@
 
 #include <nvtext/minhash.hpp>
 
-#include <rmm/device_buffer.hpp>
-
 #include <nvbench/nvbench.cuh>
 
 static void bench_minhash(nvbench::state& state)
@@ -29,25 +27,25 @@ static void bench_minhash(nvbench::state& state)
   auto const num_rows   = static_cast<cudf::size_type>(state.get_int64("num_rows"));
   auto const row_width  = static_cast<cudf::size_type>(state.get_int64("row_width"));
   auto const hash_width = static_cast<cudf::size_type>(state.get_int64("hash_width"));
-  auto const seed_count = static_cast<cudf::size_type>(state.get_int64("seed_count"));
+  auto const parameters = static_cast<cudf::size_type>(state.get_int64("parameters"));
   auto const base64     = state.get_int64("hash_type") == 64;
 
-  if ((num_rows * seed_count * (base64 ? sizeof(int64_t) : sizeof(int32_t)) * 32L) >=
-      static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max())) {
-    state.skip("Skip benchmarks requiring more than 2GB working memory");
-  }
-
   data_profile const strings_profile = data_profile_builder().distribution(
     cudf::type_id::STRING, distribution_id::NORMAL, 0, row_width);
   auto const strings_table =
     create_random_table({cudf::type_id::STRING}, row_count{num_rows}, strings_profile);
   cudf::strings_column_view input(strings_table->view().column(0));
 
-  data_profile const seeds_profile = data_profile_builder().no_validity().distribution(
-    cudf::type_to_id<cudf::hash_value_type>(), distribution_id::NORMAL, 0, row_width);
-  auto const seed_type   = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
-  auto const seeds_table = create_random_table({seed_type}, row_count{seed_count}, seeds_profile);
-  auto seeds             = seeds_table->get_column(0);
+  data_profile const param_profile = data_profile_builder().no_validity().distribution(
+    cudf::type_to_id<cudf::hash_value_type>(),
+    distribution_id::NORMAL,
+    0u,
+    std::numeric_limits<cudf::hash_value_type>::max());
+  auto const param_type = base64 ? cudf::type_id::UINT64 : cudf::type_id::UINT32;
+  auto const param_table =
+    create_random_table({param_type, param_type}, row_count{parameters}, param_profile);
+  auto const parameters_a = param_table->view().column(0);
+  auto const parameters_b = param_table->view().column(1);
 
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
 
@@ -56,8 +54,9 @@ static void bench_minhash(nvbench::state& state)
   state.add_global_memory_writes<nvbench::int32_t>(num_rows);  // output are hashes
 
   state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
-    auto result = base64 ? nvtext::minhash64(input, seeds.view(), hash_width)
-                         : nvtext::minhash(input, seeds.view(), hash_width);
+    auto result = base64
+                    ? nvtext::minhash64_permuted(input, 0, parameters_a, parameters_b, hash_width)
+                    : nvtext::minhash_permuted(input, 0, parameters_a, parameters_b, hash_width);
   });
 }
 
@@ -66,5 +65,5 @@ NVBENCH_BENCH(bench_minhash)
   .add_int64_axis("num_rows", {15000, 30000, 60000})
   .add_int64_axis("row_width", {6000, 28000, 50000})
   .add_int64_axis("hash_width", {12, 24})
-  .add_int64_axis("seed_count", {26, 260})
+  .add_int64_axis("parameters", {26, 260})
   .add_int64_axis("hash_type", {32, 64});
diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 0e352cf9b74..95d98b9aef4 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -97,12 +97,12 @@ std::unique_ptr<cudf::column> minhash(
  *
  * The input strings are first hashed using the given `seed` over substrings
  * of `width` characters. These hash values are then combined with the `a`
- * and `b` values using the following formula:
+ * and `b` parameter values using the following formula:
  * ```
  *   max_hash = max of uint32
  *   mp = (1 << 61) - 1
- *   hv = hash value of a substring
- *   pv[i] = ((hv * a[i] + b[i]) % mp) & max_hash
+ *   hv[i] = hash value of a substring at i
+ *   pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
  * ```
  *
  * This calculation is performed on each substring and the minimum value is computed
@@ -205,12 +205,12 @@ std::unique_ptr<cudf::column> minhash64(
  *
  * The input strings are first hashed using the given `seed` over substrings
  * of `width` characters. These hash values are then combined with the `a`
- * and `b` values using the following formula:
+ * and `b` parameter values using the following formula:
  * ```
  *   max_hash = max of uint64
  *   mp = (1 << 61) - 1
- *   hv = hash value of a substring
- *   pv[i] = ((hv * a[i] + b[i]) % mp) & max_hash
+ *   hv[i] = hash value of a substring at i
+ *   pv[i] = ((hv[i] * a[i] + b[i]) % mp) & max_hash
  * ```
  *
  * This calculation is performed on each substring and the minimum value is computed
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index fa6e7a30a62..7e45e3ad3fd 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -175,7 +175,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
 // The intermediate values are stored in shared-memory and therefore limits this count.
 // This value was found to be the most efficient size for both uint32 and uint64
 // hash types based on benchmarks.
-constexpr cuda::std::size_t calc_chunk_size = 16;
+constexpr cuda::std::size_t params_per_thread = 16;
 
 // this is deprecated and to be removed in the future;
 // keeping it for now for verifying results from the faster kernels below
@@ -208,19 +208,19 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
   // Found to be the most efficient shared memory size for both hash types
-  __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)];
+  __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)];
   auto const block_values = reinterpret_cast<hash_value_type*>(shmem);
 
   auto const hasher = HashFunction(seed);
 
-  for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) {
+  for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) {
     // initialize working memory
-    auto const tile_hashes = block_values + (lane_idx * calc_chunk_size);
-    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + calc_chunk_size, init);
+    auto const tile_hashes = block_values + (lane_idx * params_per_thread);
+    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + params_per_thread, init);
     block.sync();
 
     auto const param_count =
-      cuda::std::min(static_cast<cuda::std::size_t>(calc_chunk_size), parameter_a.size() - i);
+      cuda::std::min(static_cast<cuda::std::size_t>(params_per_thread), parameter_a.size() - i);
 
     // each lane hashes 'width' substrings of d_str
     for (auto itr = begin; itr < end; itr += block_size) {
@@ -244,7 +244,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
         // permutation formula used by datatrove
         hash_value_type const v =
           ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
-        auto const block_idx    = ((param_idx % calc_chunk_size) * block_size) + lane_idx;
+        auto const block_idx    = ((param_idx % params_per_thread) * block_size) + lane_idx;
         block_values[block_idx] = cuda::std::min(v, block_values[block_idx]);
       }
     }
@@ -284,7 +284,7 @@ constexpr cudf::size_type blocks_per_string = 64;
  * @param seed The seed used for the hash function
  * @param width Width in characters used for determining substrings to hash
  * @param d_hashes The resulting hash values are stored here
- * @param count Stores the number of strings above wide_string_threshold
+ * @param threshold_count Stores the number of strings above wide_string_threshold
  * @param param_count Number of parameters (used for proactive initialize)
  * @param d_results Final results vector (used for proactive initialize)
  */
@@ -351,7 +351,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
       cuda::atomic_ref<cudf::size_type, cuda::thread_scope_device> ref{*threshold_count};
       ref.fetch_add(1, cuda::std::memory_order_relaxed);
     }
-    // initialize the output for wider strings
+    // initialize the output -- only needed for wider strings
     auto d_output = d_results + (str_idx * param_count);
     for (auto i = lane_idx; i < param_count; i += tile_size) {
       d_output[i] = std::numeric_limits<hash_value_type>::max();
@@ -362,8 +362,8 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
 /**
  * @brief Permutation calculation kernel
  *
- * This kernel computes uses the hashes from the minhash_seed_kernel to
- * compute the results using the parameter_a and parameter_b values.
+ * This kernel uses the hashes from the minhash_seed_kernel and the parameter_a and
+ * parameter_b values to compute the final output results.
  * The output is the number of input rows (N) by the number of parameter values (M).
  * Each output[i] is the calculated result for parameter_a/b[0:M].
  *
@@ -433,20 +433,20 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
   // found to be the most efficient shared memory size for both hash types
-  __shared__ char shmem[block_size * calc_chunk_size * sizeof(hash_value_type)];
+  __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)];
   auto const block_values = reinterpret_cast<hash_value_type*>(shmem);
 
-  for (std::size_t i = 0; i < parameter_a.size(); i += calc_chunk_size) {
-    // initialize our chunk of shared memory
-    // each thread handles calc_chunk_size of values
-    auto const chunk_values = block_values + (lane_idx * calc_chunk_size);
-    thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + calc_chunk_size, init);
+  for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) {
+    // initialize this block's chunk of shared memory
+    // each thread handles params_per_thread of values
+    auto const chunk_values = block_values + (lane_idx * params_per_thread);
+    thrust::uninitialized_fill(thrust::seq, chunk_values, chunk_values + params_per_thread, init);
     block.sync();
 
     auto const param_count =
-      cuda::std::min(static_cast<cuda::std::size_t>(calc_chunk_size), parameter_a.size() - i);
+      cuda::std::min(static_cast<cuda::std::size_t>(params_per_thread), parameter_a.size() - i);
 
-    // each lane accumulates min hashes in its spot in shared memory
+    // each lane accumulates min hashes in its shared memory
     for (auto itr = begin; itr < end; itr += block_size) {
       auto const hv = *itr;
       if (hv == 0) { continue; }  // 0 is used as a skip sentinel
@@ -455,14 +455,14 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
         // permutation formula used by datatrove
         hash_value_type const v =
           ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
-        auto const block_idx    = ((param_idx % calc_chunk_size) * block_size) + lane_idx;
+        auto const block_idx    = ((param_idx % params_per_thread) * block_size) + lane_idx;
         block_values[block_idx] = cuda::std::min(v, block_values[block_idx]);
       }
     }
     block.sync();
 
     // reduce each parameter values vector to a single min value
-    // assumes that the block_size > calc_chunk_size
+    // assumes that the block_size > params_per_thread
     // each thread reduces a block_size of parameter values (thread per parameter)
     if (lane_idx < param_count) {
       auto const values = block_values + (lane_idx * block_size);
@@ -568,7 +568,7 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
 
   // handle the strings above the threshold width
   if (threshold_index < input.size()) {
-    auto count = static_cast<cudf::thread_index_type>(input.size() - threshold_index);
+    auto const count = static_cast<cudf::thread_index_type>(input.size() - threshold_index);
     auto d_indices =
       cudf::device_span<cudf::size_type const>(indices.data() + threshold_index, count);
     cudf::detail::grid_1d grid{count * block_size * blocks_per_string, block_size};
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 4ca0607ede8..0f4d0ce0dc2 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -169,7 +169,7 @@ TEST_F(MinHashTest, MultiSeedWithNullInputRow)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
-TEST_F(MinHashTest, PermutedMultiSeed)
+TEST_F(MinHashTest, Permuted)
 {
   auto input =
     cudf::test::strings_column_wrapper({"doc 1",
@@ -185,31 +185,31 @@ TEST_F(MinHashTest, PermutedMultiSeed)
 
   auto view = cudf::strings_column_view(input);
 
-  auto first = thrust::counting_iterator<uint32_t>(10);
-  auto seeds = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto first  = thrust::counting_iterator<uint32_t>(10);
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
   auto results =
-    nvtext::minhash_permuted(view, 0, cudf::column_view(seeds), cudf::column_view(seeds), 4);
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
 
-  using LCW = cudf::test::lists_column_wrapper<uint32_t>;
+  using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
   // clang-format off
-  LCW expected({
-    LCW{1392101586u,  394869177u,  811528444u},
-    LCW{ 211415830u,  187088503u,  130291444u},
-    LCW{2098117052u,  394869177u,  799753544u},
-    LCW{2264583304u, 2920538364u, 3576493424u},
-    LCW{ 253327882u,   41747273u,  302030804u},
-    LCW{2109809594u, 1017470651u,  326988172u},
-    LCW{1303819864u,  850676747u,  147107852u},
-    LCW{ 736021564u,  720812292u, 1405158760u},
-    LCW{ 902780242u,  134064807u, 1613944636u},
-    LCW{ 547084870u, 1748895564u,  656501844u}
+  LCW32 expected({
+    LCW32{1392101586u,  394869177u,  811528444u},
+    LCW32{ 211415830u,  187088503u,  130291444u},
+    LCW32{2098117052u,  394869177u,  799753544u},
+    LCW32{2264583304u, 2920538364u, 3576493424u},
+    LCW32{ 253327882u,   41747273u,  302030804u},
+    LCW32{2109809594u, 1017470651u,  326988172u},
+    LCW32{1303819864u,  850676747u,  147107852u},
+    LCW32{ 736021564u,  720812292u, 1405158760u},
+    LCW32{ 902780242u,  134064807u, 1613944636u},
+    LCW32{ 547084870u, 1748895564u,  656501844u}
   });
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
-  auto seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
-  auto results64 =
-    nvtext::minhash64_permuted(view, 0, cudf::column_view(seeds64), cudf::column_view(seeds64), 4);
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
+  auto results64 = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
 
   using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
   // clang-format off
@@ -229,6 +229,41 @@ TEST_F(MinHashTest, PermutedMultiSeed)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
+TEST_F(MinHashTest, PermutedWide)
+{
+  std::string const small(2 << 10, 'x');  // below wide_string_threshold
+  std::string const wide(2 << 19, 'y');   // above wide_string_threshold
+  auto input = cudf::test::strings_column_wrapper({small, wide});
+  auto view  = cudf::strings_column_view(input);
+
+  auto first  = thrust::counting_iterator<uint32_t>(20);
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 3);
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+
+  using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
+  // clang-format off
+  LCW32 expected({
+    LCW32{1731998032u,  315359380u, 3193688024u},
+    LCW32{1293098788u, 2860992281u,  133918478u}
+  });
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 3);
+  auto results64 = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+
+  using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
+  // clang-format off
+   LCW64 expected64({
+     LCW64{1818322427062143853ul, 641024893347719371ul, 1769570368846988848ul},
+     LCW64{1389920339306667795ul, 421787002125838902ul, 1759496674158703968ul}
+   });
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
+}
+
 TEST_F(MinHashTest, WordsMinHash)
 {
   using LCWS    = cudf::test::lists_column_wrapper<cudf::string_view>;

From 081546f960d2efa1ca4ce01ef3ec89ab676c4e5c Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 25 Oct 2024 16:04:47 -0400
Subject: [PATCH 18/25] reinstate original non-permuted code

---
 cpp/benchmarks/CMakeLists.txt    |   4 +-
 cpp/src/text/minhash.cu          | 176 +++++++-----------------
 cpp/tests/CMakeLists.txt         |   1 +
 cpp/tests/text/minhash_tests.cpp | 223 +++++--------------------------
 4 files changed, 86 insertions(+), 318 deletions(-)

diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt
index 2a4ac789046..11819dfe281 100644
--- a/cpp/benchmarks/CMakeLists.txt
+++ b/cpp/benchmarks/CMakeLists.txt
@@ -348,8 +348,8 @@ ConfigureNVBench(BINARYOP_NVBENCH binaryop/binaryop.cpp binaryop/compiled_binary
 ConfigureBench(TEXT_BENCH text/subword.cpp)
 
 ConfigureNVBench(
-  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/ngrams.cpp
-  text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
+  TEXT_NVBENCH text/edit_distance.cpp text/hash_ngrams.cpp text/jaccard.cpp text/minhash.cpp
+  text/ngrams.cpp text/normalize.cpp text/replace.cpp text/tokenize.cpp text/vocab.cpp
 )
 
 # ##################################################################################################
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 7e45e3ad3fd..88b61fb6c8c 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -52,56 +52,54 @@ namespace nvtext {
 namespace detail {
 namespace {
 
-constexpr cudf::thread_index_type block_size = 256;
-// for tuning independently from block_size
-constexpr cudf::thread_index_type tile_size = block_size;
-
 /**
  * @brief Compute the minhash of each string for each seed
  *
- * This is a block-per-string algorithm where parallel threads within a block
- * work on a single string row.
+ * This is a warp-per-string algorithm where parallel threads within a warp
+ * work on substrings of a single string row.
  *
  * @tparam HashFunction hash function to use on each substring
  *
  * @param d_strings Strings column to process
  * @param seeds Seeds for hashing each string
  * @param width Substring window size in characters
- * @param working_memory Memory used to hold intermediate hash values
  * @param d_hashes Minhash output values for each string
  */
-template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
+template <
+  typename HashFunction,
+  typename hash_value_type = std::
+    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
 CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
                                 cudf::device_span<hash_value_type const> seeds,
                                 cudf::size_type width,
-                                hash_value_type* working_memory,
                                 hash_value_type* d_hashes)
 {
-  auto const idx     = cudf::detail::grid_1d::global_thread_id();
-  auto const str_idx = idx / block_size;
-  if (str_idx >= d_strings.size()) { return; }
+  auto const idx = static_cast<std::size_t>(threadIdx.x + blockIdx.x * blockDim.x);
+  if (idx >= (static_cast<std::size_t>(d_strings.size()) *
+              static_cast<std::size_t>(cudf::detail::warp_size))) {
+    return;
+  }
+
+  auto const str_idx  = static_cast<cudf::size_type>(idx / cudf::detail::warp_size);
+  auto const lane_idx = static_cast<cudf::size_type>(idx % cudf::detail::warp_size);
+
   if (d_strings.is_null(str_idx)) { return; }
 
   auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
-  auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
-  auto const lane_idx = idx % block_size;
-
-  auto tile_hashes = working_memory + (str_idx * block_size * seeds.size());
+  auto const d_output = d_hashes + (str_idx * seeds.size());
 
-  // initialize working memory
-  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += block_size) {
-    auto begin = tile_hashes + (seed_idx * block_size);
-    thrust::uninitialized_fill(thrust::seq, begin, begin + block_size, init);
+  // initialize hashes output for this string
+  if (lane_idx == 0) {
+    auto const init = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
+    thrust::fill(thrust::seq, d_output, d_output + seeds.size(), init);
   }
-  __syncthreads();
-
-  auto const d_output = d_hashes + (str_idx * seeds.size());
+  __syncwarp();
 
   auto const begin = d_str.data() + lane_idx;
   auto const end   = d_str.data() + d_str.size_bytes();
 
   // each lane hashes 'width' substrings of d_str
-  for (auto itr = begin; itr < end; itr += block_size) {
+  for (auto itr = begin; itr < end; itr += cudf::detail::warp_size) {
     if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
     auto const check_str =  // used for counting 'width' characters
       cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
@@ -109,29 +107,30 @@ CUDF_KERNEL void minhash_kernel(cudf::column_device_view const d_strings,
     if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
 
     auto const hash_str = cudf::string_view(itr, bytes);
+    // hashing with each seed on the same section of the string is 10x faster than
+    // computing the substrings for each seed
     for (std::size_t seed_idx = 0; seed_idx < seeds.size(); ++seed_idx) {
       auto const hasher = HashFunction(seeds[seed_idx]);
-      hash_value_type hv;
+      // hash substring and store the min value
       if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
-        hv = hasher(hash_str);
+        auto const hvalue = hasher(hash_str);
+        cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
+        ref.fetch_min(hvalue, cuda::std::memory_order_relaxed);
       } else {
-        hv = thrust::get<0>(hasher(hash_str));
+        // This code path assumes the use of MurmurHash3_x64_128 which produces 2 uint64 values
+        // but only uses the first uint64 value as requested by the LLM team.
+        auto const hvalue = thrust::get<0>(hasher(hash_str));
+        cuda::atomic_ref<hash_value_type, cuda::thread_scope_block> ref{*(d_output + seed_idx)};
+        ref.fetch_min(hvalue, cuda::std::memory_order_relaxed);
       }
-      tile_hashes[(seed_idx * block_size) + lane_idx] =
-        cuda::std::min(hv, tile_hashes[(seed_idx * block_size) + lane_idx]);
     }
   }
-  __syncthreads();
-
-  // compute final result
-  for (std::size_t seed_idx = lane_idx; seed_idx < seeds.size(); seed_idx += block_size) {
-    auto begin = tile_hashes + (seed_idx * block_size);
-    auto hv    = thrust::reduce(thrust::seq, begin, begin + block_size, init, thrust::minimum{});
-    d_output[seed_idx] = hv;
-  }
 }
 
-template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
+template <
+  typename HashFunction,
+  typename hash_value_type = std::
+    conditional_t<std::is_same_v<typename HashFunction::result_type, uint32_t>, uint32_t, uint64_t>>
 std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                          cudf::device_span<hash_value_type const> seeds,
                                          cudf::size_type width,
@@ -159,108 +158,25 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
                                           mr);
   auto d_hashes = hashes->mutable_view().data<hash_value_type>();
 
-  auto const wm_size  = cudf::util::round_up_safe(seeds.size() * tile_size * input.size(),
-                                                 static_cast<std::size_t>(block_size));
-  auto working_memory = rmm::device_uvector<hash_value_type>(wm_size, stream);
-
-  cudf::detail::grid_1d grid{static_cast<cudf::thread_index_type>(input.size()) * tile_size,
-                             block_size};
+  constexpr cudf::thread_index_type block_size = 256;
+  cudf::detail::grid_1d grid{
+    static_cast<cudf::thread_index_type>(input.size()) * cudf::detail::warp_size, block_size};
   minhash_kernel<HashFunction><<<grid.num_blocks, grid.num_threads_per_block, 0, stream.value()>>>(
-    *d_strings, seeds, width, working_memory.data(), d_hashes);
+    *d_strings, seeds, width, d_hashes);
 
   return hashes;
 }
 
-// Number of parameter a/b values to process per thread.
+constexpr cudf::thread_index_type block_size = 256;
+// for tuning independently from block_size
+constexpr cudf::thread_index_type tile_size = block_size;
+
+// Number of a/b parameter values to process per thread.
 // The intermediate values are stored in shared-memory and therefore limits this count.
 // This value was found to be the most efficient size for both uint32 and uint64
 // hash types based on benchmarks.
 constexpr cuda::std::size_t params_per_thread = 16;
 
-// this is deprecated and to be removed in the future;
-// keeping it for now for verifying results from the faster kernels below
-template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
-CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_strings,
-                                         hash_value_type seed,
-                                         cudf::device_span<hash_value_type const> parameter_a,
-                                         cudf::device_span<hash_value_type const> parameter_b,
-                                         cudf::size_type width,
-                                         hash_value_type* d_hashes)
-{
-  auto const idx     = cudf::detail::grid_1d::global_thread_id();
-  auto const str_idx = idx / block_size;
-  if (str_idx >= d_strings.size()) { return; }
-  if (d_strings.is_null(str_idx)) { return; }
-
-  auto const block = cooperative_groups::this_thread_block();
-
-  auto const d_str    = d_strings.element<cudf::string_view>(str_idx);
-  auto const init     = d_str.empty() ? 0 : std::numeric_limits<hash_value_type>::max();
-  auto const lane_idx = block.thread_rank();  // idx % block_size;
-
-  auto const d_output = d_hashes + (str_idx * parameter_a.size());
-
-  auto const begin = d_str.data() + lane_idx;
-  auto const end   = d_str.data() + d_str.size_bytes();
-
-  // constants used for the permutation calculations
-  constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
-  constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
-
-  // Found to be the most efficient shared memory size for both hash types
-  __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)];
-  auto const block_values = reinterpret_cast<hash_value_type*>(shmem);
-
-  auto const hasher = HashFunction(seed);
-
-  for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) {
-    // initialize working memory
-    auto const tile_hashes = block_values + (lane_idx * params_per_thread);
-    thrust::uninitialized_fill(thrust::seq, tile_hashes, tile_hashes + params_per_thread, init);
-    block.sync();
-
-    auto const param_count =
-      cuda::std::min(static_cast<cuda::std::size_t>(params_per_thread), parameter_a.size() - i);
-
-    // each lane hashes 'width' substrings of d_str
-    for (auto itr = begin; itr < end; itr += block_size) {
-      if (cudf::strings::detail::is_utf8_continuation_char(*itr)) { continue; }
-      auto const check_str =  // used for counting 'width' characters
-        cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
-      auto const [bytes, left] =
-        cudf::strings::detail::bytes_to_character_position(check_str, width);
-      if ((itr != d_str.data()) && (left > 0)) { continue; }  // true if past the end of the string
-
-      auto const hash_str = cudf::string_view(itr, bytes);
-      hash_value_type hv;
-      if constexpr (std::is_same_v<hash_value_type, uint32_t>) {
-        hv = hasher(hash_str);
-      } else {
-        hv = thrust::get<0>(hasher(hash_str));
-      }
-      hv = cuda::std::max(hv, hash_value_type{1});
-
-      for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) {
-        // permutation formula used by datatrove
-        hash_value_type const v =
-          ((hv * parameter_a[param_idx] + parameter_b[param_idx]) % mersenne_prime) & hash_max;
-        auto const block_idx    = ((param_idx % params_per_thread) * block_size) + lane_idx;
-        block_values[block_idx] = cuda::std::min(v, block_values[block_idx]);
-      }
-    }
-    block.sync();
-
-    // reduce each parameter values vector to a single min value
-    if (lane_idx < param_count) {
-      auto const values = block_values + (lane_idx * block_size);
-      auto const minv =
-        thrust::reduce(thrust::seq, values, values + block_size, init, thrust::minimum{});
-      d_output[lane_idx + i] = minv;
-    }
-    block.sync();
-  }
-}
-
 // Separate kernels are used to process strings above and below this value (in bytes).
 constexpr cudf::size_type wide_string_threshold = 1 << 18;  // 256K
 // The number of blocks per string for the above-threshold kernel processing.
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b78a64d0e55..a4213dcbe94 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -611,6 +611,7 @@ ConfigureTest(
   text/bpe_tests.cpp
   text/edit_distance_tests.cpp
   text/jaccard_tests.cpp
+  text/minhash_tests.cpp
   text/ngrams_tests.cpp
   text/ngrams_tokenize_tests.cpp
   text/normalize_tests.cpp
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 0f4d0ce0dc2..3c0c0068a49 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -32,143 +32,6 @@
 
 struct MinHashTest : public cudf::test::BaseFixture {};
 
-TEST_F(MinHashTest, Basic)
-{
-  auto validity = cudf::test::iterators::null_at(1);
-  auto input =
-    cudf::test::strings_column_wrapper({"doc 1",
-                                        "",
-                                        "this is doc 2",
-                                        "",
-                                        "doc 3",
-                                        "d",
-                                        "The quick brown fox jumpéd over the lazy brown dog.",
-                                        "line eight",
-                                        "line nine",
-                                        "line ten"},
-                                       validity);
-
-  auto view = cudf::strings_column_view(input);
-
-  auto results = nvtext::minhash(view);
-
-  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>({1207251914u,
-                                                                    0u,
-                                                                    21141582u,
-                                                                    0u,
-                                                                    1207251914u,
-                                                                    655955059u,
-                                                                    86520422u,
-                                                                    304329233u,
-                                                                    640477688u,
-                                                                    640477688u},
-                                                                   validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
-
-  auto results64  = nvtext::minhash64(view);
-  auto expected64 = cudf::test::fixed_width_column_wrapper<uint64_t>({774489391575805754ul,
-                                                                      0ul,
-                                                                      3232308021562742685ul,
-                                                                      0ul,
-                                                                      13145552576991307582ul,
-                                                                      14660046701545912182ul,
-                                                                      398062025280761388ul,
-                                                                      1273320923074904938ul,
-                                                                      3456065052701055601ul,
-                                                                      10664519708968191209ul},
-                                                                     validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
-}
-
-TEST_F(MinHashTest, LengthEqualsWidth)
-{
-  auto input   = cudf::test::strings_column_wrapper({"abcdé", "fghjk", "lmnop", "qrstu", "vwxyz"});
-  auto view    = cudf::strings_column_view(input);
-  auto results = nvtext::minhash(view, 0, 5);
-  auto expected = cudf::test::fixed_width_column_wrapper<uint32_t>(
-    {3825281041u, 2728681928u, 1984332911u, 3965004915u, 192452857u});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
-}
-
-TEST_F(MinHashTest, MultiSeed)
-{
-  auto input =
-    cudf::test::strings_column_wrapper({"doc 1",
-                                        "this is doc 2",
-                                        "doc 3",
-                                        "d",
-                                        "The quick brown fox jumpéd over the lazy brown dog.",
-                                        "line six",
-                                        "line seven",
-                                        "line eight",
-                                        "line nine",
-                                        "line ten"});
-
-  auto view = cudf::strings_column_view(input);
-
-  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({0, 1, 2});
-  auto results = nvtext::minhash(view, cudf::column_view(seeds));
-
-  using LCW = cudf::test::lists_column_wrapper<uint32_t>;
-  // clang-format off
-  LCW expected({LCW{1207251914u, 1677652962u, 1061355987u},
-                LCW{  21141582u,  580916568u, 1258052021u},
-                LCW{1207251914u,  943567174u, 1109272887u},
-                LCW{ 655955059u,  488346356u, 2394664816u},
-                LCW{  86520422u,  236622901u,  102546228u},
-                LCW{ 640477688u,  198451716u,  136303992u},
-                LCW{ 640477688u,  198451716u,  577802054u},
-                LCW{ 304329233u,  198451716u,  714941560u},
-                LCW{ 640477688u,  198451716u,  261342259u},
-                LCW{ 640477688u,  198451716u,  139988887u}});
-  // clang-format on
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
-
-  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({0, 1, 2});
-  auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64));
-
-  using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
-  // clang-format off
-  LCW64 expected64({LCW64{  774489391575805754ul, 10435654231793485448ul,  1188598072697676120ul},
-                    LCW64{ 3232308021562742685ul,  4445611509348165860ul,  1188598072697676120ul},
-                    LCW64{13145552576991307582ul,  6846192680998069919ul,  1188598072697676120ul},
-                    LCW64{14660046701545912182ul, 17106501326045553694ul, 17713478494106035784ul},
-                    LCW64{  398062025280761388ul,   377720198157450084ul,   984941365662009329ul},
-                    LCW64{ 2837259098848821044ul,   650799815433771163ul,  2428991957842356245ul},
-                    LCW64{ 2105419906076957667ul,   650799815433771163ul,  2428991957842356245ul},
-                    LCW64{ 1273320923074904938ul,   650799815433771163ul,  2428991957842356245ul},
-                    LCW64{ 3456065052701055601ul,   650799815433771163ul,  2428991957842356245ul},
-                    LCW64{10664519708968191209ul,   650799815433771163ul,  2428991957842356245ul}});
-  // clang-format on
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
-}
-
-TEST_F(MinHashTest, MultiSeedWithNullInputRow)
-{
-  auto validity = cudf::test::iterators::null_at(1);
-  auto input    = cudf::test::strings_column_wrapper({"abcdéfgh", "", "", "stuvwxyz"}, validity);
-  auto view     = cudf::strings_column_view(input);
-
-  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2});
-  auto results = nvtext::minhash(view, cudf::column_view(seeds));
-
-  using LCW = cudf::test::lists_column_wrapper<uint32_t>;
-  LCW expected({LCW{484984072u, 1074168784u}, LCW{}, LCW{0u, 0u}, LCW{571652169u, 173528385u}},
-               validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
-
-  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({11, 22});
-  auto results64 = nvtext::minhash64(view, cudf::column_view(seeds64));
-
-  using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
-  LCW64 expected64({LCW64{2597399324547032480ul, 4461410998582111052ul},
-                    LCW64{},
-                    LCW64{0ul, 0ul},
-                    LCW64{2717781266371273264ul, 6977325820868387259ul}},
-                   validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
-}
-
 TEST_F(MinHashTest, Permuted)
 {
   auto input =
@@ -264,48 +127,17 @@ TEST_F(MinHashTest, PermutedWide)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
-TEST_F(MinHashTest, WordsMinHash)
-{
-  using LCWS    = cudf::test::lists_column_wrapper<cudf::string_view>;
-  auto validity = cudf::test::iterators::null_at(1);
-
-  LCWS input(
-    {LCWS({"hello", "abcdéfgh"}),
-     LCWS{},
-     LCWS({"rapids", "moré", "test", "text"}),
-     LCWS({"The", "quick", "brown", "fox", "jumpéd", "over", "the", "lazy", "brown", "dog"})},
-    validity);
-
-  auto view = cudf::lists_column_view(input);
-
-  auto seeds   = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2});
-  auto results = nvtext::word_minhash(view, cudf::column_view(seeds));
-  using LCW32  = cudf::test::lists_column_wrapper<uint32_t>;
-  LCW32 expected({LCW32{2069617641u, 1975382903u},
-                  LCW32{},
-                  LCW32{657297235u, 1010955999u},
-                  LCW32{644643885u, 310002789u}},
-                 validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
-
-  auto seeds64   = cudf::test::fixed_width_column_wrapper<uint64_t>({11, 22});
-  auto results64 = nvtext::word_minhash64(view, cudf::column_view(seeds64));
-  using LCW64    = cudf::test::lists_column_wrapper<uint64_t>;
-  LCW64 expected64({LCW64{1940333969930105370ul, 272615362982418219ul},
-                    LCW64{},
-                    LCW64{5331949571924938590ul, 2088583894581919741ul},
-                    LCW64{3400468157617183341ul, 2398577492366130055ul}},
-                   validity);
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
-}
-
 TEST_F(MinHashTest, EmptyTest)
 {
-  auto input   = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
-  auto view    = cudf::strings_column_view(input->view());
-  auto results = nvtext::minhash(view);
+  auto input  = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
+  auto view   = cudf::strings_column_view(input->view());
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>({1, 2, 3});
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
   EXPECT_EQ(results->size(), 0);
-  results = nvtext::minhash64(view);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>({1, 2, 3});
+  results       = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
   EXPECT_EQ(results->size(), 0);
 }
 
@@ -313,20 +145,39 @@ TEST_F(MinHashTest, ErrorsTest)
 {
   auto input = cudf::test::strings_column_wrapper({"this string intentionally left blank"});
   auto view  = cudf::strings_column_view(input);
-  EXPECT_THROW(nvtext::minhash(view, 0, 0), std::invalid_argument);
-  EXPECT_THROW(nvtext::minhash64(view, 0, 0), std::invalid_argument);
-  auto seeds = cudf::test::fixed_width_column_wrapper<uint32_t>();
-  EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::invalid_argument);
-  auto seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t>();
-  EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::invalid_argument);
+  auto empty = cudf::test::fixed_width_column_wrapper<uint32_t>();
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 0),
+    std::invalid_argument);
+  auto empty64 = cudf::test::fixed_width_column_wrapper<uint64_t>();
+  EXPECT_THROW(
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 0),
+    std::invalid_argument);
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(empty), cudf::column_view(empty), 4),
+    std::invalid_argument);
+  EXPECT_THROW(
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(empty64), cudf::column_view(empty64), 4),
+    std::invalid_argument);
 
   std::vector<std::string> h_input(50000, "");
   input = cudf::test::strings_column_wrapper(h_input.begin(), h_input.end());
   view  = cudf::strings_column_view(input);
 
   auto const zeroes = thrust::constant_iterator<uint32_t>(0);
-  seeds             = cudf::test::fixed_width_column_wrapper<uint32_t>(zeroes, zeroes + 50000);
-  EXPECT_THROW(nvtext::minhash(view, cudf::column_view(seeds)), std::overflow_error);
-  seeds64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
-  EXPECT_THROW(nvtext::minhash64(view, cudf::column_view(seeds64)), std::overflow_error);
+  auto params       = cudf::test::fixed_width_column_wrapper<uint32_t>(zeroes, zeroes + 50000);
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4),
+    std::overflow_error);
+  auto params64 = cudf::test::fixed_width_column_wrapper<uint64_t>(zeroes, zeroes + 50000);
+  EXPECT_THROW(nvtext::minhash64_permuted(
+                 view, 0, cudf::column_view(params64), cudf::column_view(params64), 4),
+               std::overflow_error);
+
+  EXPECT_THROW(
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(empty), 4),
+    std::invalid_argument);
+  EXPECT_THROW(
+    nvtext::minhash64_permuted(view, 0, cudf::column_view(params64), cudf::column_view(empty64), 4),
+    std::invalid_argument);
 }

From ef7bb46cc72061a1cfb7e252e65a714f3e62994b Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 28 Oct 2024 11:14:19 -0400
Subject: [PATCH 19/25] add gtest for parameter chunking

---
 cpp/src/text/minhash.cu          | 37 ++++++++++++---------
 cpp/tests/text/minhash_tests.cpp | 57 ++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 16 deletions(-)

diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 88b61fb6c8c..362672e74bb 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -168,12 +168,12 @@ std::unique_ptr<cudf::column> minhash_fn(cudf::strings_column_view const& input,
 }
 
 constexpr cudf::thread_index_type block_size = 256;
-// for tuning independently from block_size
+// for potentially tuning minhash_seed_kernel independently from block_size
 constexpr cudf::thread_index_type tile_size = block_size;
 
 // Number of a/b parameter values to process per thread.
 // The intermediate values are stored in shared-memory and therefore limits this count.
-// This value was found to be the most efficient size for both uint32 and uint64
+// This value was found to be an efficient size for both uint32 and uint64
 // hash types based on benchmarks.
 constexpr cuda::std::size_t params_per_thread = 16;
 
@@ -181,7 +181,7 @@ constexpr cuda::std::size_t params_per_thread = 16;
 constexpr cudf::size_type wide_string_threshold = 1 << 18;  // 256K
 // The number of blocks per string for the above-threshold kernel processing.
 constexpr cudf::size_type blocks_per_string = 64;
-// These values determined using redpajama and books_sample datasets
+// The above values were determined using the redpajama and books_sample datasets
 
 /**
  * @brief Hashing kernel launched as a thread per tile-size (block or warp)
@@ -201,8 +201,8 @@ constexpr cudf::size_type blocks_per_string = 64;
  * @param width Width in characters used for determining substrings to hash
  * @param d_hashes The resulting hash values are stored here
  * @param threshold_count Stores the number of strings above wide_string_threshold
- * @param param_count Number of parameters (used for proactive initialize)
- * @param d_results Final results vector (used for proactive initialize)
+ * @param param_count Number of parameters (used for the proactive initialize)
+ * @param d_results Final results vector (used for the proactive initialize)
  */
 template <typename HashFunction, typename hash_value_type = typename HashFunction::result_type>
 CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
@@ -218,7 +218,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
   if (str_idx >= d_strings.size()) { return; }
   if (d_strings.is_null(str_idx)) { return; }
 
-  // retrieve this strings offset to locate the output position in d_hashes
+  // retrieve this string's offset to locate the output position in d_hashes
   auto const offsets = d_strings.child(cudf::strings_column_view::offsets_column_index);
   auto const offsets_itr =
     cudf::detail::input_offsetalator(offsets.head(), offsets.type(), d_strings.offset());
@@ -229,7 +229,7 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
   auto const d_str    = cudf::string_view(d_strings.head<char>() + offset, size_bytes);
   auto const lane_idx = tid % tile_size;
 
-  // hashes for this string/thread stored here
+  // hashes for this string/thread are stored here
   auto seed_hashes = d_hashes + offset - offsets_itr[0] + lane_idx;
 
   auto const begin  = d_str.data() + lane_idx;
@@ -244,7 +244,8 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
     auto const check_str =  // used for counting 'width' characters
       cudf::string_view(itr, static_cast<cudf::size_type>(thrust::distance(itr, end)));
     auto const [bytes, left] = cudf::strings::detail::bytes_to_character_position(check_str, width);
-    if ((itr != d_str.data()) && (left > 0)) {  // true if past the end of the string
+    if ((itr != d_str.data()) && (left > 0)) {
+      // true itr+width is past the end of the string
       *seed_hashes = 0;
       continue;
     }
@@ -291,10 +292,10 @@ CUDF_KERNEL void minhash_seed_kernel(cudf::column_device_view const d_strings,
  * @tparam blocks_per_string Number of blocks used to process each string
  *
  * @param d_strings The input strings to hash
- * @param indices The indices of the strings in d_strings to process here
+ * @param indices The indices of the strings in d_strings to process
  * @param parameter_a 1st set of parameters for the calculation result
  * @param parameter_b 2nd set of parameters for the calculation result
- * @param width Width used for calculating the number of available hashes in each string
+ * @param width Used for calculating the number of available hashes in each string
  * @param d_hashes The hash values computed in minhash_seed_kernel
  * @param d_results Final results vector of calculate values
  */
@@ -323,13 +324,16 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   auto const size_bytes = static_cast<cudf::size_type>(offsets_itr[str_idx + 1] - offset);
 
   // number of items to process in this block;
-  // last block includes any remainder values from the size_bytes/blocks_per_string truncation
+  // last block also includes any remainder values from the size_bytes/blocks_per_string truncation
+  // example:
+  //  each section_size for string with size 588090 and blocks_per_string=64 is 9188
+  //  except the last section which is 9188 + (588090 % 64) = 9246
   auto const section_size =
     (size_bytes / blocks_per_string) +
     (section_idx < (blocks_per_string - 1) ? 0 : size_bytes % blocks_per_string);
   auto const section_offset = section_idx * (size_bytes / blocks_per_string);
 
-  // hash values for this block
+  // hash values for this block/section
   auto const seed_hashes = d_hashes + offset - offsets_itr[0] + section_offset;
   // width used here as a max value since a string's char-count <= byte-count
   auto const hashes_size =
@@ -348,7 +352,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   constexpr uint64_t mersenne_prime  = (1UL << 61) - 1;
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
-  // found to be the most efficient shared memory size for both hash types
+  // found to be an efficient shared memory size for both hash types
   __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)];
   auto const block_values = reinterpret_cast<hash_value_type*>(shmem);
 
@@ -365,7 +369,8 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
     // each lane accumulates min hashes in its shared memory
     for (auto itr = begin; itr < end; itr += block_size) {
       auto const hv = *itr;
-      if (hv == 0) { continue; }  // 0 is used as a skip sentinel
+      // 0 is used as a skip sentinel for UTF-8 and trailing bytes
+      if (hv == 0) { continue; }
 
       for (std::size_t param_idx = i; param_idx < (i + param_count); ++param_idx) {
         // permutation formula used by datatrove
@@ -377,8 +382,8 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
     }
     block.sync();
 
-    // reduce each parameter values vector to a single min value
-    // assumes that the block_size > params_per_thread
+    // reduce each parameter values vector to a single min value;
+    // assumes that the block_size > params_per_thread;
     // each thread reduces a block_size of parameter values (thread per parameter)
     if (lane_idx < param_count) {
       auto const values = block_values + (lane_idx * block_size);
diff --git a/cpp/tests/text/minhash_tests.cpp b/cpp/tests/text/minhash_tests.cpp
index 3c0c0068a49..1332edb4675 100644
--- a/cpp/tests/text/minhash_tests.cpp
+++ b/cpp/tests/text/minhash_tests.cpp
@@ -127,6 +127,63 @@ TEST_F(MinHashTest, PermutedWide)
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
 }
 
+TEST_F(MinHashTest, PermutedManyParameters)
+{
+  std::string const small(2 << 10, 'x');
+  std::string const wide(2 << 19, 'y');
+  auto input = cudf::test::strings_column_wrapper({small, wide});
+  auto view  = cudf::strings_column_view(input);
+
+  auto first = thrust::counting_iterator<uint32_t>(20);
+  // more than params_per_thread
+  auto params = cudf::test::fixed_width_column_wrapper<uint32_t>(first, first + 31);
+  auto results =
+    nvtext::minhash_permuted(view, 0, cudf::column_view(params), cudf::column_view(params), 4);
+
+  using LCW32 = cudf::test::lists_column_wrapper<uint32_t>;
+  // clang-format off
+  LCW32 expected({
+    LCW32{1731998032u,  315359380u, 3193688024u, 1777049372u,  360410720u, 3238739364u, 1822100712u,  405462060u,
+          3283790704u, 1867152052u,  450513400u, 3328842044u, 1912203392u,  495564740u, 3373893384u, 1957254732u,
+           540616080u, 3418944724u, 2002306072u,  585667420u, 3463996064u, 2047357412u,  630718760u, 3509047404u,
+          2092408752u,  675770100u, 3554098744u, 2137460092u,  720821440u, 3599150084u, 2182511432u},
+    LCW32{1293098788u, 2860992281u,  133918478u, 1701811971u, 3269705464u,  542631661u, 2110525154u, 3678418647u,
+           951344844u, 2519238337u, 4087131830u, 1360058027u, 2927951520u,  200877717u, 1768771210u, 3336664703u,
+           609590900u, 2177484393u, 3745377886u, 1018304083u, 2586197576u, 4154091069u, 1427017266u, 2994910759u,
+           267836956u, 1835730449u, 3403623942u,  676550139u, 2244443632u, 3812337125u, 1085263322u}
+  });
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
+
+  // more than params_per_thread
+  auto params64  = cudf::test::fixed_width_column_wrapper<uint64_t, uint32_t>(first, first + 31);
+  auto results64 = nvtext::minhash64_permuted(
+    view, 0, cudf::column_view(params64), cudf::column_view(params64), 4);
+
+  using LCW64 = cudf::test::lists_column_wrapper<uint64_t>;
+  // clang-format off
+   LCW64 expected64({
+     LCW64{1818322427062143853,  641024893347719371, 1769570368846988848, 592272835132564366,
+           1720818310631833835,  543520776917409353, 1672066252416678822, 494768718702254348,
+           1623314194201523817,  446016660487099335, 1574562135986368804, 397264602271944322,
+           1525810077771213799,  348512544056789317, 1477058019556058786, 299760485841634304,
+           1428305961340903773,  251008427626479291, 1379553903125748768, 202256369411324286,
+           1330801844910593755,  153504311196169273, 1282049786695438742, 104752252981014268,
+           1233297728480283737,   56000194765859255, 1184545670265128724,   7248136550704242,
+           1135793612049973719, 2264339087549243188, 1087041553834818706},
+     LCW64{1389920339306667795,  421787002125838902, 1759496674158703968,  791363336977875075,
+           2129073009010740141, 1160939671829911248,  192806334649082363, 1530516006681947421,
+            562382669501118536, 1900092341533983602,  931959004353154709, 2269668676386019775,
+           1301535339205190882,  333402002024361997, 1671111674057227055,  702978336876398170,
+           2040688008909263228, 1072554671728434343,  104421334547605450, 1442131006580470516,
+            473997669399641631, 1811707341432506689,  843574004251677804, 2181283676284542862,
+           1213150339103713977,  245017001922885084, 1582726673955750150,  614593336774921257,
+           1952303008807786323,  984169671626957438,   16036334446128545}
+   });
+  // clang-format on
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results64, expected64);
+}
+
 TEST_F(MinHashTest, EmptyTest)
 {
   auto input  = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});

From 2a9928a09c104d68e0ed99f8409eb9ecdc8b6402 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Mon, 28 Oct 2024 11:15:34 -0400
Subject: [PATCH 20/25] move pytests to use permuted api

---
 .../cudf/cudf/tests/text/test_text_methods.py | 46 ++++++-------------
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 12 ++---
 .../pylibcudf/tests/test_nvtext_minhash.py    | 10 ++--
 3 files changed, 26 insertions(+), 42 deletions(-)

diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 997ca357986..6ec9348dc8d 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -885,65 +885,45 @@ def test_is_vowel_consonant():
 def test_minhash():
     strings = cudf.Series(["this is my", "favorite book", None, ""])
 
+    params = cudf.Series([1, 2, 3], dtype=np.uint32)
     expected = cudf.Series(
         [
-            cudf.Series([21141582], dtype=np.uint32),
-            cudf.Series([962346254], dtype=np.uint32),
-            None,
-            cudf.Series([0], dtype=np.uint32),
-        ]
-    )
-    actual = strings.str.minhash()
-    assert_eq(expected, actual)
-    seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-    expected = cudf.Series(
-        [
-            cudf.Series([1305480167, 668155704, 34311509], dtype=np.uint32),
-            cudf.Series([32665384, 3470118, 363147162], dtype=np.uint32),
+            cudf.Series([1305480168, 462824406, 74608229], dtype=np.uint32),
+            cudf.Series([32665385, 65330770, 97996155], dtype=np.uint32),
             None,
             cudf.Series([0, 0, 0], dtype=np.uint32),
         ]
     )
-    actual = strings.str.minhash(seeds=seeds, width=5)
+    actual = strings.str.minhash_permuted(0, a=params, b=params, width=5)
     assert_eq(expected, actual)
 
-    expected = cudf.Series(
-        [
-            cudf.Series([3232308021562742685], dtype=np.uint64),
-            cudf.Series([23008204270530356], dtype=np.uint64),
-            None,
-            cudf.Series([0], dtype=np.uint64),
-        ]
-    )
-    actual = strings.str.minhash64()
-    assert_eq(expected, actual)
-    seeds = cudf.Series([0, 1, 2], dtype=np.uint64)
+    params = cudf.Series([1, 2, 3], dtype=np.uint64)
     expected = cudf.Series(
         [
             cudf.Series(
-                [7082801294247314046, 185949556058924788, 167570629329462454],
+                [105531920695060180, 172452388517576009, 316595762085180524],
                 dtype=np.uint64,
             ),
             cudf.Series(
-                [382665377781028452, 86243762733551437, 7688750597953083512],
+                [35713768479063122, 71427536958126236, 58787297728258212],
                 dtype=np.uint64,
             ),
             None,
             cudf.Series([0, 0, 0], dtype=np.uint64),
         ]
     )
-    actual = strings.str.minhash64(seeds=seeds, width=5)
+    actual = strings.str.minhash64_permuted(0, a=params, b=params, width=5)
     assert_eq(expected, actual)
 
     # test wrong seed types
     with pytest.raises(ValueError):
-        strings.str.minhash(seeds="a")
+        strings.str.minhash_permuted(1, a="a", b="b", width=7)
     with pytest.raises(ValueError):
-        seeds = cudf.Series([0, 1, 2], dtype=np.int32)
-        strings.str.minhash(seeds=seeds)
+        params = cudf.Series([0, 1, 2], dtype=np.int32)
+        strings.str.minhash_permuted(1, a=params, b=params, width=6)
     with pytest.raises(ValueError):
-        seeds = cudf.Series([0, 1, 2], dtype=np.uint32)
-        strings.str.minhash64(seeds=seeds)
+        params = cudf.Series([0, 1, 2], dtype=np.uint32)
+        strings.str.minhash64_permuted(1, a=params, b=params, width=8)
 
 
 def test_word_minhash():
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index 8c2c6cc7f4d..574d586b9da 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -68,7 +68,7 @@ cpdef Column minhash_permuted(
     Returns the minhash values for each string.
     This function uses MurmurHash3_x86_32 for the hash algorithm.
 
-    For details, see :cpp:func:`minhash`.
+    For details, see :cpp:func:`minhash_permuted`.
 
     Parameters
     ----------
@@ -77,9 +77,9 @@ cpdef Column minhash_permuted(
     seed : uint32_t
         Seed used for the hash function
     a : Column
-        Seed value(s) used for the hash algorithm.
+        1st parameter value used for the minhash algorithm.
     b : Column
-        2nd seed value(s) used for the hash algorithm.
+        2nd parameter value used for the minhash algorithm.
     width : size_type
         Character width used for apply substrings;
 
@@ -151,7 +151,7 @@ cpdef Column minhash64_permuted(
     Returns the minhash values for each string.
     This function uses MurmurHash3_x64_128 for the hash algorithm.
 
-    For details, see :cpp:func:`minhash`.
+    For details, see :cpp:func:`minhash64_permuted`.
 
     Parameters
     ----------
@@ -160,9 +160,9 @@ cpdef Column minhash64_permuted(
     seed : uint64_t
         Seed used for the hash function
     a : Column
-        Seed value(s) used for the hash algorithm.
+        1st parameter value used for the minhash algorithm.
     b : Column
-        2nd seed value(s) used for the hash algorithm.
+        2nd parameter value used for the minhash algorithm.
     width : size_type
         Character width used for apply substrings;
 
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
index ead9ee094af..918d73c2c30 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -24,12 +24,16 @@ def word_minhash_input_data(request):
 def test_minhash(minhash_input_data, width):
     input_arr, seeds, seed_type = minhash_input_data
     minhash_func = (
-        plc.nvtext.minhash.minhash
+        plc.nvtext.minhash.minhash_permuted
         if seed_type == pa.uint32()
-        else plc.nvtext.minhash.minhash64
+        else plc.nvtext.minhash.minhash64_permuted
     )
     result = minhash_func(
-        plc.interop.from_arrow(input_arr), plc.interop.from_arrow(seeds), width
+        plc.interop.from_arrow(input_arr),
+        0,
+        plc.interop.from_arrow(seeds),
+        plc.interop.from_arrow(seeds),
+        width,
     )
     pa_result = plc.interop.to_arrow(result)
     assert all(len(got) == len(seeds) for got, s in zip(pa_result, input_arr))

From a39b7388428f0d09fc805310efbd7d82ead18479 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Tue, 29 Oct 2024 11:54:19 -0400
Subject: [PATCH 21/25] add docstring for new APIs

---
 python/cudf/cudf/_lib/nvtext/minhash.pyx      | 30 ++++----
 python/cudf/cudf/core/column/string.py        | 71 ++++++++++++++++++-
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 28 ++++----
 .../pylibcudf/tests/test_nvtext_minhash.py    |  2 +-
 4 files changed, 98 insertions(+), 33 deletions(-)

diff --git a/python/cudf/cudf/_lib/nvtext/minhash.pyx b/python/cudf/cudf/_lib/nvtext/minhash.pyx
index a95f8edb3c8..25cfcf99ca6 100644
--- a/python/cudf/cudf/_lib/nvtext/minhash.pyx
+++ b/python/cudf/cudf/_lib/nvtext/minhash.pyx
@@ -21,14 +21,15 @@ def minhash(Column input, Column seeds, int width=4):
 
 @acquire_spill_lock()
 def minhash_permuted(Column input, uint32_t seed, Column a, Column b, int width):
-    result = nvtext.minhash.minhash_permuted(
-        input.to_pylibcudf(mode="read"),
-        seed,
-        a.to_pylibcudf(mode="read"),
-        b.to_pylibcudf(mode="read"),
-        width,
+    return Column.from_pylibcudf(
+        nvtext.minhash.minhash_permuted(
+            input.to_pylibcudf(mode="read"),
+            seed,
+            a.to_pylibcudf(mode="read"),
+            b.to_pylibcudf(mode="read"),
+            width,
+        )
     )
-    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
@@ -43,14 +44,15 @@ def minhash64(Column input, Column seeds, int width=4):
 
 @acquire_spill_lock()
 def minhash64_permuted(Column input, uint64_t seed, Column a, Column b, int width):
-    result = nvtext.minhash.minhash64_permuted(
-        input.to_pylibcudf(mode="read"),
-        seed,
-        a.to_pylibcudf(mode="read"),
-        b.to_pylibcudf(mode="read"),
-        width,
+    return Column.from_pylibcudf(
+        nvtext.minhash.minhash64_permuted(
+            input.to_pylibcudf(mode="read"),
+            seed,
+            a.to_pylibcudf(mode="read"),
+            b.to_pylibcudf(mode="read"),
+            width,
+        )
     )
-    return Column.from_pylibcudf(result)
 
 
 @acquire_spill_lock()
diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index f52a564c052..f8d20f9d21c 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5351,8 +5351,41 @@ def minhash(
         )
 
     def minhash_permuted(
-        self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int = 4
+        self, seed: np.uint32, a: ColumnLike, b: ColumnLike, width: int
     ) -> SeriesOrIndex:
+        """
+        Compute the minhash of a strings column.
+        This uses the MurmurHash3_x86_32 algorithm for the hash function.
+
+        Calculation uses the formula (hv * a + b) % mersenne_prime
+        where hv is the hash of a substring of width characters,
+        a and b are provided values and mersenne_prime is 2^61-1.
+
+        Parameters
+        ----------
+        seed : uint32
+            The seed used for the hash algorithm.
+        a : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint32.
+        b : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint32.
+        width : int
+            The width of the substring to hash.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> s = cudf.Series(['this is my', 'favorite book'])
+        >>> a = cudf.Series([1, 2, 3], dtype=np.uint32)
+        >>> b = cudf.Series([4, 5, 6], dtype=np.uint32)
+        >>> s.str.minhash_permuted(0, a=a, b=b, width=5)
+        0    [1305480171, 462824409, 74608232]
+        1       [32665388, 65330773, 97996158]
+        dtype: list
+        """
         a_column = column.as_column(a)
         if a_column.dtype != np.uint32:
             raise ValueError(
@@ -5410,8 +5443,42 @@ def minhash64(
         )
 
     def minhash64_permuted(
-        self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int = 4
+        self, seed: np.uint64, a: ColumnLike, b: ColumnLike, width: int
     ) -> SeriesOrIndex:
+        """
+        Compute the minhash of a strings column.
+        This uses the MurmurHash3_x64_128 algorithm for the hash function.
+
+        Calculation uses the formula (hv * a + b) % mersenne_prime
+        where hv is the hash of a substring of width characters,
+        a and b are provided values and mersenne_prime is 2^61-1.
+
+        Parameters
+        ----------
+        seed : uint64
+            The seed used for the hash algorithm.
+        a : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint64.
+        b : ColumnLike
+            Values for minhash calculation.
+            Must be of type uint64.
+        width : int
+            The width of the substring to hash.
+
+        Examples
+        --------
+        >>> import cudf
+        >>> import numpy as np
+        >>> s = cudf.Series(['this is my', 'favorite book', 'to read'])
+        >>> a = cudf.Series([2, 3], dtype=np.uint64)
+        >>> b = cudf.Series([5, 6], dtype=np.uint64)
+        >>> s.str.minhash64_permuted(0, a=a, b=b, width=5)
+        0    [172452388517576012, 316595762085180527]
+        1      [71427536958126239, 58787297728258215]
+        2    [423885828176437114, 1140588505926961370]
+        dtype: list
+        """
         a_column = column.as_column(a)
         if a_column.dtype != np.uint64:
             raise ValueError(
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index 574d586b9da..ef5b53ddea0 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -91,14 +91,12 @@ cpdef Column minhash_permuted(
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = move(
-            cpp_minhash_permuted(
-                input.view(),
-                seed,
-                a.view(),
-                b.view(),
-                width
-            )
+        c_result = cpp_minhash_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
         )
 
     return Column.from_libcudf(move(c_result))
@@ -174,14 +172,12 @@ cpdef Column minhash64_permuted(
     cdef unique_ptr[column] c_result
 
     with nogil:
-        c_result = move(
-            cpp_minhash64_permuted(
-                input.view(),
-                seed,
-                a.view(),
-                b.view(),
-                width
-            )
+        c_result = cpp_minhash64_permuted(
+            input.view(),
+            seed,
+            a.view(),
+            b.view(),
+            width
         )
 
     return Column.from_libcudf(move(c_result))
diff --git a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
index 918d73c2c30..ec533e64307 100644
--- a/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
+++ b/python/pylibcudf/pylibcudf/tests/test_nvtext_minhash.py
@@ -21,7 +21,7 @@ def word_minhash_input_data(request):
 
 
 @pytest.mark.parametrize("width", [5, 12])
-def test_minhash(minhash_input_data, width):
+def test_minhash_permuted(minhash_input_data, width):
     input_arr, seeds, seed_type = minhash_input_data
     minhash_func = (
         plc.nvtext.minhash.minhash_permuted

From 1989c62a03d4809bc79ee3df7cae6e71f9e3b34a Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 6 Nov 2024 10:15:30 -0500
Subject: [PATCH 22/25] change test_minhash to test_minhash_permuted

---
 python/cudf/cudf/tests/text/test_text_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index 6ec9348dc8d..47e541fdcef 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -882,7 +882,7 @@ def test_is_vowel_consonant():
     assert_eq(expected, actual)
 
 
-def test_minhash():
+def test_minhash_permuted():
     strings = cudf.Series(["this is my", "favorite book", None, ""])
 
     params = cudf.Series([1, 2, 3], dtype=np.uint32)

From 2804015acdcded842dbd9035111f0922d37f7728 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Wed, 6 Nov 2024 15:26:16 -0500
Subject: [PATCH 23/25] add deprecation warnings

---
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 13 +++++++++++++
 1 file changed, 13 insertions(+)

diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index ef5b53ddea0..0751db5e423 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -18,6 +18,7 @@ from pylibcudf.libcudf.types cimport size_type
 from pylibcudf.scalar cimport Scalar
 
 from cython.operator import dereference
+import warnings
 
 
 cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
@@ -42,6 +43,12 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash_permuted.",
+        DeprecationWarning
+    )
+
     cdef unique_ptr[column] c_result
 
     if not isinstance(seeds, (Column, Scalar)):
@@ -123,6 +130,12 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     Column
         List column of minhash values for each string per seed
     """
+    warnings.warn(
+        "Starting in version 25.02, the signature of this function will "
+        "be changed to match pylibcudf.nvtext.minhash64_permuted.",
+        DeprecationWarning
+    )
+
     cdef unique_ptr[column] c_result
 
     if not isinstance(seeds, (Column, Scalar)):

From 1ed78a4f1f62fafa22dd5a3e806913bde8b7cc95 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Thu, 7 Nov 2024 09:16:07 -0500
Subject: [PATCH 24/25] change DeprecationWarning to FutureWarning

---
 python/cudf/cudf/core/column/string.py        | 2 ++
 python/pylibcudf/pylibcudf/nvtext/minhash.pyx | 4 ++--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/core/column/string.py b/python/cudf/cudf/core/column/string.py
index f8d20f9d21c..3d70b01b7e4 100644
--- a/python/cudf/cudf/core/column/string.py
+++ b/python/cudf/cudf/core/column/string.py
@@ -5355,6 +5355,7 @@ def minhash_permuted(
     ) -> SeriesOrIndex:
         """
         Compute the minhash of a strings column.
+
         This uses the MurmurHash3_x86_32 algorithm for the hash function.
 
         Calculation uses the formula (hv * a + b) % mersenne_prime
@@ -5407,6 +5408,7 @@ def minhash64(
     ) -> SeriesOrIndex:
         """
         Compute the minhash of a strings column.
+
         This uses the MurmurHash3_x64_128 algorithm for the hash function.
         This function generates 2 uint64 values but only the first
         uint64 value is used.
diff --git a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
index 0751db5e423..5a51e32b287 100644
--- a/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
+++ b/python/pylibcudf/pylibcudf/nvtext/minhash.pyx
@@ -46,7 +46,7 @@ cpdef Column minhash(Column input, ColumnOrScalar seeds, size_type width=4):
     warnings.warn(
         "Starting in version 25.02, the signature of this function will "
         "be changed to match pylibcudf.nvtext.minhash_permuted.",
-        DeprecationWarning
+        FutureWarning
     )
 
     cdef unique_ptr[column] c_result
@@ -133,7 +133,7 @@ cpdef Column minhash64(Column input, ColumnOrScalar seeds, size_type width=4):
     warnings.warn(
         "Starting in version 25.02, the signature of this function will "
         "be changed to match pylibcudf.nvtext.minhash64_permuted.",
-        DeprecationWarning
+        FutureWarning
     )
 
     cdef unique_ptr[column] c_result

From 056eb79bb7d5dc893394d6730c2e2e2eb974e728 Mon Sep 17 00:00:00 2001
From: David Wendt <dwendt@nvidia.com>
Date: Fri, 8 Nov 2024 08:43:03 -0500
Subject: [PATCH 25/25] fix shared-memory variable type

---
 cpp/include/nvtext/minhash.hpp | 4 ++--
 cpp/src/text/minhash.cu        | 3 +--
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/cpp/include/nvtext/minhash.hpp b/cpp/include/nvtext/minhash.hpp
index 4ff863842a2..b2c1a23f57e 100644
--- a/cpp/include/nvtext/minhash.hpp
+++ b/cpp/include/nvtext/minhash.hpp
@@ -124,7 +124,7 @@ namespace CUDF_EXPORT nvtext {
  * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
- * @param seed  Seed value used for the hash algorithm
+ * @param seed Seed value used for the hash algorithm
  * @param parameter_a Values used for the permuted calculation
  * @param parameter_b Values used for the permuted calculation
  * @param width The character width of substrings to hash for each row
@@ -236,7 +236,7 @@ std::unique_ptr<cudf::column> minhash_permuted(
  * @throw std::overflow_error if `parameter_a.size() * input.size()` exceeds the column size limit
  *
  * @param input Strings column to compute minhash
- * @param seed  Seed value used for the hash algorithm
+ * @param seed Seed value used for the hash algorithm
  * @param parameter_a Values used for the permuted calculation
  * @param parameter_b Values used for the permuted calculation
  * @param width The character width of substrings to hash for each row
diff --git a/cpp/src/text/minhash.cu b/cpp/src/text/minhash.cu
index 362672e74bb..aee83ab35ed 100644
--- a/cpp/src/text/minhash.cu
+++ b/cpp/src/text/minhash.cu
@@ -353,8 +353,7 @@ CUDF_KERNEL void minhash_permuted_kernel(cudf::column_device_view const d_string
   constexpr hash_value_type hash_max = std::numeric_limits<hash_value_type>::max();
 
   // found to be an efficient shared memory size for both hash types
-  __shared__ char shmem[block_size * params_per_thread * sizeof(hash_value_type)];
-  auto const block_values = reinterpret_cast<hash_value_type*>(shmem);
+  __shared__ hash_value_type block_values[block_size * params_per_thread];
 
   for (std::size_t i = 0; i < parameter_a.size(); i += params_per_thread) {
     // initialize this block's chunk of shared memory