From dd58dc4e9dae387c878afbe6cb32a311ce76fe68 Mon Sep 17 00:00:00 2001
From: Ben Jarmak <104460670+jarmak-nv@users.noreply.github.com>
Date: Fri, 22 Sep 2023 07:58:56 -0500
Subject: [PATCH 1/7] Remove outdated GitHub project actions (#14161)

This PR removes two GitHub Actions that are no-longer needed:
- `.github/workflows/add_to_project.yml`
   - This automatically adds issues and PRs to the cuDF/Dask/Numba/UCX project, but this is now a built-in functionality to projects
- `.github/workflows/new-issues-to-triage-projects.yml`
   - This tries to add issues to a now closed project

Authors:
   - Ben Jarmak (https://github.com/jarmak-nv)

Approvers:
   - AJ Schmidt (https://github.com/ajschmidt8)
---
 .github/workflows/add_to_project.yml          | 20 -----------
 .../new-issues-to-triage-projects.yml         | 35 -------------------
 2 files changed, 55 deletions(-)
 delete mode 100644 .github/workflows/add_to_project.yml
 delete mode 100644 .github/workflows/new-issues-to-triage-projects.yml

diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml
deleted file mode 100644
index b301c56a999..00000000000
--- a/.github/workflows/add_to_project.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: Add new issue/PR to project
-
-on:
-  issues:
-    types:
-      - opened
-
-  pull_request_target:
-    types:
-      - opened
-
-jobs:
-  add-to-project:
-    name: Add issue or PR to project
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/add-to-project@v0.3.0
-        with:
-          project-url: https://github.com/orgs/rapidsai/projects/51
-          github-token: ${{ secrets.ADD_TO_PROJECT_GITHUB_TOKEN }}
diff --git a/.github/workflows/new-issues-to-triage-projects.yml b/.github/workflows/new-issues-to-triage-projects.yml
deleted file mode 100644
index cf9b0c379f1..00000000000
--- a/.github/workflows/new-issues-to-triage-projects.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-name: Auto Assign New Issues to Triage Project
-
-on:
-  issues:
-    types: [opened]
-
-env:
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-jobs:
-  assign_one_project:
-    runs-on: ubuntu-latest
-    name: Assign to New Issues to Triage Project
-    steps:
-    - name: Process bug issues
-      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
-      if: contains(github.event.issue.labels.*.name, 'bug') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/1
-        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
-    - name: Process feature issues
-      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
-      if: contains(github.event.issue.labels.*.name, 'feature request') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/9
-        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
-    - name: Process other issues
-      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
-      if: contains(github.event.issue.labels.*.name, '? - Needs Triage') && (!contains(github.event.issue.labels.*.name, 'bug') && !contains(github.event.issue.labels.*.name, 'feature request'))
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/10
-        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'

From 98b1bc6c1ef1233a6c71c3b24fc8f88d591a4639 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 22 Sep 2023 11:07:37 -0400
Subject: [PATCH 2/7] Fix calls to copy_bitmask to pass stream parameter
 (#14158)

Fixes a couple places where `cudf::copy_bitmask` was called instead of `cudf::detail::copy_bitmask` to pass the available stream (and mr) parameters.

Found while reviewing #14121
Reference: https://github.com/rapidsai/cudf/pull/14121#discussion_r1332332391

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/14158
---
 cpp/src/lists/count_elements.cu | 12 ++++++------
 cpp/src/replace/clamp.cu        |  4 +++-
 2 files changed, 9 insertions(+), 7 deletions(-)

diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu
index f8e7b4c6126..40a14d805e1 100644
--- a/cpp/src/lists/count_elements.cu
+++ b/cpp/src/lists/count_elements.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,12 +36,12 @@ namespace cudf {
 namespace lists {
 namespace detail {
 /**
- * @brief Returns a numeric column containing lengths of each element.
+ * @brief Returns a numeric column containing lengths of each element
  *
- * @param input Input lists column.
- * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param input Input lists column
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New INT32 column with lengths.
+ * @return New size_type column with lengths
  */
 std::unique_ptr<column> count_elements(lists_column_view const& input,
                                        rmm::cuda_stream_view stream,
@@ -52,7 +52,7 @@ std::unique_ptr<column> count_elements(lists_column_view const& input,
   // create output column
   auto output = make_fixed_width_column(data_type{type_to_id<size_type>()},
                                         input.size(),
-                                        copy_bitmask(input.parent()),
+                                        cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                         input.null_count(),
                                         stream,
                                         mr);
diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu
index 2b48aed2d29..950cb484ddf 100644
--- a/cpp/src/replace/clamp.cu
+++ b/cpp/src/replace/clamp.cu
@@ -163,7 +163,9 @@ std::enable_if_t<cudf::is_fixed_width<T>(), std::unique_ptr<cudf::column>> clamp
   auto output =
     detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr);
   // mask will not change
-  if (input.nullable()) { output->set_null_mask(copy_bitmask(input), input.null_count()); }
+  if (input.nullable()) {
+    output->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count());
+  }
 
   auto output_device_view =
     cudf::mutable_column_device_view::create(output->mutable_view(), stream);

From f865c871cd0f9b9c596476d9d98aafaf9cc46bb1 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 22 Sep 2023 11:08:11 -0400
Subject: [PATCH 3/7] Expose stream parameter in public nvtext ngram APIs
 (#14061)

Add stream parameter to public APIs:

- `nvtext::generate_ngrams()`
- `nvtext::generate_character_ngrams()`
- `nvtext::hash_character_ngrams()`
- `nvtext::ngrams_tokenize()`

Also cleaned up some of the doxygen comments.
And also fixed a spelling mistake in the jaccard.cu source that was bothering me.

Reference #13744

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Yunsong Wang (https://github.com/PointKernel)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14061
---
 cpp/benchmarks/text/ngrams.cpp           |  3 +-
 cpp/benchmarks/text/tokenize.cpp         |  7 ++-
 cpp/include/nvtext/generate_ngrams.hpp   | 38 ++++++++-------
 cpp/include/nvtext/ngrams_tokenize.hpp   | 28 +++++------
 cpp/src/text/generate_ngrams.cu          |  9 ++--
 cpp/src/text/jaccard.cu                  |  4 +-
 cpp/src/text/ngrams_tokenize.cu          |  4 +-
 cpp/tests/CMakeLists.txt                 |  1 +
 cpp/tests/streams/text/ngrams_test.cpp   | 59 ++++++++++++++++++++++++
 cpp/tests/text/ngrams_tests.cpp          | 28 ++++++-----
 cpp/tests/text/ngrams_tokenize_tests.cpp | 11 +++--
 11 files changed, 135 insertions(+), 57 deletions(-)
 create mode 100644 cpp/tests/streams/text/ngrams_test.cpp

diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp
index 0319577f6b9..f3fd5cc5729 100644
--- a/cpp/benchmarks/text/ngrams.cpp
+++ b/cpp/benchmarks/text/ngrams.cpp
@@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt)
     cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
   auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
   cudf::strings_column_view input(column->view());
+  auto const separator = cudf::string_scalar("_");
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true);
     switch (nt) {
-      case ngrams_type::tokens: nvtext::generate_ngrams(input); break;
+      case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break;
       case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
     }
   }
diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp
index 423fe667b05..b556a84c541 100644
--- a/cpp/benchmarks/text/tokenize.cpp
+++ b/cpp/benchmarks/text/tokenize.cpp
@@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state)
       auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
     });
   } else if (tokenize_type == "ngrams") {
-    state.exec(nvbench::exec_tag::sync,
-               [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); });
+    auto const delimiter = cudf::string_scalar("");
+    auto const separator = cudf::string_scalar("_");
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator);
+    });
   } else if (tokenize_type == "characters") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); });
diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
index 5d66401df9d..46f2c0e7bc9 100644
--- a/cpp/include/nvtext/generate_ngrams.hpp
+++ b/cpp/include/nvtext/generate_ngrams.hpp
@@ -47,19 +47,19 @@ namespace nvtext {
  * @throw cudf::logic_error if `separator` is invalid
  * @throw cudf::logic_error if there are not enough strings to generate any ngrams
  *
- * @param strings Strings column to tokenize and produce ngrams from.
- * @param ngrams The ngram number to generate.
- *               Default is 2 = bigram.
- * @param separator The string to use for separating ngram tokens.
- *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param input Strings column to tokenize and produce ngrams from
+ * @param ngrams The ngram number to generate
+ * @param separator The string to use for separating ngram tokens
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of tokens
  */
 std::unique_ptr<cudf::column> generate_ngrams(
-  cudf::strings_column_view const& strings,
-  cudf::size_type ngrams               = 2,
-  cudf::string_scalar const& separator = cudf::string_scalar{"_"},
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  cudf::strings_column_view const& input,
+  cudf::size_type ngrams,
+  cudf::string_scalar const& separator,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Generates ngrams of characters within each string.
@@ -79,15 +79,17 @@ std::unique_ptr<cudf::column> generate_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate.
  *               Default is 2 = bigram.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of tokens
  */
 std::unique_ptr<cudf::column> generate_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 2,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -113,14 +115,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate. Default is 5.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return A lists column of hash values
  */
 std::unique_ptr<cudf::column> hash_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 5,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp
index 17f20f7ea4c..9d76ef8689f 100644
--- a/cpp/include/nvtext/ngrams_tokenize.hpp
+++ b/cpp/include/nvtext/ngrams_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,22 +66,22 @@ namespace nvtext {
  *
  * All null row entries are ignored and the output contains all valid rows.
  *
- * @param strings Strings column to tokenize and produce ngrams from.
- * @param ngrams The ngram number to generate.
- *               Default is 2 = bigram.
+ * @param input Strings column to tokenize and produce ngrams from
+ * @param ngrams The ngram number to generate
  * @param delimiter UTF-8 characters used to separate each string into tokens.
- *                  The default of empty string will separate tokens using whitespace.
- * @param separator The string to use for separating ngram tokens.
- *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ *                  An empty string will separate tokens using whitespace.
+ * @param separator The string to use for separating ngram tokens
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings columns of tokens
  */
 std::unique_ptr<cudf::column> ngrams_tokenize(
-  cudf::strings_column_view const& strings,
-  cudf::size_type ngrams               = 2,
-  cudf::string_scalar const& delimiter = cudf::string_scalar{""},
-  cudf::string_scalar const& separator = cudf::string_scalar{"_"},
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  cudf::strings_column_view const& input,
+  cudf::size_type ngrams,
+  cudf::string_scalar const& delimiter,
+  cudf::string_scalar const& separator,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 938fd45246d..5f2f4d021a4 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -150,10 +150,11 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
 std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& strings,
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr);
+  return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
 }
 
 namespace detail {
@@ -317,18 +318,20 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
 
 std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
                                                         cudf::size_type ngrams,
+                                                        rmm::cuda_stream_view stream,
                                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::generate_character_ngrams(strings, ngrams, stream, mr);
 }
 
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                     cudf::size_type ngrams,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::hash_character_ngrams(strings, ngrams, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index 5b55745c2c7..95324847ea0 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -107,7 +107,7 @@ rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view con
  *
  * This is called with a warp per row
  */
-struct sorted_interset_fn {
+struct sorted_intersect_fn {
   cudf::column_device_view const d_input1;
   cudf::column_device_view const d_input2;
   cudf::size_type* d_results;
@@ -151,7 +151,7 @@ rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view
   auto const d_input1 = cudf::column_device_view::create(input1, stream);
   auto const d_input2 = cudf::column_device_view::create(input2, stream);
   auto d_results      = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
-  sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()};
+  sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::counting_iterator<cudf::size_type>(0),
                      input1.size() * cudf::detail::warp_size,
diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index fd1cbf99221..73d85513e95 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -265,11 +265,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& delimiter,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::ngrams_tokenize(
-    strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr);
+  return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index d1e50442058..ba4921848d7 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -632,6 +632,7 @@ ConfigureTest(
   STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE
   testing
 )
+ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
 
 # ##################################################################################################
 # Install tests ####################################################################################
diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp
new file mode 100644
index 00000000000..bce0d2b680b
--- /dev/null
+++ b/cpp/tests/streams/text/ngrams_test.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <nvtext/generate_ngrams.hpp>
+#include <nvtext/ngrams_tokenize.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+class TextNGramsTest : public cudf::test::BaseFixture {};
+
+TEST_F(TextNGramsTest, GenerateNgrams)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
+  auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
+  nvtext::generate_ngrams(
+    cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, GenerateCharacterNgrams)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
+  nvtext::generate_character_ngrams(
+    cudf::strings_column_view(input), 3, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, HashCharacterNgrams)
+{
+  auto input =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+  nvtext::hash_character_ngrams(
+    cudf::strings_column_view(input), 5, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, NgramsTokenize)
+{
+  auto input =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+  auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
+  auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
+  nvtext::ngrams_tokenize(
+    cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp
index 323b3eed3e2..7b179588385 100644
--- a/cpp/tests/text/ngrams_tests.cpp
+++ b/cpp/tests/text/ngrams_tests.cpp
@@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams)
 {
   cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"};
   cudf::strings_column_view strings_view(strings);
+  auto const separator = cudf::string_scalar("_");
 
   {
     cudf::test::strings_column_wrapper expected{
       "the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"};
-    auto const results = nvtext::generate_ngrams(strings_view);
+    auto const results = nvtext::generate_ngrams(strings_view, 2, separator);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
 
   {
     cudf::test::strings_column_wrapper expected{
       "the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"};
-    auto const results = nvtext::generate_ngrams(strings_view, 3);
+    auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
@@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls)
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  auto const separator = cudf::string_scalar("_");
 
   cudf::strings_column_view strings_view(strings);
   {
-    auto const results = nvtext::generate_ngrams(strings_view, 3);
+    auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
     cudf::test::strings_column_wrapper expected{
       "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
@@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty)
 {
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();
 
-  auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column));
+  auto const separator = cudf::string_scalar("_");
+
+  auto results =
+    nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 2, separator);
   cudf::test::expect_column_empty(results->view());
   results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column));
   cudf::test::expect_column_empty(results->view());
@@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty)
 TEST_F(TextGenerateNgramsTest, Errors)
 {
   cudf::test::strings_column_wrapper strings{""};
+  auto const separator = cudf::string_scalar("_");
   // invalid parameter value
-  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error);
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator),
+               cudf::logic_error);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1),
                cudf::logic_error);
   // not enough strings to generate ngrams
-  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error);
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator),
+               cudf::logic_error);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3),
                cudf::logic_error);
 
-  std::vector<char const*> h_strings{"", nullptr, "", nullptr};
-  cudf::test::strings_column_wrapper strings_no_tokens(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)),
+  cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0});
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator),
                cudf::logic_error);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)),
                cudf::logic_error);
diff --git a/cpp/tests/text/ngrams_tokenize_tests.cpp b/cpp/tests/text/ngrams_tokenize_tests.cpp
index 5879bec3e64..c6fb886f7e5 100644
--- a/cpp/tests/text/ngrams_tokenize_tests.cpp
+++ b/cpp/tests/text/ngrams_tokenize_tests.cpp
@@ -62,7 +62,7 @@ TEST_F(TextNgramsTokenizeTest, Tokenize)
                                                 "mousé_ate",
                                                 "ate_the",
                                                 "the_cheese"};
-    auto results = nvtext::ngrams_tokenize(strings_view);
+    auto results = nvtext::ngrams_tokenize(strings_view, 2, std::string(), std::string("_"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
@@ -101,9 +101,10 @@ TEST_F(TextNgramsTokenizeTest, TokenizeOneGram)
 {
   cudf::test::strings_column_wrapper strings{"aaa bbb", "  ccc  ddd  ", "eee"};
   cudf::strings_column_view strings_view(strings);
+  auto const empty = cudf::string_scalar("");
 
   cudf::test::strings_column_wrapper expected{"aaa", "bbb", "ccc", "ddd", "eee"};
-  auto results = nvtext::ngrams_tokenize(strings_view, 1);
+  auto results = nvtext::ngrams_tokenize(strings_view, 1, empty, empty);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
@@ -111,7 +112,8 @@ TEST_F(TextNgramsTokenizeTest, TokenizeEmptyTest)
 {
   auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
   cudf::strings_column_view strings_view(strings->view());
-  auto results = nvtext::ngrams_tokenize(strings_view);
+  auto const empty = cudf::string_scalar("");
+  auto results     = nvtext::ngrams_tokenize(strings_view, 2, empty, empty);
   EXPECT_EQ(results->size(), 0);
   EXPECT_EQ(results->has_nulls(), false);
 }
@@ -120,5 +122,6 @@ TEST_F(TextNgramsTokenizeTest, TokenizeErrorTest)
 {
   cudf::test::strings_column_wrapper strings{"this column intentionally left blank"};
   cudf::strings_column_view strings_view(strings);
-  EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0), cudf::logic_error);
+  auto const empty = cudf::string_scalar("");
+  EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0, empty, empty), cudf::logic_error);
 }

From a6d014e632ecad86cef486402dbe53acee191a1d Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lmitchell@nvidia.com>
Date: Fri, 22 Sep 2023 16:24:33 +0100
Subject: [PATCH 4/7] Support callables in DataFrame.assign (#14142)

While here, change the way the initial copied frame is constructed:
callables are allowed to refer to columns already in the dataframe,
even if they overwrite them.

- Closes #12936

Authors:
  - Lawrence Mitchell (https://github.com/wence-)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/14142
---
 python/cudf/cudf/core/dataframe.py       | 23 ++++++++++++++---------
 python/cudf/cudf/tests/test_dataframe.py | 19 +++++++++++++++++++
 2 files changed, 33 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1a780cc9e9f..8a3dbe77787 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1390,10 +1390,21 @@ def _get_numeric_data(self):
         return self[columns]
 
     @_cudf_nvtx_annotate
-    def assign(self, **kwargs):
+    def assign(self, **kwargs: Union[Callable[[Self], Any], Any]):
         """
         Assign columns to DataFrame from keyword arguments.
 
+        Parameters
+        ----------
+        **kwargs: dict mapping string column names to values
+            The value for each key can either be a literal column (or
+            something that can be converted to a column), or
+            a callable of one argument that will be given the
+            dataframe as an argument and should return the new column
+            (without modifying the input argument).
+            Columns are added in-order, so callables can refer to
+            column names constructed in the assignment.
+
         Examples
         --------
         >>> import cudf
@@ -1405,15 +1416,9 @@ def assign(self, **kwargs):
         1  1  4
         2  2  5
         """
-        new_df = cudf.DataFrame(index=self.index.copy())
-        for name, col in self._data.items():
-            if name in kwargs:
-                new_df[name] = kwargs.pop(name)
-            else:
-                new_df._data[name] = col.copy()
-
+        new_df = self.copy(deep=False)
         for k, v in kwargs.items():
-            new_df[k] = v
+            new_df[k] = v(new_df) if callable(v) else v
         return new_df
 
     @classmethod
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 6180162ecdd..2f531afdeb7 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1327,6 +1327,25 @@ def test_assign():
     np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4])
 
 
+@pytest.mark.parametrize(
+    "mapping",
+    [
+        {"y": 1, "z": lambda df: df["x"] + df["y"]},
+        {
+            "x": lambda df: df["x"] * 2,
+            "y": lambda df: 2,
+            "z": lambda df: df["x"] / df["y"],
+        },
+    ],
+)
+def test_assign_callable(mapping):
+    df = pd.DataFrame({"x": [1, 2, 3]})
+    cdf = cudf.from_pandas(df)
+    expect = df.assign(**mapping)
+    actual = cdf.assign(**mapping)
+    assert_eq(expect, actual)
+
+
 @pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
 @pytest.mark.parametrize("method", ["murmur3", "md5"])
 @pytest.mark.parametrize("seed", [None, 42])

From 40bdd8ae4d89d2ea1f466c579d56f2c9ca1b014d Mon Sep 17 00:00:00 2001
From: Peter Andreas Entschev <peter@entschev.com>
Date: Fri, 22 Sep 2023 19:20:18 +0200
Subject: [PATCH 5/7] Pin to `aws-sdk-cpp<1.11` (#14173)

Pin conda packages to `aws-sdk-cpp<1.11`. The recent upgrade in version `1.11.*` has caused several issues with cleaning up (more details on changes can be read in [this link](https://github.com/aws/aws-sdk-cpp#version-111-is-now-available)), leading to Distributed and Dask-CUDA processes to segfault. The stack for one of those crashes looks like the following:

```
(gdb) bt
#0  0x00007f5125359a0c in Aws::Utils::Logging::s_aws_logger_redirect_get_log_level(aws_logger*, unsigned int) () from /opt/conda/envs/dask/lib/python3.9/site-packages/pyarrow/../../.././libaws-cpp-sdk-core.so
#1  0x00007f5124968f83 in aws_event_loop_thread () from /opt/conda/envs/dask/lib/python3.9/site-packages/pyarrow/../../../././libaws-c-io.so.1.0.0
#2  0x00007f5124ad9359 in thread_fn () from /opt/conda/envs/dask/lib/python3.9/site-packages/pyarrow/../../../././libaws-c-common.so.1
#3  0x00007f519958f6db in start_thread () from /lib/x86_64-linux-gnu/libpthread.so.0
#4  0x00007f5198b1361f in clone () from /lib/x86_64-linux-gnu/libc.so.6
```

Such segfaults now manifest frequently in CI, and in some cases are reproducible with a hit rate of ~30%. Given the approaching release time, it's probably the safest option to just pin to an older version of the package while we don't pinpoint the exact cause for the issue and a patched build is released upstream.

The `aws-sdk-cpp` is statically-linked in the `pyarrow` pip package, which prevents us from using the same pinning technique. cuDF is currently pinned to `pyarrow=12.0.1` which seems to be built against `aws-sdk-cpp=1.10.*`, as per [recent build logs](https://github.com/apache/arrow/actions/runs/6276453828/job/17046177335?pr=37792#step:6:1372).

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/14173
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 1 +
 conda/environments/all_cuda-120_arch-x86_64.yaml | 1 +
 conda/recipes/libcudf/conda_build_config.yaml    | 3 +++
 conda/recipes/libcudf/meta.yaml                  | 2 ++
 dependencies.yaml                                | 1 +
 5 files changed, 8 insertions(+)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index d4abc28cf13..9fb991f9075 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,6 +9,7 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
+- aws-sdk-cpp<1.11
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 9a98e400e6d..9ba0dd8dc38 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -9,6 +9,7 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
+- aws-sdk-cpp<1.11
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 25b3f19de77..b1f5b083e06 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -22,6 +22,9 @@ gbench_version:
 gtest_version:
   - ">=1.13.0"
 
+aws_sdk_cpp_version:
+  - "<1.11"
+
 libarrow_version:
   - "=12"
 
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 627065817ba..28357f0d96d 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -74,6 +74,7 @@ requirements:
     - gtest {{ gtest_version }}
     - gmock {{ gtest_version }}
     - zlib {{ zlib_version }}
+    - aws-sdk-cpp {{ aws_sdk_cpp_version }}
 
 outputs:
   - name: libcudf
@@ -107,6 +108,7 @@ outputs:
         - dlpack {{ dlpack_version }}
         - gtest {{ gtest_version }}
         - gmock {{ gtest_version }}
+        - aws-sdk-cpp {{ aws_sdk_cpp_version }}
     test:
       commands:
         - test -f $PREFIX/lib/libcudf.so
diff --git a/dependencies.yaml b/dependencies.yaml
index 376e43094a7..5586f54348c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -218,6 +218,7 @@ dependencies:
           - libkvikio==23.10.*
       - output_types: conda
         packages:
+          - aws-sdk-cpp<1.11
           - fmt>=9.1.0,<10
           - &gbench benchmark==1.8.0
           - &gtest gtest>=1.13.0

From c7dd6b48684028a65b1d19d5d5b04060f6a4fe19 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Fri, 22 Sep 2023 14:15:31 -0400
Subject: [PATCH 6/7] Refactor libcudf indexalator to typed normalator (#14043)

Creates generic normalizing-iterator for integer types for use by the `indexalator` and the future offsets normalizing iterator.
Mostly code has been moved around or renamed so the normalizing-iterator part can take type template parameter to identify which integer type to normalize to. For the `indexalator`, this type is `cudf::size_type` and for the offsets iterator this type would be `int64`.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - MithunR (https://github.com/mythrocks)

URL: https://github.com/rapidsai/cudf/pull/14043
---
 cpp/include/cudf/detail/indexalator.cuh       | 332 +---------------
 .../cudf/detail/normalizing_iterator.cuh      | 367 ++++++++++++++++++
 2 files changed, 374 insertions(+), 325 deletions(-)
 create mode 100644 cpp/include/cudf/detail/normalizing_iterator.cuh

diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index 4731c4919e3..6532dae3695 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -16,14 +16,13 @@
 
 #pragma once
 
+#include <cudf/detail/normalizing_iterator.cuh>
+
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/optional.h>
@@ -32,193 +31,6 @@
 namespace cudf {
 namespace detail {
 
-/**
- * @brief The base class for the input or output index normalizing iterator.
- *
- * This implementation uses CRTP to define the `input_indexalator` and the
- * `output_indexalator` classes. This is so this class can manipulate the
- * uniquely typed subclass member variable `p_` directly without requiring
- * virtual functions since iterator instances will be copied to device memory.
- *
- * The base class mainly manages updating the `p_` member variable while the
- * subclasses handle accessing individual elements in device memory.
- *
- * @tparam T The derived class type for the iterator.
- */
-template <class T>
-struct base_indexalator {
-  using difference_type   = ptrdiff_t;
-  using value_type        = size_type;
-  using pointer           = size_type*;
-  using iterator_category = std::random_access_iterator_tag;
-
-  base_indexalator()                                   = default;
-  base_indexalator(base_indexalator const&)            = default;
-  base_indexalator(base_indexalator&&)                 = default;
-  base_indexalator& operator=(base_indexalator const&) = default;
-  base_indexalator& operator=(base_indexalator&&)      = default;
-
-  /**
-   * @brief Prefix increment operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator++()
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ += width_;
-    return derived;
-  }
-
-  /**
-   * @brief Postfix increment operator.
-   */
-  CUDF_HOST_DEVICE inline T operator++(int)
-  {
-    T tmp{static_cast<T&>(*this)};
-    operator++();
-    return tmp;
-  }
-
-  /**
-   * @brief Prefix decrement operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator--()
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ -= width_;
-    return derived;
-  }
-
-  /**
-   * @brief Postfix decrement operator.
-   */
-  CUDF_HOST_DEVICE inline T operator--(int)
-  {
-    T tmp{static_cast<T&>(*this)};
-    operator--();
-    return tmp;
-  }
-
-  /**
-   * @brief Compound assignment by sum operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator+=(difference_type offset)
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ += offset * width_;
-    return derived;
-  }
-
-  /**
-   * @brief Increment by offset operator.
-   */
-  CUDF_HOST_DEVICE inline T operator+(difference_type offset) const
-  {
-    auto tmp = T{static_cast<T const&>(*this)};
-    tmp.p_ += (offset * width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Addition assignment operator.
-   */
-  CUDF_HOST_DEVICE inline friend T operator+(difference_type offset, T const& rhs)
-  {
-    T tmp{rhs};
-    tmp.p_ += (offset * rhs.width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Compound assignment by difference operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator-=(difference_type offset)
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ -= offset * width_;
-    return derived;
-  }
-
-  /**
-   * @brief Decrement by offset operator.
-   */
-  CUDF_HOST_DEVICE inline T operator-(difference_type offset) const
-  {
-    auto tmp = T{static_cast<T const&>(*this)};
-    tmp.p_ -= (offset * width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Subtraction assignment operator.
-   */
-  CUDF_HOST_DEVICE inline friend T operator-(difference_type offset, T const& rhs)
-  {
-    T tmp{rhs};
-    tmp.p_ -= (offset * rhs.width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Compute offset from iterator difference operator.
-   */
-  CUDF_HOST_DEVICE inline difference_type operator-(T const& rhs) const
-  {
-    return (static_cast<T const&>(*this).p_ - rhs.p_) / width_;
-  }
-
-  /**
-   * @brief Equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator==(T const& rhs) const
-  {
-    return rhs.p_ == static_cast<T const&>(*this).p_;
-  }
-  /**
-   * @brief Not equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator!=(T const& rhs) const
-  {
-    return rhs.p_ != static_cast<T const&>(*this).p_;
-  }
-  /**
-   * @brief Less than operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator<(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ < rhs.p_;
-  }
-  /**
-   * @brief Greater than operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator>(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ > rhs.p_;
-  }
-  /**
-   * @brief Less than or equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator<=(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ <= rhs.p_;
-  }
-  /**
-   * @brief Greater than or equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator>=(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ >= rhs.p_;
-  }
-
- protected:
-  /**
-   * @brief Constructor assigns width and type member variables for base class.
-   */
-  base_indexalator(int32_t width, data_type dtype) : width_(width), dtype_(dtype) {}
-
-  int width_;        /// integer type width = 1,2,4, or 8
-  data_type dtype_;  /// for type-dispatcher calls
-};
-
 /**
  * @brief The index normalizing input iterator.
  *
@@ -244,65 +56,7 @@ struct base_indexalator {
  *  auto result = thrust::find(thrust::device, begin, end, size_type{12} );
  * @endcode
  */
-struct input_indexalator : base_indexalator<input_indexalator> {
-  friend struct indexalator_factory;
-  friend struct base_indexalator<input_indexalator>;  // for CRTP
-
-  using reference = size_type const;  // this keeps STL and thrust happy
-
-  input_indexalator()                                    = default;
-  input_indexalator(input_indexalator const&)            = default;
-  input_indexalator(input_indexalator&&)                 = default;
-  input_indexalator& operator=(input_indexalator const&) = default;
-  input_indexalator& operator=(input_indexalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns the value at the current iterator position.
-   */
-  __device__ inline size_type operator*() const { return operator[](0); }
-
-  /**
-   * @brief Dispatch functor for resolving a size_type value from any index type.
-   */
-  struct index_as_size_type {
-    template <typename T, std::enable_if_t<is_index_type<T>()>* = nullptr>
-    __device__ size_type operator()(void const* tp)
-    {
-      return static_cast<size_type>(*static_cast<T const*>(tp));
-    }
-    template <typename T, std::enable_if_t<not is_index_type<T>()>* = nullptr>
-    __device__ size_type operator()(void const* tp)
-    {
-      CUDF_UNREACHABLE("only index types are supported");
-    }
-  };
-  /**
-   * @brief Array subscript operator returns a value at the input
-   * `idx` position as a `size_type` value.
-   */
-  __device__ inline size_type operator[](size_type idx) const
-  {
-    void const* tp = p_ + (idx * width_);
-    return type_dispatcher(dtype_, index_as_size_type{}, tp);
-  }
-
- protected:
-  /**
-   * @brief Create an input index normalizing iterator.
-   *
-   * Use the indexalator_factory to create an iterator instance.
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param width     The width of the integer type (1, 2, 4, or 8)
-   * @param data_type Index integer type of width `width`
-   */
-  input_indexalator(void const* data, int width, data_type dtype)
-    : base_indexalator<input_indexalator>(width, dtype), p_{static_cast<char const*>(data)}
-  {
-  }
-
-  char const* p_;  /// pointer to the integer data in device memory
-};
+using input_indexalator = input_normalator<cudf::size_type>;
 
 /**
  * @brief The index normalizing output iterator.
@@ -328,79 +82,7 @@ struct input_indexalator : base_indexalator<input_indexalator> {
  *                      thrust::less<Element>());
  * @endcode
  */
-struct output_indexalator : base_indexalator<output_indexalator> {
-  friend struct indexalator_factory;
-  friend struct base_indexalator<output_indexalator>;  // for CRTP
-
-  using reference = output_indexalator const&;  // required for output iterators
-
-  output_indexalator()                                     = default;
-  output_indexalator(output_indexalator const&)            = default;
-  output_indexalator(output_indexalator&&)                 = default;
-  output_indexalator& operator=(output_indexalator const&) = default;
-  output_indexalator& operator=(output_indexalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns this iterator instance in order
-   * to capture the `operator=(size_type)` calls.
-   */
-  __device__ inline output_indexalator const& operator*() const { return *this; }
-
-  /**
-   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
-   *
-   * This allows capturing the subsequent `operator=(size_type)` call in this class.
-   */
-  __device__ inline output_indexalator const operator[](size_type idx) const
-  {
-    output_indexalator tmp{*this};
-    tmp.p_ += (idx * width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Dispatch functor for setting the index value from a size_type value.
-   */
-  struct size_type_to_index {
-    template <typename T, std::enable_if_t<is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void* tp, size_type const value)
-    {
-      (*static_cast<T*>(tp)) = static_cast<T>(value);
-    }
-    template <typename T, std::enable_if_t<not is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void* tp, size_type const value)
-    {
-      CUDF_UNREACHABLE("only index types are supported");
-    }
-  };
-
-  /**
-   * @brief Assign a size_type value to the current iterator position.
-   */
-  __device__ inline output_indexalator const& operator=(size_type const value) const
-  {
-    void* tp = p_;
-    type_dispatcher(dtype_, size_type_to_index{}, tp, value);
-    return *this;
-  }
-
- protected:
-  /**
-   * @brief Create an output index normalizing iterator.
-   *
-   * Use the indexalator_factory to create an iterator instance.
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param width     The width of the integer type (1, 2, 4, or 8)
-   * @param data_type Index integer type of width `width`
-   */
-  output_indexalator(void* data, int width, data_type dtype)
-    : base_indexalator<output_indexalator>(width, dtype), p_{static_cast<char*>(data)}
-  {
-  }
-
-  char* p_;  /// pointer to the integer data in device memory
-};
+using output_indexalator = output_normalator<cudf::size_type>;
 
 /**
  * @brief Use this class to create an indexalator instance.
@@ -413,7 +95,7 @@ struct indexalator_factory {
     template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
     input_indexalator operator()(column_view const& indices)
     {
-      return input_indexalator(indices.data<IndexType>(), sizeof(IndexType), indices.type());
+      return input_indexalator(indices.data<IndexType>(), indices.type());
     }
     template <typename IndexType,
               typename... Args,
@@ -433,7 +115,7 @@ struct indexalator_factory {
     {
       // note: using static_cast<scalar_type_t<IndexType> const&>(index) creates a copy
       auto const scalar_impl = static_cast<scalar_type_t<IndexType> const*>(&index);
-      return input_indexalator(scalar_impl->data(), sizeof(IndexType), index.type());
+      return input_indexalator(scalar_impl->data(), index.type());
     }
     template <typename IndexType,
               typename... Args,
@@ -451,7 +133,7 @@ struct indexalator_factory {
     template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
     output_indexalator operator()(mutable_column_view const& indices)
     {
-      return output_indexalator(indices.data<IndexType>(), sizeof(IndexType), indices.type());
+      return output_indexalator(indices.data<IndexType>(), indices.type());
     }
     template <typename IndexType,
               typename... Args,
diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh
new file mode 100644
index 00000000000..51b3133f84f
--- /dev/null
+++ b/cpp/include/cudf/detail/normalizing_iterator.cuh
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <type_traits>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief The base class for the input or output normalizing iterator
+ *
+ * The base class mainly manages updating the `p_` member variable while the
+ * subclasses handle accessing individual elements in device memory.
+ *
+ * @tparam Derived The derived class type for the iterator
+ * @tparam Integer The type the iterator normalizes to
+ */
+template <class Derived, typename Integer>
+struct base_normalator {
+  static_assert(std::is_integral_v<Integer>);
+  using difference_type   = std::ptrdiff_t;
+  using value_type        = Integer;
+  using pointer           = Integer*;
+  using iterator_category = std::random_access_iterator_tag;
+
+  base_normalator()                                  = default;
+  base_normalator(base_normalator const&)            = default;
+  base_normalator(base_normalator&&)                 = default;
+  base_normalator& operator=(base_normalator const&) = default;
+  base_normalator& operator=(base_normalator&&)      = default;
+
+  /**
+   * @brief Prefix increment operator.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator++()
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ += width_;
+    return derived;
+  }
+
+  /**
+   * @brief Postfix increment operator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator++(int)
+  {
+    Derived tmp{static_cast<Derived&>(*this)};
+    operator++();
+    return tmp;
+  }
+
+  /**
+   * @brief Prefix decrement operator.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator--()
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ -= width_;
+    return derived;
+  }
+
+  /**
+   * @brief Postfix decrement operator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator--(int)
+  {
+    Derived tmp{static_cast<Derived&>(*this)};
+    operator--();
+    return tmp;
+  }
+
+  /**
+   * @brief Compound assignment by sum operator.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator+=(difference_type offset)
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ += offset * width_;
+    return derived;
+  }
+
+  /**
+   * @brief Increment by offset operator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator+(difference_type offset) const
+  {
+    auto tmp = Derived{static_cast<Derived const&>(*this)};
+    tmp.p_ += (offset * width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Addition assignment operator.
+   */
+  CUDF_HOST_DEVICE inline friend Derived operator+(difference_type offset, Derived const& rhs)
+  {
+    Derived tmp{rhs};
+    tmp.p_ += (offset * rhs.width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Compound assignment by difference operator.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator-=(difference_type offset)
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ -= offset * width_;
+    return derived;
+  }
+
+  /**
+   * @brief Decrement by offset operator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator-(difference_type offset) const
+  {
+    auto tmp = Derived{static_cast<Derived const&>(*this)};
+    tmp.p_ -= (offset * width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Subtraction assignment operator.
+   */
+  CUDF_HOST_DEVICE inline friend Derived operator-(difference_type offset, Derived const& rhs)
+  {
+    Derived tmp{rhs};
+    tmp.p_ -= (offset * rhs.width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Compute offset from iterator difference operator.
+   */
+  CUDF_HOST_DEVICE inline difference_type operator-(Derived const& rhs) const
+  {
+    return (static_cast<Derived const&>(*this).p_ - rhs.p_) / width_;
+  }
+
+  /**
+   * @brief Equals to operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator==(Derived const& rhs) const
+  {
+    return rhs.p_ == static_cast<Derived const&>(*this).p_;
+  }
+
+  /**
+   * @brief Not equals to operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator!=(Derived const& rhs) const
+  {
+    return rhs.p_ != static_cast<Derived const&>(*this).p_;
+  }
+
+  /**
+   * @brief Less than operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator<(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ < rhs.p_;
+  }
+
+  /**
+   * @brief Greater than operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator>(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ > rhs.p_;
+  }
+
+  /**
+   * @brief Less than or equals to operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator<=(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ <= rhs.p_;
+  }
+
+  /**
+   * @brief Greater than or equals to operator.
+   */
+  CUDF_HOST_DEVICE inline bool operator>=(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ >= rhs.p_;
+  }
+
+ protected:
+  /**
+   * @brief Constructor assigns width and type member variables for base class.
+   */
+  explicit base_normalator(data_type dtype) : width_(size_of(dtype)), dtype_(dtype) {}
+
+  int width_;        /// integer type width = 1,2,4, or 8
+  data_type dtype_;  /// for type-dispatcher calls
+};
+
+/**
+ * @brief The integer normalizing input iterator
+ *
+ * This is an iterator that can be used for index types (integers) without
+ * requiring a type-specific instance. It can be used for any iterator
+ * interface for reading an array of integer values of type
+ * int8, int16, int32, int64, uint8, uint16, uint32, or uint64.
+ * Reading specific elements always return a type of `Integer`
+ *
+ * @tparam Integer Type returned by all read functions
+ */
+template <typename Integer>
+struct input_normalator : base_normalator<input_normalator<Integer>, Integer> {
+  friend struct base_normalator<input_normalator<Integer>, Integer>;  // for CRTP
+
+  using reference = Integer const;  // this keeps STL and thrust happy
+
+  input_normalator()                                   = default;
+  input_normalator(input_normalator const&)            = default;
+  input_normalator(input_normalator&&)                 = default;
+  input_normalator& operator=(input_normalator const&) = default;
+  input_normalator& operator=(input_normalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns the value at the current iterator position
+   */
+  __device__ inline Integer operator*() const { return operator[](0); }
+
+  /**
+   * @brief Dispatch functor for resolving a Integer value from any integer type
+   */
+  struct normalize_type {
+    template <typename T, std::enable_if_t<cuda::std::is_integral_v<T>>* = nullptr>
+    __device__ Integer operator()(void const* tp)
+    {
+      return static_cast<Integer>(*static_cast<T const*>(tp));
+    }
+    template <typename T, std::enable_if_t<not cuda::std::is_integral_v<T>>* = nullptr>
+    __device__ Integer operator()(void const*)
+    {
+      CUDF_UNREACHABLE("only integral types are supported");
+    }
+  };
+
+  /**
+   * @brief Array subscript operator returns a value at the input
+   * `idx` position as a `Integer` value.
+   */
+  __device__ inline Integer operator[](size_type idx) const
+  {
+    void const* tp = p_ + (idx * this->width_);
+    return type_dispatcher(this->dtype_, normalize_type{}, tp);
+  }
+
+  /**
+   * @brief Create an input index normalizing iterator.
+   *
+   * Use the indexalator_factory to create an iterator instance.
+   *
+   * @param data      Pointer to an integer array in device memory.
+   * @param data_type Type of data in data
+   */
+  input_normalator(void const* data, data_type dtype)
+    : base_normalator<input_normalator<Integer>, Integer>(dtype), p_{static_cast<char const*>(data)}
+  {
+  }
+
+  char const* p_;  /// pointer to the integer data in device memory
+};
+
+/**
+ * @brief The integer normalizing output iterator
+ *
+ * This is an iterator that can be used for index types (integers) without
+ * requiring a type-specific instance. It can be used for any iterator
+ * interface for writing an array of integer values of type
+ * int8, int16, int32, int64, uint8, uint16, uint32, or uint64.
+ * Setting specific elements always accept the `Integer` type values.
+ *
+ * @tparam Integer The type used for all write functions
+ */
+template <typename Integer>
+struct output_normalator : base_normalator<output_normalator<Integer>, Integer> {
+  friend struct base_normalator<output_normalator<Integer>, Integer>;  // for CRTP
+
+  using reference = output_normalator const&;  // required for output iterators
+
+  output_normalator()                                    = default;
+  output_normalator(output_normalator const&)            = default;
+  output_normalator(output_normalator&&)                 = default;
+  output_normalator& operator=(output_normalator const&) = default;
+  output_normalator& operator=(output_normalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns this iterator instance in order
+   * to capture the `operator=(Integer)` calls.
+   */
+  __device__ inline output_normalator const& operator*() const { return *this; }
+
+  /**
+   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
+   *
+   * This allows capturing the subsequent `operator=(Integer)` call in this class.
+   */
+  __device__ inline output_normalator const operator[](size_type idx) const
+  {
+    output_normalator tmp{*this};
+    tmp.p_ += (idx * this->width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Dispatch functor for setting the index value from a size_type value.
+   */
+  struct normalize_type {
+    template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+    __device__ void operator()(void* tp, Integer const value)
+    {
+      (*static_cast<T*>(tp)) = static_cast<T>(value);
+    }
+    template <typename T, std::enable_if_t<not std::is_integral_v<T>>* = nullptr>
+    __device__ void operator()(void*, Integer const)
+    {
+      CUDF_UNREACHABLE("only index types are supported");
+    }
+  };
+
+  /**
+   * @brief Assign an Integer value to the current iterator position
+   */
+  __device__ inline output_normalator const& operator=(Integer const value) const
+  {
+    void* tp = p_;
+    type_dispatcher(this->dtype_, normalize_type{}, tp, value);
+    return *this;
+  }
+
+  /**
+   * @brief Create an output normalizing iterator
+   *
+   * @param data      Pointer to an integer array in device memory.
+   * @param data_type Type of data in data
+   */
+  output_normalator(void* data, data_type dtype)
+    : base_normalator<output_normalator<Integer>, Integer>(dtype), p_{static_cast<char*>(data)}
+  {
+  }
+
+  char* p_;  /// pointer to the integer data in device memory
+};
+
+}  // namespace detail
+}  // namespace cudf

From 517d1239c913c86f7c1d9dc6642434e73aa2b14c Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Fri, 22 Sep 2023 12:40:09 -0700
Subject: [PATCH 7/7] Expose streams in all public sorting APIs (#14146)

Contributes to #925

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - David Wendt (https://github.com/davidwendt)

URL: https://github.com/rapidsai/cudf/pull/14146
---
 cpp/include/cudf/sorting.hpp          |  44 ++++++---
 cpp/src/lists/segmented_sort.cu       |  30 +++---
 cpp/src/sort/is_sorted.cu             |   5 +-
 cpp/src/sort/rank.cu                  |  11 +--
 cpp/src/sort/segmented_sort.cu        |   8 +-
 cpp/src/sort/segmented_sort_impl.cuh  |   2 +-
 cpp/src/sort/sort.cu                  |  10 +-
 cpp/src/sort/stable_segmented_sort.cu |   8 +-
 cpp/src/sort/stable_sort.cu           |   8 +-
 cpp/tests/CMakeLists.txt              |   7 +-
 cpp/tests/streams/sorting_test.cpp    | 132 ++++++++++++++++++++++++++
 11 files changed, 210 insertions(+), 55 deletions(-)
 create mode 100644 cpp/tests/streams/sorting_test.cpp

diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
index 6924e77ae9b..e4e803b2d3c 100644
--- a/cpp/include/cudf/sorting.hpp
+++ b/cpp/include/cudf/sorting.hpp
@@ -18,6 +18,7 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
@@ -43,6 +44,7 @@ namespace cudf {
  * @param null_precedence The desired order of null compared to other elements
  * for each column. Size must be equal to `input.num_columns()` or empty.
  * If empty, all columns will be sorted in `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return A non-nullable column of elements containing the permuted row indices of
  * `input` if it were sorted
@@ -51,6 +53,7 @@ std::unique_ptr<column> sorted_order(
   table_view const& input,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -65,27 +68,30 @@ std::unique_ptr<column> stable_sorted_order(
   table_view const& input,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Checks whether the rows of a `table` are sorted in a lexicographical
  *        order.
  *
- * @param[in] table             Table whose rows need to be compared for ordering
- * @param[in] column_order      The expected sort order for each column. Size
- *                              must be equal to `in.num_columns()` or empty. If
- *                              empty, it is expected all columns are in
- *                              ascending order.
- * @param[in] null_precedence   The desired order of null compared to other
- *                              elements for each column. Size must be equal to
- *                              `input.num_columns()` or empty. If empty,
- *                              `null_order::BEFORE` is assumed for all columns.
- *
- * @returns bool                true if sorted as expected, false if not
+ * @param table             Table whose rows need to be compared for ordering
+ * @param column_order      The expected sort order for each column. Size
+ *                          must be equal to `in.num_columns()` or empty. If
+ *                          empty, it is expected all columns are in
+ *                          ascending order.
+ * @param null_precedence   The desired order of null compared to other
+ *                          elements for each column. Size must be equal to
+ *                          `input.num_columns()` or empty. If empty,
+ *                          `null_order::BEFORE` is assumed for all columns.
+ *
+ * @param stream            CUDA stream used for device memory operations and kernel launches
+ * @returns                 true if sorted as expected, false if not
  */
 bool is_sorted(cudf::table_view const& table,
                std::vector<order> const& column_order,
-               std::vector<null_order> const& null_precedence);
+               std::vector<null_order> const& null_precedence,
+               rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Performs a lexicographic sort of the rows of a table
@@ -98,6 +104,7 @@ bool is_sorted(cudf::table_view const& table,
  * elements for each column in `input`. Size must be equal to
  * `input.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return New table containing the desired sorted order of `input`
  */
@@ -105,6 +112,7 @@ std::unique_ptr<table> sort(
   table_view const& input,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -124,6 +132,7 @@ std::unique_ptr<table> sort(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return The reordering of `values` determined by the lexicographic order of
  * the rows of `keys`.
@@ -133,6 +142,7 @@ std::unique_ptr<table> sort_by_key(
   table_view const& keys,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -154,6 +164,7 @@ std::unique_ptr<table> sort_by_key(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return The reordering of `values` determined by the lexicographic order of
  * the rows of `keys`.
@@ -163,6 +174,7 @@ std::unique_ptr<table> stable_sort_by_key(
   table_view const& keys,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -189,6 +201,7 @@ std::unique_ptr<table> stable_sort_by_key(
  * @param null_precedence The desired order of null compared to other elements
  * for column
  * @param percentage flag to convert ranks to percentage in range (0,1]
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return A column of containing the rank of the each element of the column of `input`. The output
  * column type will be `size_type`column by default or else `double` when
@@ -201,6 +214,7 @@ std::unique_ptr<column> rank(
   null_policy null_handling,
   null_order null_precedence,
   bool percentage,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -241,6 +255,7 @@ std::unique_ptr<column> rank(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to allocate any returned objects
  * @return sorted order of the segment sorted table
  *
@@ -250,6 +265,7 @@ std::unique_ptr<column> segmented_sorted_order(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -262,6 +278,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -306,6 +323,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to allocate any returned objects
  * @return table with elements in each segment sorted
  *
@@ -316,6 +334,7 @@ std::unique_ptr<table> segmented_sort_by_key(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -329,6 +348,7 @@ std::unique_ptr<table> stable_segmented_sort_by_key(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu
index 260636a61cf..49054ebb046 100644
--- a/cpp/src/lists/segmented_sort.cu
+++ b/cpp/src/lists/segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -70,13 +70,13 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
   auto output_offset = build_output_offsets(input, stream, mr);
   auto const child   = input.get_sliced_child(stream);
 
-  auto const sorted_child_table = segmented_sort_by_key(table_view{{child}},
-                                                        table_view{{child}},
-                                                        output_offset->view(),
-                                                        {column_order},
-                                                        {null_precedence},
-                                                        stream,
-                                                        mr);
+  auto const sorted_child_table = cudf::detail::segmented_sort_by_key(table_view{{child}},
+                                                                      table_view{{child}},
+                                                                      output_offset->view(),
+                                                                      {column_order},
+                                                                      {null_precedence},
+                                                                      stream,
+                                                                      mr);
 
   return make_lists_column(input.size(),
                            std::move(output_offset),
@@ -98,13 +98,13 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
   auto output_offset = build_output_offsets(input, stream, mr);
   auto const child   = input.get_sliced_child(stream);
 
-  auto const sorted_child_table = stable_segmented_sort_by_key(table_view{{child}},
-                                                               table_view{{child}},
-                                                               output_offset->view(),
-                                                               {column_order},
-                                                               {null_precedence},
-                                                               stream,
-                                                               mr);
+  auto const sorted_child_table = cudf::detail::stable_segmented_sort_by_key(table_view{{child}},
+                                                                             table_view{{child}},
+                                                                             output_offset->view(),
+                                                                             {column_order},
+                                                                             {null_precedence},
+                                                                             stream,
+                                                                             mr);
 
   return make_lists_column(input.size(),
                            std::move(output_offset),
diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu
index 25c594e9e74..39476a2f534 100644
--- a/cpp/src/sort/is_sorted.cu
+++ b/cpp/src/sort/is_sorted.cu
@@ -73,7 +73,8 @@ bool is_sorted(cudf::table_view const& in,
 
 bool is_sorted(cudf::table_view const& in,
                std::vector<order> const& column_order,
-               std::vector<null_order> const& null_precedence)
+               std::vector<null_order> const& null_precedence,
+               rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   if (in.num_columns() == 0 || in.num_rows() == 0) { return true; }
@@ -89,7 +90,7 @@ bool is_sorted(cudf::table_view const& in,
       "Number of columns in the table doesn't match the vector null_precedence's size .\n");
   }
 
-  return detail::is_sorted(in, column_order, null_precedence, cudf::get_default_stream());
+  return detail::is_sorted(in, column_order, null_precedence, stream);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu
index fd65e38d467..3ead8cfcbaa 100644
--- a/cpp/src/sort/rank.cu
+++ b/cpp/src/sort/rank.cu
@@ -366,16 +366,11 @@ std::unique_ptr<column> rank(column_view const& input,
                              null_policy null_handling,
                              null_order null_precedence,
                              bool percentage,
+                             rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::rank(input,
-                      method,
-                      column_order,
-                      null_handling,
-                      null_precedence,
-                      percentage,
-                      cudf::get_default_stream(),
-                      mr);
+  return detail::rank(
+    input, method, column_order, null_handling, null_precedence, percentage, stream, mr);
 }
 }  // namespace cudf
diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu
index 38d008c120c..d9457341bd2 100644
--- a/cpp/src/sort/segmented_sort.cu
+++ b/cpp/src/sort/segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -81,11 +81,12 @@ std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
                                                column_view const& segment_offsets,
                                                std::vector<order> const& column_order,
                                                std::vector<null_order> const& null_precedence,
+                                               rmm::cuda_stream_view stream,
                                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_sorted_order(
-    keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
@@ -93,11 +94,12 @@ std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
                                              column_view const& segment_offsets,
                                              std::vector<order> const& column_order,
                                              std::vector<null_order> const& null_precedence,
+                                             rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_sort_by_key(
-    values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    values, keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh
index 37664f33762..5d11bf055f1 100644
--- a/cpp/src/sort/segmented_sort_impl.cuh
+++ b/cpp/src/sort/segmented_sort_impl.cuh
@@ -166,7 +166,7 @@ std::unique_ptr<column> fast_segmented_sorted_order(column_view const& input,
   // Unfortunately, CUB's segmented sort functions cannot accept iterators.
   // We have to build a pre-filled sequence of indices as input.
   auto sorted_indices =
-    cudf::detail::sequence(input.size(), numeric_scalar<size_type>{0}, stream, mr);
+    cudf::detail::sequence(input.size(), numeric_scalar<size_type>{0, true, stream}, stream, mr);
   auto indices_view = sorted_indices->mutable_view();
 
   cudf::type_dispatcher<dispatch_storage_type>(input.type(),
diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu
index 25b95af4f83..46edae798d4 100644
--- a/cpp/src/sort/sort.cu
+++ b/cpp/src/sort/sort.cu
@@ -109,30 +109,32 @@ std::unique_ptr<table> sort(table_view const& input,
 std::unique_ptr<column> sorted_order(table_view const& input,
                                      std::vector<order> const& column_order,
                                      std::vector<null_order> const& null_precedence,
+                                     rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sorted_order(input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sorted_order(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> sort(table_view const& input,
                             std::vector<order> const& column_order,
                             std::vector<null_order> const& null_precedence,
+                            rmm::cuda_stream_view stream,
                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sort(input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sort(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> sort_by_key(table_view const& values,
                                    table_view const& keys,
                                    std::vector<order> const& column_order,
                                    std::vector<null_order> const& null_precedence,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sort_by_key(
-    values, keys, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sort_by_key(values, keys, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/stable_segmented_sort.cu b/cpp/src/sort/stable_segmented_sort.cu
index 40df1b50279..4725d65e05d 100644
--- a/cpp/src/sort/stable_segmented_sort.cu
+++ b/cpp/src/sort/stable_segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -55,11 +55,12 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   column_view const& segment_offsets,
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_segmented_sorted_order(
-    keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> stable_segmented_sort_by_key(table_view const& values,
@@ -67,11 +68,12 @@ std::unique_ptr<table> stable_segmented_sort_by_key(table_view const& values,
                                                     column_view const& segment_offsets,
                                                     std::vector<order> const& column_order,
                                                     std::vector<null_order> const& null_precedence,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_segmented_sort_by_key(
-    values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    values, keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu
index 6f5678c4168..cf602dcf1a9 100644
--- a/cpp/src/sort/stable_sort.cu
+++ b/cpp/src/sort/stable_sort.cu
@@ -62,22 +62,22 @@ std::unique_ptr<table> stable_sort_by_key(table_view const& values,
 std::unique_ptr<column> stable_sorted_order(table_view const& input,
                                             std::vector<order> const& column_order,
                                             std::vector<null_order> const& null_precedence,
+                                            rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::stable_sorted_order(
-    input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::stable_sorted_order(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> stable_sort_by_key(table_view const& values,
                                           table_view const& keys,
                                           std::vector<order> const& column_order,
                                           std::vector<null_order> const& null_precedence,
+                                          rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::stable_sort_by_key(
-    values, keys, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::stable_sort_by_key(values, keys, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index ba4921848d7..c7d3e2af19f 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -621,17 +621,18 @@ ConfigureTest(
   STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu
 )
 
-ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
-ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
-ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(
   STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE
   testing
 )
+ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
 
 # ##################################################################################################
diff --git a/cpp/tests/streams/sorting_test.cpp b/cpp/tests/streams/sorting_test.cpp
new file mode 100644
index 00000000000..e481f95bded
--- /dev/null
+++ b/cpp/tests/streams/sorting_test.cpp
@@ -0,0 +1,132 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/sorting.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+class SortingTest : public cudf::test::BaseFixture {};
+
+TEST_F(SortingTest, SortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::sorted_order(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::stable_sorted_order(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, IsSorted)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::is_sorted(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, Sort)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::sort(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, SortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{10, 20, 30, 40, 50};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{10, 20, 30, 40, 50};
+  cudf::table_view const keys{{keys_col}};
+
+  cudf::sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{10, 20, 30, 40, 50};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{10, 20, 30, 40, 50};
+  cudf::table_view const keys{{keys_col}};
+
+  cudf::stable_sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, Rank)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+
+  cudf::rank(column,
+             cudf::rank_method::AVERAGE,
+             cudf::order::ASCENDING,
+             cudf::null_policy::EXCLUDE,
+             cudf::null_order::AFTER,
+             false,
+             cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, SegmentedSortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{3, 7};
+
+  cudf::segmented_sorted_order(keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSegmentedSortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{3, 7};
+
+  cudf::stable_segmented_sorted_order(
+    keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, SegmentedSortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{0, 3, 7, 10};
+
+  cudf::segmented_sort_by_key(
+    values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSegmentedSortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{0, 3, 7, 10};
+
+  cudf::stable_segmented_sort_by_key(
+    values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}