diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml
deleted file mode 100644
index b301c56a999..00000000000
--- a/.github/workflows/add_to_project.yml
+++ /dev/null
@@ -1,20 +0,0 @@
-name: Add new issue/PR to project
-
-on:
-  issues:
-    types:
-      - opened
-
-  pull_request_target:
-    types:
-      - opened
-
-jobs:
-  add-to-project:
-    name: Add issue or PR to project
-    runs-on: ubuntu-latest
-    steps:
-      - uses: actions/add-to-project@v0.3.0
-        with:
-          project-url: https://github.com/orgs/rapidsai/projects/51
-          github-token: ${{ secrets.ADD_TO_PROJECT_GITHUB_TOKEN }}
diff --git a/.github/workflows/new-issues-to-triage-projects.yml b/.github/workflows/new-issues-to-triage-projects.yml
deleted file mode 100644
index cf9b0c379f1..00000000000
--- a/.github/workflows/new-issues-to-triage-projects.yml
+++ /dev/null
@@ -1,35 +0,0 @@
-name: Auto Assign New Issues to Triage Project
-
-on:
-  issues:
-    types: [opened]
-
-env:
-  GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-
-jobs:
-  assign_one_project:
-    runs-on: ubuntu-latest
-    name: Assign to New Issues to Triage Project
-    steps:
-    - name: Process bug issues
-      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
-      if: contains(github.event.issue.labels.*.name, 'bug') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/1
-        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
-    - name: Process feature issues
-      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
-      if: contains(github.event.issue.labels.*.name, 'feature request') && contains(github.event.issue.labels.*.name, '? - Needs Triage')
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/9
-        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
-    - name: Process other issues
-      uses: docker://takanabe/github-actions-automate-projects:v0.0.1
-      if: contains(github.event.issue.labels.*.name, '? - Needs Triage') && (!contains(github.event.issue.labels.*.name, 'bug') && !contains(github.event.issue.labels.*.name, 'feature request'))
-      env:
-        GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
-        GITHUB_PROJECT_URL: https://github.com/rapidsai/cudf/projects/10
-        GITHUB_PROJECT_COLUMN_NAME: 'Needs prioritizing'
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index d4abc28cf13..9fb991f9075 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -9,6 +9,7 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
+- aws-sdk-cpp<1.11
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 9a98e400e6d..9ba0dd8dc38 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -9,6 +9,7 @@ channels:
 - nvidia
 dependencies:
 - aiobotocore>=2.2.0
+- aws-sdk-cpp<1.11
 - benchmark==1.8.0
 - boto3>=1.21.21
 - botocore>=1.24.21
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 25b3f19de77..b1f5b083e06 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -22,6 +22,9 @@ gbench_version:
 gtest_version:
   - ">=1.13.0"
 
+aws_sdk_cpp_version:
+  - "<1.11"
+
 libarrow_version:
   - "=12"
 
diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 627065817ba..28357f0d96d 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -74,6 +74,7 @@ requirements:
     - gtest {{ gtest_version }}
     - gmock {{ gtest_version }}
     - zlib {{ zlib_version }}
+    - aws-sdk-cpp {{ aws_sdk_cpp_version }}
 
 outputs:
   - name: libcudf
@@ -107,6 +108,7 @@ outputs:
         - dlpack {{ dlpack_version }}
         - gtest {{ gtest_version }}
         - gmock {{ gtest_version }}
+        - aws-sdk-cpp {{ aws_sdk_cpp_version }}
     test:
       commands:
         - test -f $PREFIX/lib/libcudf.so
diff --git a/cpp/benchmarks/text/ngrams.cpp b/cpp/benchmarks/text/ngrams.cpp
index 0319577f6b9..f3fd5cc5729 100644
--- a/cpp/benchmarks/text/ngrams.cpp
+++ b/cpp/benchmarks/text/ngrams.cpp
@@ -36,11 +36,12 @@ static void BM_ngrams(benchmark::State& state, ngrams_type nt)
     cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length);
   auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile);
   cudf::strings_column_view input(column->view());
+  auto const separator = cudf::string_scalar("_");
 
   for (auto _ : state) {
     cuda_event_timer raii(state, true);
     switch (nt) {
-      case ngrams_type::tokens: nvtext::generate_ngrams(input); break;
+      case ngrams_type::tokens: nvtext::generate_ngrams(input, 2, separator); break;
       case ngrams_type::characters: nvtext::generate_character_ngrams(input); break;
     }
   }
diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp
index 423fe667b05..b556a84c541 100644
--- a/cpp/benchmarks/text/tokenize.cpp
+++ b/cpp/benchmarks/text/tokenize.cpp
@@ -67,8 +67,11 @@ static void bench_tokenize(nvbench::state& state)
       auto result = nvtext::count_tokens(input, cudf::strings_column_view(delimiters));
     });
   } else if (tokenize_type == "ngrams") {
-    state.exec(nvbench::exec_tag::sync,
-               [&](nvbench::launch& launch) { auto result = nvtext::ngrams_tokenize(input); });
+    auto const delimiter = cudf::string_scalar("");
+    auto const separator = cudf::string_scalar("_");
+    state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
+      auto result = nvtext::ngrams_tokenize(input, 2, delimiter, separator);
+    });
   } else if (tokenize_type == "characters") {
     state.exec(nvbench::exec_tag::sync,
                [&](nvbench::launch& launch) { auto result = nvtext::character_tokenize(input); });
diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index 4731c4919e3..6532dae3695 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -16,14 +16,13 @@
 
 #pragma once
 
+#include <cudf/detail/normalizing_iterator.cuh>
+
 #include <cudf/column/column_view.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/scalar/scalar.hpp>
 #include <cudf/utilities/traits.hpp>
 
-#include <thrust/binary_search.h>
-#include <thrust/execution_policy.h>
-#include <thrust/functional.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/iterator/transform_iterator.h>
 #include <thrust/optional.h>
@@ -32,193 +31,6 @@
 namespace cudf {
 namespace detail {
 
-/**
- * @brief The base class for the input or output index normalizing iterator.
- *
- * This implementation uses CRTP to define the `input_indexalator` and the
- * `output_indexalator` classes. This is so this class can manipulate the
- * uniquely typed subclass member variable `p_` directly without requiring
- * virtual functions since iterator instances will be copied to device memory.
- *
- * The base class mainly manages updating the `p_` member variable while the
- * subclasses handle accessing individual elements in device memory.
- *
- * @tparam T The derived class type for the iterator.
- */
-template <class T>
-struct base_indexalator {
-  using difference_type   = ptrdiff_t;
-  using value_type        = size_type;
-  using pointer           = size_type*;
-  using iterator_category = std::random_access_iterator_tag;
-
-  base_indexalator()                                   = default;
-  base_indexalator(base_indexalator const&)            = default;
-  base_indexalator(base_indexalator&&)                 = default;
-  base_indexalator& operator=(base_indexalator const&) = default;
-  base_indexalator& operator=(base_indexalator&&)      = default;
-
-  /**
-   * @brief Prefix increment operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator++()
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ += width_;
-    return derived;
-  }
-
-  /**
-   * @brief Postfix increment operator.
-   */
-  CUDF_HOST_DEVICE inline T operator++(int)
-  {
-    T tmp{static_cast<T&>(*this)};
-    operator++();
-    return tmp;
-  }
-
-  /**
-   * @brief Prefix decrement operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator--()
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ -= width_;
-    return derived;
-  }
-
-  /**
-   * @brief Postfix decrement operator.
-   */
-  CUDF_HOST_DEVICE inline T operator--(int)
-  {
-    T tmp{static_cast<T&>(*this)};
-    operator--();
-    return tmp;
-  }
-
-  /**
-   * @brief Compound assignment by sum operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator+=(difference_type offset)
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ += offset * width_;
-    return derived;
-  }
-
-  /**
-   * @brief Increment by offset operator.
-   */
-  CUDF_HOST_DEVICE inline T operator+(difference_type offset) const
-  {
-    auto tmp = T{static_cast<T const&>(*this)};
-    tmp.p_ += (offset * width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Addition assignment operator.
-   */
-  CUDF_HOST_DEVICE inline friend T operator+(difference_type offset, T const& rhs)
-  {
-    T tmp{rhs};
-    tmp.p_ += (offset * rhs.width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Compound assignment by difference operator.
-   */
-  CUDF_HOST_DEVICE inline T& operator-=(difference_type offset)
-  {
-    T& derived = static_cast<T&>(*this);
-    derived.p_ -= offset * width_;
-    return derived;
-  }
-
-  /**
-   * @brief Decrement by offset operator.
-   */
-  CUDF_HOST_DEVICE inline T operator-(difference_type offset) const
-  {
-    auto tmp = T{static_cast<T const&>(*this)};
-    tmp.p_ -= (offset * width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Subtraction assignment operator.
-   */
-  CUDF_HOST_DEVICE inline friend T operator-(difference_type offset, T const& rhs)
-  {
-    T tmp{rhs};
-    tmp.p_ -= (offset * rhs.width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Compute offset from iterator difference operator.
-   */
-  CUDF_HOST_DEVICE inline difference_type operator-(T const& rhs) const
-  {
-    return (static_cast<T const&>(*this).p_ - rhs.p_) / width_;
-  }
-
-  /**
-   * @brief Equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator==(T const& rhs) const
-  {
-    return rhs.p_ == static_cast<T const&>(*this).p_;
-  }
-  /**
-   * @brief Not equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator!=(T const& rhs) const
-  {
-    return rhs.p_ != static_cast<T const&>(*this).p_;
-  }
-  /**
-   * @brief Less than operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator<(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ < rhs.p_;
-  }
-  /**
-   * @brief Greater than operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator>(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ > rhs.p_;
-  }
-  /**
-   * @brief Less than or equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator<=(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ <= rhs.p_;
-  }
-  /**
-   * @brief Greater than or equals to operator.
-   */
-  CUDF_HOST_DEVICE inline bool operator>=(T const& rhs) const
-  {
-    return static_cast<T const&>(*this).p_ >= rhs.p_;
-  }
-
- protected:
-  /**
-   * @brief Constructor assigns width and type member variables for base class.
-   */
-  base_indexalator(int32_t width, data_type dtype) : width_(width), dtype_(dtype) {}
-
-  int width_;        /// integer type width = 1,2,4, or 8
-  data_type dtype_;  /// for type-dispatcher calls
-};
-
 /**
  * @brief The index normalizing input iterator.
  *
@@ -244,65 +56,7 @@ struct base_indexalator {
  *  auto result = thrust::find(thrust::device, begin, end, size_type{12} );
  * @endcode
  */
-struct input_indexalator : base_indexalator<input_indexalator> {
-  friend struct indexalator_factory;
-  friend struct base_indexalator<input_indexalator>;  // for CRTP
-
-  using reference = size_type const;  // this keeps STL and thrust happy
-
-  input_indexalator()                                    = default;
-  input_indexalator(input_indexalator const&)            = default;
-  input_indexalator(input_indexalator&&)                 = default;
-  input_indexalator& operator=(input_indexalator const&) = default;
-  input_indexalator& operator=(input_indexalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns the value at the current iterator position.
-   */
-  __device__ inline size_type operator*() const { return operator[](0); }
-
-  /**
-   * @brief Dispatch functor for resolving a size_type value from any index type.
-   */
-  struct index_as_size_type {
-    template <typename T, std::enable_if_t<is_index_type<T>()>* = nullptr>
-    __device__ size_type operator()(void const* tp)
-    {
-      return static_cast<size_type>(*static_cast<T const*>(tp));
-    }
-    template <typename T, std::enable_if_t<not is_index_type<T>()>* = nullptr>
-    __device__ size_type operator()(void const* tp)
-    {
-      CUDF_UNREACHABLE("only index types are supported");
-    }
-  };
-  /**
-   * @brief Array subscript operator returns a value at the input
-   * `idx` position as a `size_type` value.
-   */
-  __device__ inline size_type operator[](size_type idx) const
-  {
-    void const* tp = p_ + (idx * width_);
-    return type_dispatcher(dtype_, index_as_size_type{}, tp);
-  }
-
- protected:
-  /**
-   * @brief Create an input index normalizing iterator.
-   *
-   * Use the indexalator_factory to create an iterator instance.
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param width     The width of the integer type (1, 2, 4, or 8)
-   * @param data_type Index integer type of width `width`
-   */
-  input_indexalator(void const* data, int width, data_type dtype)
-    : base_indexalator<input_indexalator>(width, dtype), p_{static_cast<char const*>(data)}
-  {
-  }
-
-  char const* p_;  /// pointer to the integer data in device memory
-};
+using input_indexalator = input_normalator<cudf::size_type>;
 
 /**
  * @brief The index normalizing output iterator.
@@ -328,79 +82,7 @@ struct input_indexalator : base_indexalator<input_indexalator> {
  *                      thrust::less<Element>());
  * @endcode
  */
-struct output_indexalator : base_indexalator<output_indexalator> {
-  friend struct indexalator_factory;
-  friend struct base_indexalator<output_indexalator>;  // for CRTP
-
-  using reference = output_indexalator const&;  // required for output iterators
-
-  output_indexalator()                                     = default;
-  output_indexalator(output_indexalator const&)            = default;
-  output_indexalator(output_indexalator&&)                 = default;
-  output_indexalator& operator=(output_indexalator const&) = default;
-  output_indexalator& operator=(output_indexalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns this iterator instance in order
-   * to capture the `operator=(size_type)` calls.
-   */
-  __device__ inline output_indexalator const& operator*() const { return *this; }
-
-  /**
-   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
-   *
-   * This allows capturing the subsequent `operator=(size_type)` call in this class.
-   */
-  __device__ inline output_indexalator const operator[](size_type idx) const
-  {
-    output_indexalator tmp{*this};
-    tmp.p_ += (idx * width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Dispatch functor for setting the index value from a size_type value.
-   */
-  struct size_type_to_index {
-    template <typename T, std::enable_if_t<is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void* tp, size_type const value)
-    {
-      (*static_cast<T*>(tp)) = static_cast<T>(value);
-    }
-    template <typename T, std::enable_if_t<not is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void* tp, size_type const value)
-    {
-      CUDF_UNREACHABLE("only index types are supported");
-    }
-  };
-
-  /**
-   * @brief Assign a size_type value to the current iterator position.
-   */
-  __device__ inline output_indexalator const& operator=(size_type const value) const
-  {
-    void* tp = p_;
-    type_dispatcher(dtype_, size_type_to_index{}, tp, value);
-    return *this;
-  }
-
- protected:
-  /**
-   * @brief Create an output index normalizing iterator.
-   *
-   * Use the indexalator_factory to create an iterator instance.
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param width     The width of the integer type (1, 2, 4, or 8)
-   * @param data_type Index integer type of width `width`
-   */
-  output_indexalator(void* data, int width, data_type dtype)
-    : base_indexalator<output_indexalator>(width, dtype), p_{static_cast<char*>(data)}
-  {
-  }
-
-  char* p_;  /// pointer to the integer data in device memory
-};
+using output_indexalator = output_normalator<cudf::size_type>;
 
 /**
  * @brief Use this class to create an indexalator instance.
@@ -413,7 +95,7 @@ struct indexalator_factory {
     template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
     input_indexalator operator()(column_view const& indices)
     {
-      return input_indexalator(indices.data<IndexType>(), sizeof(IndexType), indices.type());
+      return input_indexalator(indices.data<IndexType>(), indices.type());
     }
     template <typename IndexType,
               typename... Args,
@@ -433,7 +115,7 @@ struct indexalator_factory {
     {
       // note: using static_cast<scalar_type_t<IndexType> const&>(index) creates a copy
       auto const scalar_impl = static_cast<scalar_type_t<IndexType> const*>(&index);
-      return input_indexalator(scalar_impl->data(), sizeof(IndexType), index.type());
+      return input_indexalator(scalar_impl->data(), index.type());
     }
     template <typename IndexType,
               typename... Args,
@@ -451,7 +133,7 @@ struct indexalator_factory {
     template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
     output_indexalator operator()(mutable_column_view const& indices)
     {
-      return output_indexalator(indices.data<IndexType>(), sizeof(IndexType), indices.type());
+      return output_indexalator(indices.data<IndexType>(), indices.type());
     }
     template <typename IndexType,
               typename... Args,
diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh
new file mode 100644
index 00000000000..51b3133f84f
--- /dev/null
+++ b/cpp/include/cudf/detail/normalizing_iterator.cuh
@@ -0,0 +1,367 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/utilities/type_dispatcher.hpp>
+
+#include <type_traits>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief The base class for the input or output normalizing iterator
+ *
+ * Uses CRTP: this base updates the derived class's `p_` member directly while the
+ * subclasses handle accessing individual elements in device memory.
+ *
+ * @tparam Derived The derived class type for the iterator
+ * @tparam Integer The type the iterator normalizes to
+ */
+template <class Derived, typename Integer>
+struct base_normalator {
+  static_assert(std::is_integral_v<Integer>);
+  using difference_type   = std::ptrdiff_t;
+  using value_type        = Integer;
+  using pointer           = Integer*;
+  using iterator_category = std::random_access_iterator_tag;
+
+  base_normalator()                                  = default;
+  base_normalator(base_normalator const&)            = default;
+  base_normalator(base_normalator&&)                 = default;
+  base_normalator& operator=(base_normalator const&) = default;
+  base_normalator& operator=(base_normalator&&)      = default;
+
+  /**
+   * @brief Prefix increment operator; advances `p_` by `width_`.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator++()
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ += width_;
+    return derived;
+  }
+
+  /**
+   * @brief Postfix increment operator; returns a copy taken before advancing.
+   */
+  CUDF_HOST_DEVICE inline Derived operator++(int)
+  {
+    Derived tmp{static_cast<Derived&>(*this)};
+    operator++();
+    return tmp;
+  }
+
+  /**
+   * @brief Prefix decrement operator; moves `p_` back by `width_`.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator--()
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ -= width_;
+    return derived;
+  }
+
+  /**
+   * @brief Postfix decrement operator; returns a copy taken before moving back.
+   */
+  CUDF_HOST_DEVICE inline Derived operator--(int)
+  {
+    Derived tmp{static_cast<Derived&>(*this)};
+    operator--();
+    return tmp;
+  }
+
+  /**
+   * @brief Compound assignment by sum operator; advances `p_` by `offset * width_`.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator+=(difference_type offset)
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ += offset * width_;
+    return derived;
+  }
+
+  /**
+   * @brief Increment by offset operator (`iterator + offset`); returns a new iterator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator+(difference_type offset) const
+  {
+    auto tmp = Derived{static_cast<Derived const&>(*this)};
+    tmp.p_ += (offset * width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Offset plus iterator operator (`offset + iterator`); returns a new iterator.
+   */
+  CUDF_HOST_DEVICE inline friend Derived operator+(difference_type offset, Derived const& rhs)
+  {
+    Derived tmp{rhs};
+    tmp.p_ += (offset * rhs.width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Compound assignment by difference operator; moves `p_` back by `offset * width_`.
+   */
+  CUDF_HOST_DEVICE inline Derived& operator-=(difference_type offset)
+  {
+    Derived& derived = static_cast<Derived&>(*this);
+    derived.p_ -= offset * width_;
+    return derived;
+  }
+
+  /**
+   * @brief Decrement by offset operator (`iterator - offset`); returns a new iterator.
+   */
+  CUDF_HOST_DEVICE inline Derived operator-(difference_type offset) const
+  {
+    auto tmp = Derived{static_cast<Derived const&>(*this)};
+    tmp.p_ -= (offset * width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Offset minus iterator operator; returns a copy of `rhs` moved back by `offset`.
+   */
+  CUDF_HOST_DEVICE inline friend Derived operator-(difference_type offset, Derived const& rhs)
+  {
+    Derived tmp{rhs};
+    tmp.p_ -= (offset * rhs.width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Iterator difference operator; the `p_` distance divided by `width_`.
+   */
+  CUDF_HOST_DEVICE inline difference_type operator-(Derived const& rhs) const
+  {
+    return (static_cast<Derived const&>(*this).p_ - rhs.p_) / width_;
+  }
+
+  /**
+   * @brief Equals to operator; compares the `p_` members.
+   */
+  CUDF_HOST_DEVICE inline bool operator==(Derived const& rhs) const
+  {
+    return rhs.p_ == static_cast<Derived const&>(*this).p_;
+  }
+
+  /**
+   * @brief Not equals to operator; compares the `p_` members.
+   */
+  CUDF_HOST_DEVICE inline bool operator!=(Derived const& rhs) const
+  {
+    return rhs.p_ != static_cast<Derived const&>(*this).p_;
+  }
+
+  /**
+   * @brief Less than operator; compares the `p_` members.
+   */
+  CUDF_HOST_DEVICE inline bool operator<(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ < rhs.p_;
+  }
+
+  /**
+   * @brief Greater than operator; compares the `p_` members.
+   */
+  CUDF_HOST_DEVICE inline bool operator>(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ > rhs.p_;
+  }
+
+  /**
+   * @brief Less than or equals to operator; compares the `p_` members.
+   */
+  CUDF_HOST_DEVICE inline bool operator<=(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ <= rhs.p_;
+  }
+
+  /**
+   * @brief Greater than or equals to operator; compares the `p_` members.
+   */
+  CUDF_HOST_DEVICE inline bool operator>=(Derived const& rhs) const
+  {
+    return static_cast<Derived const&>(*this).p_ >= rhs.p_;
+  }
+
+ protected:
+  /**
+   * @brief Constructor stores `dtype` and derives the element width from it via `size_of`.
+   */
+  explicit base_normalator(data_type dtype) : width_(size_of(dtype)), dtype_(dtype) {}
+
+  int width_;        /// integer type width = 1,2,4, or 8
+  data_type dtype_;  /// for type-dispatcher calls
+};
+
+/**
+ * @brief The integer normalizing input iterator
+ *
+ * This iterator reads an array of integers without requiring a
+ * type-specific instance. The stored elements may be any of
+ * int8, int16, int32, int64, uint8, uint16, uint32, or uint64;
+ * the actual type is resolved at access time from the `data_type`.
+ * Reading an element always returns a value of type `Integer`.
+ *
+ * @tparam Integer Type returned by all read functions
+ */
+template <typename Integer>
+struct input_normalator : base_normalator<input_normalator<Integer>, Integer> {
+  friend struct base_normalator<input_normalator<Integer>, Integer>;  // for CRTP
+
+  using reference = Integer const;  // this keeps STL and thrust happy
+
+  input_normalator()                                   = default;
+  input_normalator(input_normalator const&)            = default;
+  input_normalator(input_normalator&&)                 = default;
+  input_normalator& operator=(input_normalator const&) = default;
+  input_normalator& operator=(input_normalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns the value at the current iterator position
+   */
+  __device__ inline Integer operator*() const { return operator[](0); }
+
+  /**
+   * @brief Dispatch functor for resolving an Integer value from any integer type
+   */
+  struct normalize_type {
+    template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+    __device__ Integer operator()(void const* tp)
+    {
+      return static_cast<Integer>(*static_cast<T const*>(tp));
+    }
+    template <typename T, std::enable_if_t<not std::is_integral_v<T>>* = nullptr>
+    __device__ Integer operator()(void const*)
+    {
+      CUDF_UNREACHABLE("only integral types are supported");
+    }
+  };
+
+  /**
+   * @brief Array subscript operator returns the value at the input `idx` position as an
+   * `Integer` value; the offset is widened so `idx * width_` cannot overflow `int`.
+   */
+  __device__ inline Integer operator[](size_type idx) const
+  {
+    void const* tp = p_ + (static_cast<std::ptrdiff_t>(idx) * this->width_);
+    return type_dispatcher(this->dtype_, normalize_type{}, tp);
+  }
+
+  /**
+   * @brief Create an input normalizing iterator
+   *
+   * The element width is derived from `dtype` by the base class.
+   *
+   * @param data  Pointer to an integer array in device memory.
+   * @param dtype Type of the integers stored in `data`
+   */
+  input_normalator(void const* data, data_type dtype)
+    : base_normalator<input_normalator<Integer>, Integer>(dtype), p_{static_cast<char const*>(data)}
+  {
+  }
+
+  char const* p_;  /// pointer to the integer data in device memory
+};
+
+/**
+ * @brief The integer normalizing output iterator
+ *
+ * This iterator writes an array of integers without requiring a
+ * type-specific instance. The stored elements may be any of
+ * int8, int16, int32, int64, uint8, uint16, uint32, or uint64;
+ * the actual type is resolved at write time from the `data_type`.
+ * Setting an element always accepts a value of type `Integer`.
+ *
+ * @tparam Integer The type used for all write functions
+ */
+template <typename Integer>
+struct output_normalator : base_normalator<output_normalator<Integer>, Integer> {
+  friend struct base_normalator<output_normalator<Integer>, Integer>;  // for CRTP
+
+  using reference = output_normalator const&;  // required for output iterators
+
+  output_normalator()                                    = default;
+  output_normalator(output_normalator const&)            = default;
+  output_normalator(output_normalator&&)                 = default;
+  output_normalator& operator=(output_normalator const&) = default;
+  output_normalator& operator=(output_normalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns this iterator instance in order
+   * to capture the `operator=(Integer)` calls.
+   */
+  __device__ inline output_normalator const& operator*() const { return *this; }
+
+  /**
+   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
+   *
+   * Captures the next `operator=(Integer)` call; offset is widened so it cannot overflow `int`.
+   */
+  __device__ inline output_normalator const operator[](size_type idx) const
+  {
+    output_normalator tmp{*this};
+    tmp.p_ += (static_cast<std::ptrdiff_t>(idx) * this->width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Dispatch functor for storing an Integer value as any integer type
+   */
+  struct normalize_type {
+    template <typename T, std::enable_if_t<std::is_integral_v<T>>* = nullptr>
+    __device__ void operator()(void* tp, Integer const value)
+    {
+      (*static_cast<T*>(tp)) = static_cast<T>(value);
+    }
+    template <typename T, std::enable_if_t<not std::is_integral_v<T>>* = nullptr>
+    __device__ void operator()(void*, Integer const)
+    {
+      CUDF_UNREACHABLE("only integral types are supported");
+    }
+  };
+
+  /**
+   * @brief Assign an Integer value to the current iterator position
+   */
+  __device__ inline output_normalator const& operator=(Integer const value) const
+  {
+    void* tp = p_;
+    type_dispatcher(this->dtype_, normalize_type{}, tp, value);
+    return *this;
+  }
+
+  /**
+   * @brief Create an output normalizing iterator
+   *
+   * @param data  Pointer to an integer array in device memory.
+   * @param dtype Type of the integers stored in `data`
+   */
+  output_normalator(void* data, data_type dtype)
+    : base_normalator<output_normalator<Integer>, Integer>(dtype), p_{static_cast<char*>(data)}
+  {
+  }
+
+  char* p_;  /// pointer to the integer data in device memory
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp
index 6924e77ae9b..e4e803b2d3c 100644
--- a/cpp/include/cudf/sorting.hpp
+++ b/cpp/include/cudf/sorting.hpp
@@ -18,6 +18,7 @@
 
 #include <cudf/aggregation.hpp>
 #include <cudf/types.hpp>
+#include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/mr/device/per_device_resource.hpp>
 
@@ -43,6 +44,7 @@ namespace cudf {
  * @param null_precedence The desired order of null compared to other elements
  * for each column. Size must be equal to `input.num_columns()` or empty.
  * If empty, all columns will be sorted in `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return A non-nullable column of elements containing the permuted row indices of
  * `input` if it were sorted
@@ -51,6 +53,7 @@ std::unique_ptr<column> sorted_order(
   table_view const& input,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -65,27 +68,30 @@ std::unique_ptr<column> stable_sorted_order(
   table_view const& input,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Checks whether the rows of a `table` are sorted in a lexicographical
  *        order.
  *
- * @param[in] table             Table whose rows need to be compared for ordering
- * @param[in] column_order      The expected sort order for each column. Size
- *                              must be equal to `in.num_columns()` or empty. If
- *                              empty, it is expected all columns are in
- *                              ascending order.
- * @param[in] null_precedence   The desired order of null compared to other
- *                              elements for each column. Size must be equal to
- *                              `input.num_columns()` or empty. If empty,
- *                              `null_order::BEFORE` is assumed for all columns.
- *
- * @returns bool                true if sorted as expected, false if not
+ * @param table             Table whose rows need to be compared for ordering
+ * @param column_order      The expected sort order for each column. Size
+ *                          must be equal to `table.num_columns()` or empty. If
+ *                          empty, it is expected all columns are in
+ *                          ascending order.
+ * @param null_precedence   The desired order of null compared to other
+ *                          elements for each column. Size must be equal to
+ *                          `table.num_columns()` or empty. If empty,
+ *                          `null_order::BEFORE` is assumed for all columns.
+ * @param stream            CUDA stream used for device memory operations and kernel launches
+ *
+ * @returns                 true if sorted as expected, false if not
  */
 bool is_sorted(cudf::table_view const& table,
                std::vector<order> const& column_order,
-               std::vector<null_order> const& null_precedence);
+               std::vector<null_order> const& null_precedence,
+               rmm::cuda_stream_view stream = cudf::get_default_stream());
 
 /**
  * @brief Performs a lexicographic sort of the rows of a table
@@ -98,6 +104,7 @@ bool is_sorted(cudf::table_view const& table,
  * elements for each column in `input`. Size must be equal to
  * `input.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return New table containing the desired sorted order of `input`
  */
@@ -105,6 +112,7 @@ std::unique_ptr<table> sort(
   table_view const& input,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -124,6 +132,7 @@ std::unique_ptr<table> sort(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return The reordering of `values` determined by the lexicographic order of
  * the rows of `keys`.
@@ -133,6 +142,7 @@ std::unique_ptr<table> sort_by_key(
   table_view const& keys,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -154,6 +164,7 @@ std::unique_ptr<table> sort_by_key(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned table's device memory
  * @return The reordering of `values` determined by the lexicographic order of
  * the rows of `keys`.
@@ -163,6 +174,7 @@ std::unique_ptr<table> stable_sort_by_key(
   table_view const& keys,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -189,6 +201,7 @@ std::unique_ptr<table> stable_sort_by_key(
  * @param null_precedence The desired order of null compared to other elements
  * for column
  * @param percentage flag to convert ranks to percentage in range (0,1]
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
  * @return A column of containing the rank of the each element of the column of `input`. The output
  * column type will be `size_type`column by default or else `double` when
@@ -201,6 +214,7 @@ std::unique_ptr<column> rank(
   null_policy null_handling,
   null_order null_precedence,
   bool percentage,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -241,6 +255,7 @@ std::unique_ptr<column> rank(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to allocate any returned objects
  * @return sorted order of the segment sorted table
  *
@@ -250,6 +265,7 @@ std::unique_ptr<column> segmented_sorted_order(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -262,6 +278,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -306,6 +323,7 @@ std::unique_ptr<column> stable_segmented_sorted_order(
  * elements for each column in `keys`. Size must be equal to
  * `keys.num_columns()` or empty. If empty, all columns will be sorted with
  * `null_order::BEFORE`.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to allocate any returned objects
  * @return table with elements in each segment sorted
  *
@@ -316,6 +334,7 @@ std::unique_ptr<table> segmented_sort_by_key(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /**
@@ -329,6 +348,7 @@ std::unique_ptr<table> stable_segmented_sort_by_key(
   column_view const& segment_offsets,
   std::vector<order> const& column_order         = {},
   std::vector<null_order> const& null_precedence = {},
+  rmm::cuda_stream_view stream                   = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr            = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/nvtext/generate_ngrams.hpp b/cpp/include/nvtext/generate_ngrams.hpp
index 5d66401df9d..46f2c0e7bc9 100644
--- a/cpp/include/nvtext/generate_ngrams.hpp
+++ b/cpp/include/nvtext/generate_ngrams.hpp
@@ -47,19 +47,19 @@ namespace nvtext {
  * @throw cudf::logic_error if `separator` is invalid
  * @throw cudf::logic_error if there are not enough strings to generate any ngrams
  *
- * @param strings Strings column to tokenize and produce ngrams from.
- * @param ngrams The ngram number to generate.
- *               Default is 2 = bigram.
- * @param separator The string to use for separating ngram tokens.
- *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param input Strings column to tokenize and produce ngrams from
+ * @param ngrams The ngram number to generate
+ * @param separator The string to use for separating ngram tokens
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column of tokens
  */
 std::unique_ptr<cudf::column> generate_ngrams(
-  cudf::strings_column_view const& strings,
-  cudf::size_type ngrams               = 2,
-  cudf::string_scalar const& separator = cudf::string_scalar{"_"},
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  cudf::strings_column_view const& input,
+  cudf::size_type ngrams,
+  cudf::string_scalar const& separator,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
  * @brief Generates ngrams of characters within each string.
@@ -79,15 +79,17 @@ std::unique_ptr<cudf::column> generate_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate.
  *               Default is 2 = bigram.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column of tokens
  */
 std::unique_ptr<cudf::column> generate_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 2,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -113,14 +115,16 @@ std::unique_ptr<cudf::column> generate_character_ngrams(
  * @throw cudf::logic_error if `ngrams < 2`
  * @throw cudf::logic_error if there are not enough characters to generate any ngrams
  *
- * @param strings Strings column to produce ngrams from.
+ * @param input Strings column to produce ngrams from
  * @param ngrams The ngram number to generate. Default is 5.
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return A lists column of hash values
  */
 std::unique_ptr<cudf::column> hash_character_ngrams(
-  cudf::strings_column_view const& strings,
+  cudf::strings_column_view const& input,
   cudf::size_type ngrams              = 5,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/nvtext/ngrams_tokenize.hpp b/cpp/include/nvtext/ngrams_tokenize.hpp
index 17f20f7ea4c..9d76ef8689f 100644
--- a/cpp/include/nvtext/ngrams_tokenize.hpp
+++ b/cpp/include/nvtext/ngrams_tokenize.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,22 +66,22 @@ namespace nvtext {
  *
  * All null row entries are ignored and the output contains all valid rows.
  *
- * @param strings Strings column to tokenize and produce ngrams from.
- * @param ngrams The ngram number to generate.
- *               Default is 2 = bigram.
+ * @param input Strings column to tokenize and produce ngrams from
+ * @param ngrams The ngram number to generate
  * @param delimiter UTF-8 characters used to separate each string into tokens.
- *                  The default of empty string will separate tokens using whitespace.
- * @param separator The string to use for separating ngram tokens.
- *                  Default is "_" character.
- * @param mr Device memory resource used to allocate the returned column's device memory.
- * @return New strings columns of tokens.
+ *                  An empty string will separate tokens using whitespace.
+ * @param separator The string to use for separating ngram tokens
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column of tokens
  */
 std::unique_ptr<cudf::column> ngrams_tokenize(
-  cudf::strings_column_view const& strings,
-  cudf::size_type ngrams               = 2,
-  cudf::string_scalar const& delimiter = cudf::string_scalar{""},
-  cudf::string_scalar const& separator = cudf::string_scalar{"_"},
-  rmm::mr::device_memory_resource* mr  = rmm::mr::get_current_device_resource());
+  cudf::strings_column_view const& input,
+  cudf::size_type ngrams,
+  cudf::string_scalar const& delimiter,
+  cudf::string_scalar const& separator,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
 }  // namespace nvtext
diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu
index f8e7b4c6126..40a14d805e1 100644
--- a/cpp/src/lists/count_elements.cu
+++ b/cpp/src/lists/count_elements.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -36,12 +36,12 @@ namespace cudf {
 namespace lists {
 namespace detail {
 /**
- * @brief Returns a numeric column containing lengths of each element.
+ * @brief Returns a numeric column containing lengths of each element
  *
- * @param input Input lists column.
- * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param input Input lists column
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate the returned column's device memory
- * @return New INT32 column with lengths.
+ * @return New size_type column with lengths
  */
 std::unique_ptr<column> count_elements(lists_column_view const& input,
                                        rmm::cuda_stream_view stream,
@@ -52,7 +52,7 @@ std::unique_ptr<column> count_elements(lists_column_view const& input,
   // create output column
   auto output = make_fixed_width_column(data_type{type_to_id<size_type>()},
                                         input.size(),
-                                        copy_bitmask(input.parent()),
+                                        cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                         input.null_count(),
                                         stream,
                                         mr);
diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu
index 260636a61cf..49054ebb046 100644
--- a/cpp/src/lists/segmented_sort.cu
+++ b/cpp/src/lists/segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -70,13 +70,13 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
   auto output_offset = build_output_offsets(input, stream, mr);
   auto const child   = input.get_sliced_child(stream);
 
-  auto const sorted_child_table = segmented_sort_by_key(table_view{{child}},
-                                                        table_view{{child}},
-                                                        output_offset->view(),
-                                                        {column_order},
-                                                        {null_precedence},
-                                                        stream,
-                                                        mr);
+  auto const sorted_child_table = cudf::detail::segmented_sort_by_key(table_view{{child}},
+                                                                      table_view{{child}},
+                                                                      output_offset->view(),
+                                                                      {column_order},
+                                                                      {null_precedence},
+                                                                      stream,
+                                                                      mr);
 
   return make_lists_column(input.size(),
                            std::move(output_offset),
@@ -98,13 +98,13 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
   auto output_offset = build_output_offsets(input, stream, mr);
   auto const child   = input.get_sliced_child(stream);
 
-  auto const sorted_child_table = stable_segmented_sort_by_key(table_view{{child}},
-                                                               table_view{{child}},
-                                                               output_offset->view(),
-                                                               {column_order},
-                                                               {null_precedence},
-                                                               stream,
-                                                               mr);
+  auto const sorted_child_table = cudf::detail::stable_segmented_sort_by_key(table_view{{child}},
+                                                                             table_view{{child}},
+                                                                             output_offset->view(),
+                                                                             {column_order},
+                                                                             {null_precedence},
+                                                                             stream,
+                                                                             mr);
 
   return make_lists_column(input.size(),
                            std::move(output_offset),
diff --git a/cpp/src/replace/clamp.cu b/cpp/src/replace/clamp.cu
index 2b48aed2d29..950cb484ddf 100644
--- a/cpp/src/replace/clamp.cu
+++ b/cpp/src/replace/clamp.cu
@@ -163,7 +163,9 @@ std::enable_if_t<cudf::is_fixed_width<T>(), std::unique_ptr<cudf::column>> clamp
   auto output =
     detail::allocate_like(input, input.size(), mask_allocation_policy::NEVER, stream, mr);
   // mask will not change
-  if (input.nullable()) { output->set_null_mask(copy_bitmask(input), input.null_count()); }
+  if (input.nullable()) {
+    output->set_null_mask(cudf::detail::copy_bitmask(input, stream, mr), input.null_count());
+  }
 
   auto output_device_view =
     cudf::mutable_column_device_view::create(output->mutable_view(), stream);
diff --git a/cpp/src/sort/is_sorted.cu b/cpp/src/sort/is_sorted.cu
index 25c594e9e74..39476a2f534 100644
--- a/cpp/src/sort/is_sorted.cu
+++ b/cpp/src/sort/is_sorted.cu
@@ -73,7 +73,8 @@ bool is_sorted(cudf::table_view const& in,
 
 bool is_sorted(cudf::table_view const& in,
                std::vector<order> const& column_order,
-               std::vector<null_order> const& null_precedence)
+               std::vector<null_order> const& null_precedence,
+               rmm::cuda_stream_view stream)
 {
   CUDF_FUNC_RANGE();
   if (in.num_columns() == 0 || in.num_rows() == 0) { return true; }
@@ -89,7 +90,7 @@ bool is_sorted(cudf::table_view const& in,
       "Number of columns in the table doesn't match the vector null_precedence's size .\n");
   }
 
-  return detail::is_sorted(in, column_order, null_precedence, cudf::get_default_stream());
+  return detail::is_sorted(in, column_order, null_precedence, stream);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/rank.cu b/cpp/src/sort/rank.cu
index fd65e38d467..3ead8cfcbaa 100644
--- a/cpp/src/sort/rank.cu
+++ b/cpp/src/sort/rank.cu
@@ -366,16 +366,11 @@ std::unique_ptr<column> rank(column_view const& input,
                              null_policy null_handling,
                              null_order null_precedence,
                              bool percentage,
+                             rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::rank(input,
-                      method,
-                      column_order,
-                      null_handling,
-                      null_precedence,
-                      percentage,
-                      cudf::get_default_stream(),
-                      mr);
+  return detail::rank(
+    input, method, column_order, null_handling, null_precedence, percentage, stream, mr);
 }
 }  // namespace cudf
diff --git a/cpp/src/sort/segmented_sort.cu b/cpp/src/sort/segmented_sort.cu
index 38d008c120c..d9457341bd2 100644
--- a/cpp/src/sort/segmented_sort.cu
+++ b/cpp/src/sort/segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -81,11 +81,12 @@ std::unique_ptr<column> segmented_sorted_order(table_view const& keys,
                                                column_view const& segment_offsets,
                                                std::vector<order> const& column_order,
                                                std::vector<null_order> const& null_precedence,
+                                               rmm::cuda_stream_view stream,
                                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_sorted_order(
-    keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
@@ -93,11 +94,12 @@ std::unique_ptr<table> segmented_sort_by_key(table_view const& values,
                                              column_view const& segment_offsets,
                                              std::vector<order> const& column_order,
                                              std::vector<null_order> const& null_precedence,
+                                             rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::segmented_sort_by_key(
-    values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    values, keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/segmented_sort_impl.cuh b/cpp/src/sort/segmented_sort_impl.cuh
index 37664f33762..5d11bf055f1 100644
--- a/cpp/src/sort/segmented_sort_impl.cuh
+++ b/cpp/src/sort/segmented_sort_impl.cuh
@@ -166,7 +166,7 @@ std::unique_ptr<column> fast_segmented_sorted_order(column_view const& input,
   // Unfortunately, CUB's segmented sort functions cannot accept iterators.
   // We have to build a pre-filled sequence of indices as input.
   auto sorted_indices =
-    cudf::detail::sequence(input.size(), numeric_scalar<size_type>{0}, stream, mr);
+    cudf::detail::sequence(input.size(), numeric_scalar<size_type>{0, true, stream}, stream, mr);
   auto indices_view = sorted_indices->mutable_view();
 
   cudf::type_dispatcher<dispatch_storage_type>(input.type(),
diff --git a/cpp/src/sort/sort.cu b/cpp/src/sort/sort.cu
index 25b95af4f83..46edae798d4 100644
--- a/cpp/src/sort/sort.cu
+++ b/cpp/src/sort/sort.cu
@@ -109,30 +109,32 @@ std::unique_ptr<table> sort(table_view const& input,
 std::unique_ptr<column> sorted_order(table_view const& input,
                                      std::vector<order> const& column_order,
                                      std::vector<null_order> const& null_precedence,
+                                     rmm::cuda_stream_view stream,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sorted_order(input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sorted_order(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> sort(table_view const& input,
                             std::vector<order> const& column_order,
                             std::vector<null_order> const& null_precedence,
+                            rmm::cuda_stream_view stream,
                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sort(input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sort(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> sort_by_key(table_view const& values,
                                    table_view const& keys,
                                    std::vector<order> const& column_order,
                                    std::vector<null_order> const& null_precedence,
+                                   rmm::cuda_stream_view stream,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sort_by_key(
-    values, keys, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::sort_by_key(values, keys, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/stable_segmented_sort.cu b/cpp/src/sort/stable_segmented_sort.cu
index 40df1b50279..4725d65e05d 100644
--- a/cpp/src/sort/stable_segmented_sort.cu
+++ b/cpp/src/sort/stable_segmented_sort.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2023, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -55,11 +55,12 @@ std::unique_ptr<column> stable_segmented_sorted_order(
   column_view const& segment_offsets,
   std::vector<order> const& column_order,
   std::vector<null_order> const& null_precedence,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_segmented_sorted_order(
-    keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> stable_segmented_sort_by_key(table_view const& values,
@@ -67,11 +68,12 @@ std::unique_ptr<table> stable_segmented_sort_by_key(table_view const& values,
                                                     column_view const& segment_offsets,
                                                     std::vector<order> const& column_order,
                                                     std::vector<null_order> const& null_precedence,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
   return detail::stable_segmented_sort_by_key(
-    values, keys, segment_offsets, column_order, null_precedence, cudf::get_default_stream(), mr);
+    values, keys, segment_offsets, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/sort/stable_sort.cu b/cpp/src/sort/stable_sort.cu
index 6f5678c4168..cf602dcf1a9 100644
--- a/cpp/src/sort/stable_sort.cu
+++ b/cpp/src/sort/stable_sort.cu
@@ -62,22 +62,22 @@ std::unique_ptr<table> stable_sort_by_key(table_view const& values,
 std::unique_ptr<column> stable_sorted_order(table_view const& input,
                                             std::vector<order> const& column_order,
                                             std::vector<null_order> const& null_precedence,
+                                            rmm::cuda_stream_view stream,
                                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::stable_sorted_order(
-    input, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::stable_sorted_order(input, column_order, null_precedence, stream, mr);
 }
 
 std::unique_ptr<table> stable_sort_by_key(table_view const& values,
                                           table_view const& keys,
                                           std::vector<order> const& column_order,
                                           std::vector<null_order> const& null_precedence,
+                                          rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::stable_sort_by_key(
-    values, keys, column_order, null_precedence, cudf::get_default_stream(), mr);
+  return detail::stable_sort_by_key(values, keys, column_order, null_precedence, stream, mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/text/generate_ngrams.cu b/cpp/src/text/generate_ngrams.cu
index 938fd45246d..5f2f4d021a4 100644
--- a/cpp/src/text/generate_ngrams.cu
+++ b/cpp/src/text/generate_ngrams.cu
@@ -150,10 +150,11 @@ std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& s
 std::unique_ptr<cudf::column> generate_ngrams(cudf::strings_column_view const& strings,
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_ngrams(strings, ngrams, separator, cudf::get_default_stream(), mr);
+  return detail::generate_ngrams(strings, ngrams, separator, stream, mr);
 }
 
 namespace detail {
@@ -317,18 +318,20 @@ std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view co
 
 std::unique_ptr<cudf::column> generate_character_ngrams(cudf::strings_column_view const& strings,
                                                         cudf::size_type ngrams,
+                                                        rmm::cuda_stream_view stream,
                                                         rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::generate_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::generate_character_ngrams(strings, ngrams, stream, mr);
 }
 
 std::unique_ptr<cudf::column> hash_character_ngrams(cudf::strings_column_view const& strings,
                                                     cudf::size_type ngrams,
+                                                    rmm::cuda_stream_view stream,
                                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::hash_character_ngrams(strings, ngrams, cudf::get_default_stream(), mr);
+  return detail::hash_character_ngrams(strings, ngrams, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/src/text/jaccard.cu b/cpp/src/text/jaccard.cu
index 5b55745c2c7..95324847ea0 100644
--- a/cpp/src/text/jaccard.cu
+++ b/cpp/src/text/jaccard.cu
@@ -107,7 +107,7 @@ rmm::device_uvector<cudf::size_type> compute_unique_counts(cudf::column_view con
  *
  * This is called with a warp per row
  */
-struct sorted_interset_fn {
+struct sorted_intersect_fn {
   cudf::column_device_view const d_input1;
   cudf::column_device_view const d_input2;
   cudf::size_type* d_results;
@@ -151,7 +151,7 @@ rmm::device_uvector<cudf::size_type> compute_intersect_counts(cudf::column_view
   auto const d_input1 = cudf::column_device_view::create(input1, stream);
   auto const d_input2 = cudf::column_device_view::create(input2, stream);
   auto d_results      = rmm::device_uvector<cudf::size_type>(input1.size(), stream);
-  sorted_interset_fn fn{*d_input1, *d_input2, d_results.data()};
+  sorted_intersect_fn fn{*d_input1, *d_input2, d_results.data()};
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::counting_iterator<cudf::size_type>(0),
                      input1.size() * cudf::detail::warp_size,
diff --git a/cpp/src/text/ngrams_tokenize.cu b/cpp/src/text/ngrams_tokenize.cu
index fd1cbf99221..73d85513e95 100644
--- a/cpp/src/text/ngrams_tokenize.cu
+++ b/cpp/src/text/ngrams_tokenize.cu
@@ -265,11 +265,11 @@ std::unique_ptr<cudf::column> ngrams_tokenize(cudf::strings_column_view const& s
                                               cudf::size_type ngrams,
                                               cudf::string_scalar const& delimiter,
                                               cudf::string_scalar const& separator,
+                                              rmm::cuda_stream_view stream,
                                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::ngrams_tokenize(
-    strings, ngrams, delimiter, separator, cudf::get_default_stream(), mr);
+  return detail::ngrams_tokenize(strings, ngrams, delimiter, separator, stream, mr);
 }
 
 }  // namespace nvtext
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index e245bb449a0..8bdb61a48e1 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -621,11 +621,11 @@ ConfigureTest(
   STREAM_IDENTIFICATION_TEST identify_stream_usage/test_default_stream_identification.cu
 )
 
-ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
-ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
-ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_HASHING_TEST streams/hash_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing)
@@ -633,6 +633,8 @@ ConfigureTest(
   STREAM_STRINGS_TEST streams/strings/case_test.cpp streams/strings/find_test.cpp STREAM_MODE
   testing
 )
+ConfigureTest(STREAM_SORTING_TEST streams/sorting_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_TEXT_TEST streams/text/ngrams_test.cpp STREAM_MODE testing)
 
 # ##################################################################################################
 # Install tests ####################################################################################
diff --git a/cpp/tests/streams/sorting_test.cpp b/cpp/tests/streams/sorting_test.cpp
new file mode 100644
index 00000000000..e481f95bded
--- /dev/null
+++ b/cpp/tests/streams/sorting_test.cpp
@@ -0,0 +1,136 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/sorting.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+// Stream-usage tests for the public sorting APIs (built with STREAM_MODE testing, see
+// cpp/tests/CMakeLists.txt). Every call passes cudf::test::get_default_stream() explicitly.
+// Return values are intentionally discarded: only the stream-taking call itself is exercised.
+class SortingTest : public cudf::test::BaseFixture {};
+
+TEST_F(SortingTest, SortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::sorted_order(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::stable_sorted_order(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, IsSorted)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::is_sorted(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, Sort)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+  cudf::table_view const tbl{{column}};
+
+  cudf::sort(tbl, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, SortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{10, 20, 30, 40, 50};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{10, 20, 30, 40, 50};
+  cudf::table_view const keys{{keys_col}};
+
+  cudf::sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{10, 20, 30, 40, 50};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{10, 20, 30, 40, 50};
+  cudf::table_view const keys{{keys_col}};
+
+  cudf::stable_sort_by_key(values, keys, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, Rank)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const column{10, 20, 30, 40, 50};
+
+  cudf::rank(column,
+             cudf::rank_method::AVERAGE,
+             cudf::order::ASCENDING,
+             cudf::null_policy::EXCLUDE,
+             cudf::null_order::AFTER,
+             false,
+             cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, SegmentedSortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{3, 7};
+
+  cudf::segmented_sorted_order(keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSegmentedSortedOrder)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{3, 7};
+
+  cudf::stable_segmented_sorted_order(
+    keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, SegmentedSortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{0, 3, 7, 10};
+
+  cudf::segmented_sort_by_key(
+    values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
+
+TEST_F(SortingTest, StableSegmentedSortByKey)
+{
+  cudf::test::fixed_width_column_wrapper<int32_t> const keys_col{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
+  cudf::table_view const keys{{keys_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const values_col{7, 6, 9, 3, 4, 5, 1, 2, 0, 4};
+  cudf::table_view const values{{values_col}};
+  cudf::test::fixed_width_column_wrapper<int32_t> const segment_offsets{0, 3, 7, 10};
+
+  cudf::stable_segmented_sort_by_key(
+    values, keys, segment_offsets, {}, {}, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/streams/text/ngrams_test.cpp b/cpp/tests/streams/text/ngrams_test.cpp
new file mode 100644
index 00000000000..bce0d2b680b
--- /dev/null
+++ b/cpp/tests/streams/text/ngrams_test.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <nvtext/generate_ngrams.hpp>
+#include <nvtext/ngrams_tokenize.hpp>
+
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/strings_column_view.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+
+// Stream-usage tests for the nvtext n-gram APIs (built with STREAM_MODE testing, see
+// cpp/tests/CMakeLists.txt). The string_scalar arguments and every nvtext call are given
+// cudf::test::get_default_stream() explicitly; return values are intentionally discarded.
+class TextNGramsTest : public cudf::test::BaseFixture {};
+
+TEST_F(TextNGramsTest, GenerateNgrams)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
+  auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
+  nvtext::generate_ngrams(
+    cudf::strings_column_view(input), 3, separator, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, GenerateCharacterNgrams)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the", "fox", "jumped", "over", "thé", "dog"});
+  nvtext::generate_character_ngrams(
+    cudf::strings_column_view(input), 3, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, HashCharacterNgrams)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+  nvtext::hash_character_ngrams(
+    cudf::strings_column_view(input), 5, cudf::test::get_default_stream());
+}
+
+TEST_F(TextNGramsTest, NgramsTokenize)
+{
+  auto const input =
+    cudf::test::strings_column_wrapper({"the quick brown fox", "jumped over the lazy dog."});
+  auto const delimiter = cudf::string_scalar{" ", true, cudf::test::get_default_stream()};
+  auto const separator = cudf::string_scalar{"_", true, cudf::test::get_default_stream()};
+  nvtext::ngrams_tokenize(
+    cudf::strings_column_view(input), 2, delimiter, separator, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/text/ngrams_tests.cpp b/cpp/tests/text/ngrams_tests.cpp
index 323b3eed3e2..7b179588385 100644
--- a/cpp/tests/text/ngrams_tests.cpp
+++ b/cpp/tests/text/ngrams_tests.cpp
@@ -34,18 +34,19 @@ TEST_F(TextGenerateNgramsTest, Ngrams)
 {
   cudf::test::strings_column_wrapper strings{"the", "fox", "jumped", "over", "thé", "dog"};
   cudf::strings_column_view strings_view(strings);
+  auto const separator = cudf::string_scalar("_");
 
   {
     cudf::test::strings_column_wrapper expected{
       "the_fox", "fox_jumped", "jumped_over", "over_thé", "thé_dog"};
-    auto const results = nvtext::generate_ngrams(strings_view);
+    auto const results = nvtext::generate_ngrams(strings_view, 2, separator);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
 
   {
     cudf::test::strings_column_wrapper expected{
       "the_fox_jumped", "fox_jumped_over", "jumped_over_thé", "over_thé_dog"};
-    auto const results = nvtext::generate_ngrams(strings_view, 3);
+    auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
@@ -83,10 +84,11 @@ TEST_F(TextGenerateNgramsTest, NgramsWithNulls)
     h_strings.begin(),
     h_strings.end(),
     thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
+  auto const separator = cudf::string_scalar("_");
 
   cudf::strings_column_view strings_view(strings);
   {
-    auto const results = nvtext::generate_ngrams(strings_view, 3);
+    auto const results = nvtext::generate_ngrams(strings_view, 3, separator);
     cudf::test::strings_column_wrapper expected{
       "the_fox_jumped", "fox_jumped_over", "jumped_over_the", "over_the_dog"};
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
@@ -103,7 +105,10 @@ TEST_F(TextGenerateNgramsTest, Empty)
 {
   auto const zero_size_strings_column = cudf::make_empty_column(cudf::type_id::STRING)->view();
 
-  auto results = nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column));
+  auto const separator = cudf::string_scalar("_");
+
+  auto results =
+    nvtext::generate_ngrams(cudf::strings_column_view(zero_size_strings_column), 2, separator);
   cudf::test::expect_column_empty(results->view());
   results = nvtext::generate_character_ngrams(cudf::strings_column_view(zero_size_strings_column));
   cudf::test::expect_column_empty(results->view());
@@ -112,21 +117,20 @@ TEST_F(TextGenerateNgramsTest, Empty)
 TEST_F(TextGenerateNgramsTest, Errors)
 {
   cudf::test::strings_column_wrapper strings{""};
+  auto const separator = cudf::string_scalar("_");
   // invalid parameter value
-  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1), cudf::logic_error);
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 1, separator),
+               cudf::logic_error);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 1),
                cudf::logic_error);
   // not enough strings to generate ngrams
-  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3), cudf::logic_error);
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings), 3, separator),
+               cudf::logic_error);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings), 3),
                cudf::logic_error);
 
-  std::vector<char const*> h_strings{"", nullptr, "", nullptr};
-  cudf::test::strings_column_wrapper strings_no_tokens(
-    h_strings.begin(),
-    h_strings.end(),
-    thrust::make_transform_iterator(h_strings.begin(), [](auto str) { return str != nullptr; }));
-  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens)),
+  cudf::test::strings_column_wrapper strings_no_tokens({"", "", "", ""}, {1, 0, 1, 0});
+  EXPECT_THROW(nvtext::generate_ngrams(cudf::strings_column_view(strings_no_tokens), 2, separator),
                cudf::logic_error);
   EXPECT_THROW(nvtext::generate_character_ngrams(cudf::strings_column_view(strings_no_tokens)),
                cudf::logic_error);
diff --git a/cpp/tests/text/ngrams_tokenize_tests.cpp b/cpp/tests/text/ngrams_tokenize_tests.cpp
index 5879bec3e64..c6fb886f7e5 100644
--- a/cpp/tests/text/ngrams_tokenize_tests.cpp
+++ b/cpp/tests/text/ngrams_tokenize_tests.cpp
@@ -62,7 +62,7 @@ TEST_F(TextNgramsTokenizeTest, Tokenize)
                                                 "mousé_ate",
                                                 "ate_the",
                                                 "the_cheese"};
-    auto results = nvtext::ngrams_tokenize(strings_view);
+    auto results = nvtext::ngrams_tokenize(strings_view, 2, std::string(), std::string("_"));
     CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
   }
   {
@@ -101,9 +101,10 @@ TEST_F(TextNgramsTokenizeTest, TokenizeOneGram)
 {
   cudf::test::strings_column_wrapper strings{"aaa bbb", "  ccc  ddd  ", "eee"};
   cudf::strings_column_view strings_view(strings);
+  auto const empty = cudf::string_scalar("");
 
   cudf::test::strings_column_wrapper expected{"aaa", "bbb", "ccc", "ddd", "eee"};
-  auto results = nvtext::ngrams_tokenize(strings_view, 1);
+  auto results = nvtext::ngrams_tokenize(strings_view, 1, empty, empty);
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 }
 
@@ -111,7 +112,8 @@ TEST_F(TextNgramsTokenizeTest, TokenizeEmptyTest)
 {
   auto strings = cudf::make_empty_column(cudf::data_type{cudf::type_id::STRING});
   cudf::strings_column_view strings_view(strings->view());
-  auto results = nvtext::ngrams_tokenize(strings_view);
+  auto const empty = cudf::string_scalar("");
+  auto results     = nvtext::ngrams_tokenize(strings_view, 2, empty, empty);
   EXPECT_EQ(results->size(), 0);
   EXPECT_EQ(results->has_nulls(), false);
 }
@@ -120,5 +122,6 @@ TEST_F(TextNgramsTokenizeTest, TokenizeErrorTest)
 {
   cudf::test::strings_column_wrapper strings{"this column intentionally left blank"};
   cudf::strings_column_view strings_view(strings);
-  EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0), cudf::logic_error);
+  auto const empty = cudf::string_scalar("");
+  EXPECT_THROW(nvtext::ngrams_tokenize(strings_view, 0, empty, empty), cudf::logic_error);
 }
diff --git a/dependencies.yaml b/dependencies.yaml
index 376e43094a7..5586f54348c 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -218,6 +218,7 @@ dependencies:
           - libkvikio==23.10.*
       - output_types: conda
         packages:
+          - aws-sdk-cpp<1.11
           - fmt>=9.1.0,<10
           - &gbench benchmark==1.8.0
           - &gtest gtest>=1.13.0
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 1a780cc9e9f..8a3dbe77787 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -1390,10 +1390,21 @@ def _get_numeric_data(self):
         return self[columns]
 
     @_cudf_nvtx_annotate
-    def assign(self, **kwargs):
+    def assign(self, **kwargs: Union[Callable[[Self], Any], Any]):
         """
         Assign columns to DataFrame from keyword arguments.
 
+        Parameters
+        ----------
+        **kwargs: dict mapping string column names to values
+            The value for each key can either be a literal column (or
+            something that can be converted to a column), or
+            a callable of one argument that will be given the
+            dataframe as an argument and should return the new column
+            (without modifying the input argument).
+            Columns are added in-order, so callables can refer to
+            column names constructed in the assignment.
+
         Examples
         --------
         >>> import cudf
@@ -1405,15 +1416,9 @@ def assign(self, **kwargs):
         1  1  4
         2  2  5
         """
-        new_df = cudf.DataFrame(index=self.index.copy())
-        for name, col in self._data.items():
-            if name in kwargs:
-                new_df[name] = kwargs.pop(name)
-            else:
-                new_df._data[name] = col.copy()
-
+        new_df = self.copy(deep=False)
         for k, v in kwargs.items():
-            new_df[k] = v
+            new_df[k] = v(new_df) if callable(v) else v
         return new_df
 
     @classmethod
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index 6180162ecdd..2f531afdeb7 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -1327,6 +1327,25 @@ def test_assign():
     np.testing.assert_equal(gdf2.y.to_numpy(), [2, 3, 4])
 
 
+@pytest.mark.parametrize(
+    "mapping",
+    [
+        {"y": 1, "z": lambda df: df["x"] + df["y"]},
+        {
+            "x": lambda df: df["x"] * 2,
+            "y": lambda df: 2,
+            "z": lambda df: df["x"] / df["y"],
+        },
+    ],
+)
+def test_assign_callable(mapping):
+    df = pd.DataFrame({"x": [1, 2, 3]})
+    cdf = cudf.from_pandas(df)
+    expect = df.assign(**mapping)
+    actual = cdf.assign(**mapping)
+    assert_eq(expect, actual)
+
+
 @pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
 @pytest.mark.parametrize("method", ["murmur3", "md5"])
 @pytest.mark.parametrize("seed", [None, 42])