From df8a561d02a9cf313368b2e3a995c2c477d10151 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 23 Oct 2024 16:46:44 -0700
Subject: [PATCH 01/11] Add functors used by compute_shared_memory_aggs

---
 cpp/src/groupby/hash/single_pass_functors.cuh | 85 +++++++++++++++++++
 1 file changed, 85 insertions(+)
diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 73791b3aa71..e167df2bec4 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -23,6 +23,91 @@
 #include <cuco/static_set_ref.cuh>
 
 namespace cudf::groupby::detail::hash {
+// TODO: TO BE REMOVED
+template <typename T, cudf::aggregation::Kind k>
+__device__ constexpr bool is_supported()
+{
+  return cudf::is_fixed_width<T>() and
+         ((k == cudf::aggregation::SUM) or (k == cudf::aggregation::SUM_OF_SQUARES) or
+          (k == cudf::aggregation::MIN) or (k == cudf::aggregation::MAX) or
+          (k == cudf::aggregation::COUNT_VALID) or (k == cudf::aggregation::COUNT_ALL) or
+          (k == cudf::aggregation::ARGMIN) or (k == cudf::aggregation::ARGMAX) or
+          (k == cudf::aggregation::STD) or (k == cudf::aggregation::VARIANCE) or
+          (k == cudf::aggregation::PRODUCT) and cudf::detail::is_product_supported<T>());
+}
+
+template <typename T, cudf::aggregation::Kind k>
+__device__ std::enable_if_t<not std::is_same_v<cudf::detail::corresponding_operator_t<k>, void>, T>
+identity_from_operator()
+{
+  using DeviceType = cudf::device_storage_type_t<T>;
+  return cudf::detail::corresponding_operator_t<k>::template identity<DeviceType>();
+}
+
+template <typename T, cudf::aggregation::Kind k, typename Enable = void>
+__device__ std::enable_if_t<std::is_same_v<cudf::detail::corresponding_operator_t<k>, void>, T>
+identity_from_operator()
+{
+  CUDF_UNREACHABLE("Unable to get identity/sentinel from device operator");
+}
+
+template <typename T, cudf::aggregation::Kind k>
+__device__ T get_identity()
+{
+  if ((k == cudf::aggregation::ARGMAX) || (k == cudf::aggregation::ARGMIN)) {
+    if constexpr (cudf::is_timestamp<T>())
+      return k == cudf::aggregation::ARGMAX
+               ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)}
+               : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)};
+    else {
+      using DeviceType = cudf::device_storage_type_t<T>;
+      return k == cudf::aggregation::ARGMAX
+               ? static_cast<DeviceType>(cudf::detail::ARGMAX_SENTINEL)
+               : static_cast<DeviceType>(cudf::detail::ARGMIN_SENTINEL);
+    }
+  }
+  return identity_from_operator<T, k>();
+}
+
+template <typename Target, cudf::aggregation::Kind k, typename Enable = void>
+struct initialize_target_element {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type idx) const noexcept
+  {
+    CUDF_UNREACHABLE("Invalid source type and aggregation combination.");
+  }
+};
+
+template <typename Target, cudf::aggregation::Kind k>
+struct initialize_target_element<Target, k, std::enable_if_t<is_supported<Target, k>()>> {
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type idx) const noexcept
+  {
+    using DeviceType          = cudf::device_storage_type_t<Target>;
+    DeviceType* target_casted = reinterpret_cast<DeviceType*>(target);
+
+    target_casted[idx] = get_identity<DeviceType, k>();
+
+    if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) {
+      target_mask[idx] = true;
+    } else {
+      target_mask[idx] = false;
+    }
+  }
+};
+
+struct initialize_shmem {
+  template <typename Target, cudf::aggregation::Kind k>
+  __device__ void operator()(cuda::std::byte* target,
+                             bool* target_mask,
+                             cudf::size_type idx) const noexcept
+  {
+    initialize_target_element<Target, k>{}(target, target_mask, idx);
+  }
+};
+
 /**
  * @brief Computes single-pass aggregations and store results into a sparse `output_values` table,
  * and populate `set` with indices of unique keys

From ec5b70504a4e04aa4e771b107726337142a258ca Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 23 Oct 2024 16:47:06 -0700
Subject: [PATCH 02/11] Add compute_shared_memory_aggs

---
 cpp/CMakeLists.txt                            |   1 +
 .../hash/compute_shared_memory_aggs.cu        | 321 ++++++++++++++++++
 .../hash/compute_shared_memory_aggs.hpp       |  42 +++
 3 files changed, 364 insertions(+)
 create mode 100644 cpp/src/groupby/hash/compute_shared_memory_aggs.cu
 create mode 100644 cpp/src/groupby/hash/compute_shared_memory_aggs.hpp

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
index e4b9cbf8921..68e15fbeb06 100644
--- a/cpp/CMakeLists.txt
+++ b/cpp/CMakeLists.txt
@@ -369,6 +369,7 @@ add_library(
   src/filling/sequence.cu
   src/groupby/groupby.cu
   src/groupby/hash/compute_groupby.cu
+  src/groupby/hash/compute_shared_memory_aggs.cu
   src/groupby/hash/compute_single_pass_aggs.cu
   src/groupby/hash/create_sparse_results_table.cu
   src/groupby/hash/flatten_single_pass_aggs.cpp
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
new file mode 100644
index 00000000000..3e961fa1d76
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -0,0 +1,321 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "compute_shared_memory_aggs.hpp"
+#include "global_memory_aggregator.cuh"
+#include "helpers.cuh"
+#include "shared_memory_aggregator.cuh"
+#include "single_pass_functors.cuh"
+
+#include <cudf/aggregation.hpp>
+#include <cudf/detail/utilities/cuda.cuh>
+#include <cudf/detail/utilities/cuda.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+#include <cudf/utilities/bit.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+#include <cooperative_groups.h>
+#include <cuda/std/cstddef>
+
+namespace cudf::groupby::detail::hash {
+namespace {
+/// Functor used by type dispatcher returning the size of the underlying C++ type
+struct size_of_functor {
+  template <typename T>
+  __device__ constexpr cudf::size_type operator()()
+  {
+    return sizeof(T);
+  }
+};
+
+// Prepares shared memory data required by each output column, exits if
+// no enough memory space to perform the shared memory aggregation for the
+// current output column
+__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
+                                               cudf::size_type& col_end,
+                                               cudf::mutable_table_device_view output_values,
+                                               cudf::size_type output_size,
+                                               cudf::size_type* shmem_agg_res_offsets,
+                                               cudf::size_type* shmem_agg_mask_offsets,
+                                               cudf::size_type cardinality,
+                                               cudf::size_type total_agg_size)
+{
+  col_start                       = col_end;
+  cudf::size_type bytes_allocated = 0;
+
+  auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality);
+
+  while (bytes_allocated < total_agg_size && col_end < output_size) {
+    auto const col_idx = col_end;
+    auto const next_col_size =
+      round_to_multiple_of_8(cudf::type_dispatcher<cudf::dispatch_storage_type>(
+                               output_values.column(col_idx).type(), size_of_functor{}) *
+                             cardinality);
+    auto const next_col_total_size = next_col_size + valid_col_size;
+
+    if (bytes_allocated + next_col_total_size > total_agg_size) {
+      CUDF_UNREACHABLE("No enough memory space for shared memory aggregations");
+    }
+
+    shmem_agg_res_offsets[col_end]  = bytes_allocated;
+    shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size;
+
+    bytes_allocated += next_col_total_size;
+    ++col_end;
+  }
+}
+
+// Each block initialize its own shared memory aggregation results
+__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block,
+                                              cudf::size_type col_start,
+                                              cudf::size_type col_end,
+                                              cudf::mutable_table_device_view output_values,
+                                              cuda::std::byte* shmem_agg_storage,
+                                              cudf::size_type* shmem_agg_res_offsets,
+                                              cudf::size_type* shmem_agg_mask_offsets,
+                                              cudf::size_type cardinality,
+                                              cudf::aggregation::Kind const* d_agg_kinds)
+{
+  for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
+    for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
+      cuda::std::byte* target =
+        reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+      bool* target_mask =
+        reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
+      cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(),
+                                                  d_agg_kinds[col_idx],
+                                                  initialize_shmem{},
+                                                  target,
+                                                  target_mask,
+                                                  idx);
+    }
+  }
+  block.sync();
+}
+
+__device__ void compute_pre_aggregrations(cudf::size_type col_start,
+                                          cudf::size_type col_end,
+                                          bitmask_type const* row_bitmask,
+                                          bool skip_rows_with_nulls,
+                                          cudf::table_device_view source,
+                                          cudf::size_type num_input_rows,
+                                          cudf::size_type* local_mapping_index,
+                                          cuda::std::byte* shmem_agg_storage,
+                                          cudf::size_type* shmem_agg_res_offsets,
+                                          cudf::size_type* shmem_agg_mask_offsets,
+                                          cudf::aggregation::Kind const* d_agg_kinds)
+{
+  // Aggregates global memory sources to shared memory targets
+  for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows;
+       source_idx += cudf::detail::grid_1d::grid_stride()) {
+    if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) {
+      auto const target_idx = local_mapping_index[source_idx];
+      for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
+        auto const source_col = source.column(col_idx);
+
+        cuda::std::byte* target =
+          reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
+        bool* target_mask =
+          reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
+
+        cudf::detail::dispatch_type_and_aggregation(source_col.type(),
+                                                    d_agg_kinds[col_idx],
+                                                    shmem_element_aggregator{},
+                                                    target,
+                                                    target_mask,
+                                                    target_idx,
+                                                    source_col,
+                                                    source_idx);
+      }
+    }
+  }
+}
+
+__device__ void compute_final_aggregations(cooperative_groups::thread_block const& block,
+                                           cudf::size_type col_start,
+                                           cudf::size_type col_end,
+                                           cudf::table_device_view input_values,
+                                           cudf::mutable_table_device_view target,
+                                           cudf::size_type cardinality,
+                                           cudf::size_type* global_mapping_index,
+                                           cuda::std::byte* shmem_agg_storage,
+                                           cudf::size_type* agg_res_offsets,
+                                           cudf::size_type* agg_mask_offsets,
+                                           cudf::aggregation::Kind const* d_agg_kinds)
+{
+  // Aggregates shared memory sources to global memory targets
+  for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
+    auto const target_idx =
+      global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx];
+    for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
+      auto target_col = target.column(col_idx);
+
+      cuda::std::byte* source =
+        reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + agg_res_offsets[col_idx]);
+      bool* source_mask = reinterpret_cast<bool*>(shmem_agg_storage + agg_mask_offsets[col_idx]);
+
+      cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(),
+                                                  d_agg_kinds[col_idx],
+                                                  gmem_element_aggregator{},
+                                                  target_col,
+                                                  target_idx,
+                                                  input_values.column(col_idx),
+                                                  source,
+                                                  source_mask,
+                                                  idx);
+    }
+  }
+  block.sync();
+}
+
+/* Takes the local_mapping_index and global_mapping_index to compute
+ * pre (shared) and final (global) aggregates*/
+CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
+                                               bitmask_type const* row_bitmask,
+                                               bool skip_rows_with_nulls,
+                                               cudf::size_type* local_mapping_index,
+                                               cudf::size_type* global_mapping_index,
+                                               cudf::size_type* block_cardinality,
+                                               cudf::table_device_view input_values,
+                                               cudf::mutable_table_device_view output_values,
+                                               cudf::aggregation::Kind const* d_agg_kinds,
+                                               cudf::size_type total_agg_size,
+                                               cudf::size_type offsets_size)
+{
+  auto const block       = cooperative_groups::this_thread_block();
+  auto const cardinality = block_cardinality[block.group_index().x];
+  if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; }
+
+  auto const num_cols = output_values.num_columns();
+
+  __shared__ cudf::size_type col_start;
+  __shared__ cudf::size_type col_end;
+  extern __shared__ cuda::std::byte shmem_agg_storage[];
+
+  cudf::size_type* shmem_agg_res_offsets =
+    reinterpret_cast<cudf::size_type*>(shmem_agg_storage + total_agg_size);
+  cudf::size_type* shmem_agg_mask_offsets =
+    reinterpret_cast<cudf::size_type*>(shmem_agg_storage + total_agg_size + offsets_size);
+
+  if (block.thread_rank() == 0) {
+    col_start = 0;
+    col_end   = 0;
+  }
+  block.sync();
+
+  while (col_end < num_cols) {
+    if (block.thread_rank() == 0) {
+      calculate_columns_to_aggregate(col_start,
+                                     col_end,
+                                     output_values,
+                                     num_cols,
+                                     shmem_agg_res_offsets,
+                                     shmem_agg_mask_offsets,
+                                     cardinality,
+                                     total_agg_size);
+    }
+    block.sync();
+
+    initialize_shmem_aggregations(block,
+                                  col_start,
+                                  col_end,
+                                  output_values,
+                                  shmem_agg_storage,
+                                  shmem_agg_res_offsets,
+                                  shmem_agg_mask_offsets,
+                                  cardinality,
+                                  d_agg_kinds);
+
+    compute_pre_aggregrations(col_start,
+                              col_end,
+                              row_bitmask,
+                              skip_rows_with_nulls,
+                              input_values,
+                              num_rows,
+                              local_mapping_index,
+                              shmem_agg_storage,
+                              shmem_agg_res_offsets,
+                              shmem_agg_mask_offsets,
+                              d_agg_kinds);
+    block.sync();
+
+    compute_final_aggregations(block,
+                               col_start,
+                               col_end,
+                               input_values,
+                               output_values,
+                               cardinality,
+                               global_mapping_index,
+                               shmem_agg_storage,
+                               shmem_agg_res_offsets,
+                               shmem_agg_mask_offsets,
+                               d_agg_kinds);
+  }
+}
+
+constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; }
+
+}  // namespace
+
+size_t available_shared_memory_size(cudf::size_type grid_size)
+{
+  auto const active_blocks_per_sm =
+    cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
+
+  size_t dynamic_shmem_size = 0;
+  CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock(
+    &dynamic_shmem_size, single_pass_shmem_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE));
+  return get_previous_multiple_of_8(0.5 * dynamic_shmem_size);
+}
+
+size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; }
+
+void compute_shared_memory_aggs(cudf::size_type grid_size,
+                                cudf::size_type num_input_rows,
+                                bitmask_type const* row_bitmask,
+                                bool skip_rows_with_nulls,
+                                cudf::size_type* local_mapping_index,
+                                cudf::size_type* global_mapping_index,
+                                cudf::size_type* block_cardinality,
+                                cudf::table_device_view input_values,
+                                cudf::mutable_table_device_view output_values,
+                                cudf::aggregation::Kind const* d_agg_kinds,
+                                rmm::cuda_stream_view stream)
+{
+  auto const shmem_size = available_shared_memory_size(grid_size);
+  // For each aggregation, need one offset determining where the aggregation is
+  // performed, another indicating the validity of the aggregation
+  auto const offsets_size = shmem_offsets_size(output_values.num_columns());
+  // The rest of shmem is utilized for the actual arrays in shmem
+  CUDF_EXPECTS(shmem_size > offsets_size * 2, "No enough space for shared memory aggregations");
+  auto const shmem_agg_size = shmem_size - offsets_size * 2;
+  single_pass_shmem_aggs_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, shmem_size, stream>>>(
+    num_input_rows,
+    row_bitmask,
+    skip_rows_with_nulls,
+    local_mapping_index,
+    global_mapping_index,
+    block_cardinality,
+    input_values,
+    output_values,
+    d_agg_kinds,
+    shmem_agg_size,
+    offsets_size);
+}
+}  // namespace cudf::groupby::detail::hash
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
new file mode 100644
index 00000000000..7dc2b448a60
--- /dev/null
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2024, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <cudf/aggregation.hpp>
+#include <cudf/table/table_device_view.cuh>
+#include <cudf/types.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+
+namespace cudf::groupby::detail::hash {
+
+size_t available_shared_memory_size(cudf::size_type grid_size);
+
+size_t shmem_offsets_size(cudf::size_type num_cols);
+
+void compute_shared_memory_aggs(cudf::size_type grid_size,
+                                cudf::size_type num_input_rows,
+                                bitmask_type const* row_bitmask,
+                                bool skip_rows_with_nulls,
+                                cudf::size_type* local_mapping_index,
+                                cudf::size_type* global_mapping_index,
+                                cudf::size_type* block_cardinality,
+                                cudf::table_device_view input_values,
+                                cudf::mutable_table_device_view output_values,
+                                cudf::aggregation::Kind const* d_agg_kinds,
+                                rmm::cuda_stream_view stream);
+
+}  // namespace cudf::groupby::detail::hash

From 5e85f23a4c0e93c02304497d71bfea25fcd5b888 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Thu, 24 Oct 2024 10:38:08 -0700
Subject: [PATCH 03/11] Update comments

---
 cpp/src/groupby/hash/single_pass_functors.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index e167df2bec4..1ca387358b8 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -23,7 +23,7 @@
 #include <cuco/static_set_ref.cuh>
 
 namespace cudf::groupby::detail::hash {
-// TODO: TO BE REMOVED
+// TODO: TO BE REMOVED issue tracked via #17171
 template <typename T, cudf::aggregation::Kind k>
 __device__ constexpr bool is_supported()
 {

From 00ed4b28436d9abf84ced95992cea0d3d7ffb3f3 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 25 Oct 2024 11:32:08 -0700
Subject: [PATCH 04/11] Update
 cpp/src/groupby/hash/compute_shared_memory_aggs.cu

Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com>
---
 cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index 3e961fa1d76..9e03fef2a56 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -70,7 +70,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
     auto const next_col_total_size = next_col_size + valid_col_size;
 
     if (bytes_allocated + next_col_total_size > total_agg_size) {
-      CUDF_UNREACHABLE("No enough memory space for shared memory aggregations");
+      CUDF_UNREACHABLE("Not enough memory for shared memory aggregations");
     }
 
     shmem_agg_res_offsets[col_end]  = bytes_allocated;

From 8c3f1923924d21e81cc76d015a7d26354b4a74b3 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 25 Oct 2024 13:21:32 -0700
Subject: [PATCH 05/11] Leverage existing utilities to eliminate duplication

---
 .../hash/compute_shared_memory_aggs.cu        | 19 +++++++++++--------
 cpp/src/groupby/hash/helpers.cuh              |  9 ---------
 2 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index 9e03fef2a56..9c0044ae7f0 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -44,6 +44,9 @@ struct size_of_functor {
   }
 };
 
+/// Shared memory data alignment
+CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8;
+
 // Prepares shared memory data required by each output column, exits if
 // no enough memory space to perform the shared memory aggregation for the
 // current output column
@@ -59,14 +62,16 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start,
   col_start                       = col_end;
   cudf::size_type bytes_allocated = 0;
 
-  auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality);
+  auto const valid_col_size =
+    cudf::util::round_up_safe(static_cast<cudf::size_type>(sizeof(bool) * cardinality), ALIGNMENT);
 
   while (bytes_allocated < total_agg_size && col_end < output_size) {
     auto const col_idx = col_end;
     auto const next_col_size =
-      round_to_multiple_of_8(cudf::type_dispatcher<cudf::dispatch_storage_type>(
-                               output_values.column(col_idx).type(), size_of_functor{}) *
-                             cardinality);
+      cudf::util::round_up_safe(cudf::type_dispatcher<cudf::dispatch_storage_type>(
+                                  output_values.column(col_idx).type(), size_of_functor{}) *
+                                  cardinality,
+                                ALIGNMENT);
     auto const next_col_total_size = next_col_size + valid_col_size;
 
     if (bytes_allocated + next_col_total_size > total_agg_size) {
@@ -268,9 +273,6 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
                                d_agg_kinds);
   }
 }
-
-constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; }
-
 }  // namespace
 
 size_t available_shared_memory_size(cudf::size_type grid_size)
@@ -281,7 +283,8 @@ size_t available_shared_memory_size(cudf::size_type grid_size)
   size_t dynamic_shmem_size = 0;
   CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock(
     &dynamic_shmem_size, single_pass_shmem_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE));
-  return get_previous_multiple_of_8(0.5 * dynamic_shmem_size);
+  return cudf::util::round_down_safe(static_cast<cudf::size_type>(0.5 * dynamic_shmem_size),
+                                     ALIGNMENT);
 }
 
 size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; }
diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh
index 0d117ca35b3..00836567b4f 100644
--- a/cpp/src/groupby/hash/helpers.cuh
+++ b/cpp/src/groupby/hash/helpers.cuh
@@ -54,15 +54,6 @@ using shmem_extent_t =
 CUDF_HOST_DEVICE auto constexpr window_extent =
   cuco::make_window_extent<GROUPBY_CG_SIZE, GROUPBY_WINDOW_SIZE>(shmem_extent_t{});
 
-/**
- * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer.
- */
-CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num)
-{
-  std::size_t constexpr base = 8;
-  return cudf::util::div_rounding_up_safe(num, base) * base;
-}
-
 using row_hash_t =
   cudf::experimental::row::hash::device_row_hasher<cudf::hashing::detail::default_hash,
                                                    cudf::nullate::DYNAMIC>;

From f0d9c5a0c35c9d13435ca6b22538bceb877535cc Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 25 Oct 2024 13:28:36 -0700
Subject: [PATCH 06/11] Minor cleanup

---
 cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index 9c0044ae7f0..3fb4518a7ce 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -287,8 +287,6 @@ size_t available_shared_memory_size(cudf::size_type grid_size)
                                      ALIGNMENT);
 }
 
-size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; }
-
 void compute_shared_memory_aggs(cudf::size_type grid_size,
                                 cudf::size_type num_input_rows,
                                 bitmask_type const* row_bitmask,
@@ -304,7 +302,7 @@ void compute_shared_memory_aggs(cudf::size_type grid_size,
   auto const shmem_size = available_shared_memory_size(grid_size);
   // For each aggregation, need one offset determining where the aggregation is
   // performed, another indicating the validity of the aggregation
-  auto const offsets_size = shmem_offsets_size(output_values.num_columns());
+  auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type);
   // The rest of shmem is utilized for the actual arrays in shmem
   CUDF_EXPECTS(shmem_size > offsets_size * 2, "No enough space for shared memory aggregations");
   auto const shmem_agg_size = shmem_size - offsets_size * 2;
@@ -319,6 +317,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size,
     output_values,
     d_agg_kinds,
     shmem_agg_size,
-    offsets_size);
+    shmem_offsets_size);
 }
 }  // namespace cudf::groupby::detail::hash

From 30decde06b551225306e29e6ca9b6f7984efd934 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 25 Oct 2024 13:31:48 -0700
Subject: [PATCH 07/11] Fix typos

---
 cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index 3fb4518a7ce..5e42042b5ae 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -304,8 +304,9 @@ void compute_shared_memory_aggs(cudf::size_type grid_size,
   // performed, another indicating the validity of the aggregation
   auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type);
   // The rest of shmem is utilized for the actual arrays in shmem
-  CUDF_EXPECTS(shmem_size > offsets_size * 2, "No enough space for shared memory aggregations");
-  auto const shmem_agg_size = shmem_size - offsets_size * 2;
+  CUDF_EXPECTS(shmem_size > shmem_offsets_size * 2,
+               "No enough space for shared memory aggregations");
+  auto const shmem_agg_size = shmem_size - shmem_offsets_size * 2;
   single_pass_shmem_aggs_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, shmem_size, stream>>>(
     num_input_rows,
     row_bitmask,

From f11777439580116c1252bf6d6743e932faab3138 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Fri, 25 Oct 2024 13:38:28 -0700
Subject: [PATCH 08/11] Pass available_shmem_size as argument to avoid
 redundant invocation

---
 cpp/src/groupby/hash/compute_shared_memory_aggs.cu  | 10 +++++-----
 cpp/src/groupby/hash/compute_shared_memory_aggs.hpp |  5 ++---
 2 files changed, 7 insertions(+), 8 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index 5e42042b5ae..ca43613db06 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -275,7 +275,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows,
 }
 }  // namespace
 
-size_t available_shared_memory_size(cudf::size_type grid_size)
+std::size_t available_shared_memory_size(cudf::size_type grid_size)
 {
   auto const active_blocks_per_sm =
     cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors());
@@ -288,6 +288,7 @@ size_t available_shared_memory_size(cudf::size_type grid_size)
 }
 
 void compute_shared_memory_aggs(cudf::size_type grid_size,
+                                std::size_t available_shmem_size,
                                 cudf::size_type num_input_rows,
                                 bitmask_type const* row_bitmask,
                                 bool skip_rows_with_nulls,
@@ -299,15 +300,14 @@ void compute_shared_memory_aggs(cudf::size_type grid_size,
                                 cudf::aggregation::Kind const* d_agg_kinds,
                                 rmm::cuda_stream_view stream)
 {
-  auto const shmem_size = available_shared_memory_size(grid_size);
   // For each aggregation, need one offset determining where the aggregation is
   // performed, another indicating the validity of the aggregation
   auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type);
   // The rest of shmem is utilized for the actual arrays in shmem
-  CUDF_EXPECTS(shmem_size > shmem_offsets_size * 2,
+  CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2,
                "No enough space for shared memory aggregations");
-  auto const shmem_agg_size = shmem_size - shmem_offsets_size * 2;
-  single_pass_shmem_aggs_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, shmem_size, stream>>>(
+  auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2;
+  single_pass_shmem_aggs_kernel<<<grid_size, GROUPBY_BLOCK_SIZE, available_shmem_size, stream>>>(
     num_input_rows,
     row_bitmask,
     skip_rows_with_nulls,
diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
index 7dc2b448a60..653821fd53b 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp
@@ -23,11 +23,10 @@
 
 namespace cudf::groupby::detail::hash {
 
-size_t available_shared_memory_size(cudf::size_type grid_size);
-
-size_t shmem_offsets_size(cudf::size_type num_cols);
+std::size_t available_shared_memory_size(cudf::size_type grid_size);
 
 void compute_shared_memory_aggs(cudf::size_type grid_size,
+                                std::size_t available_shmem_size,
                                 cudf::size_type num_input_rows,
                                 bitmask_type const* row_bitmask,
                                 bool skip_rows_with_nulls,

From 8afef8ee985ae5e64b1542e05d4ede5416a86f5a Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 30 Oct 2024 14:09:05 -0700
Subject: [PATCH 09/11] Apply suggestions from code review

Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com>
Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com>
---
 cpp/src/groupby/hash/compute_shared_memory_aggs.cu |  4 ++--
 cpp/src/groupby/hash/single_pass_functors.cuh      | 10 +++-------
 2 files changed, 5 insertions(+), 9 deletions(-)

diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
index ca43613db06..12c02a1865e 100644
--- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
+++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu
@@ -99,9 +99,9 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c
 {
   for (auto col_idx = col_start; col_idx < col_end; col_idx++) {
     for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) {
-      cuda::std::byte* target =
+      auto target =
         reinterpret_cast<cuda::std::byte*>(shmem_agg_storage + shmem_agg_res_offsets[col_idx]);
-      bool* target_mask =
+      auto target_mask =
         reinterpret_cast<bool*>(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]);
       cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(),
                                                   d_agg_kinds[col_idx],
diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 1ca387358b8..5238ac75541 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -55,11 +55,11 @@ template <typename T, cudf::aggregation::Kind k>
 __device__ T get_identity()
 {
   if ((k == cudf::aggregation::ARGMAX) || (k == cudf::aggregation::ARGMIN)) {
-    if constexpr (cudf::is_timestamp<T>())
+    if constexpr (cudf::is_timestamp<T>()) {
       return k == cudf::aggregation::ARGMAX
                ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)}
                : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)};
-    else {
+    } else {
       using DeviceType = cudf::device_storage_type_t<T>;
       return k == cudf::aggregation::ARGMAX
                ? static_cast<DeviceType>(cudf::detail::ARGMAX_SENTINEL)
@@ -90,11 +90,7 @@ struct initialize_target_element<Target, k, std::enable_if_t<is_supported<Target
 
     target_casted[idx] = get_identity<DeviceType, k>();
 
-    if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) {
-      target_mask[idx] = true;
-    } else {
-      target_mask[idx] = false;
-    }
+    target_mask[idx] =  (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID);
   }
 };
 

From 73590710c586a58e01fab6c09a133b941ad6d95b Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 30 Oct 2024 14:11:07 -0700
Subject: [PATCH 10/11] Formatting

---
 cpp/src/groupby/hash/single_pass_functors.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 5238ac75541..56e2a646ad8 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -90,7 +90,7 @@ struct initialize_target_element<Target, k, std::enable_if_t<is_supported<Target
 
     target_casted[idx] = get_identity<DeviceType, k>();
 
-    target_mask[idx] =  (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID);
+    target_mask[idx] = (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID);
   }
 };
 

From 7f11406ffbf800760afeb9c8a69ad68aced8755b Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Wed, 30 Oct 2024 15:07:58 -0700
Subject: [PATCH 11/11] Apply suggestions from code review

Co-authored-by: Bradley Dice <bdice@bradleydice.com>
---
 cpp/src/groupby/hash/single_pass_functors.cuh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh
index 56e2a646ad8..28a5b578e00 100644
--- a/cpp/src/groupby/hash/single_pass_functors.cuh
+++ b/cpp/src/groupby/hash/single_pass_functors.cuh
@@ -54,7 +54,7 @@ identity_from_operator()
 template <typename T, cudf::aggregation::Kind k>
 __device__ T get_identity()
 {
-  if ((k == cudf::aggregation::ARGMAX) || (k == cudf::aggregation::ARGMIN)) {
+  if ((k == cudf::aggregation::ARGMAX) or (k == cudf::aggregation::ARGMIN)) {
     if constexpr (cudf::is_timestamp<T>()) {
       return k == cudf::aggregation::ARGMAX
                ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)}
@@ -90,7 +90,7 @@ struct initialize_target_element<Target, k, std::enable_if_t<is_supported<Target
 
     target_casted[idx] = get_identity<DeviceType, k>();
 
-    target_mask[idx] = (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID);
+    target_mask[idx] = (k == cudf::aggregation::COUNT_ALL) or (k == cudf::aggregation::COUNT_VALID);
   }
 };