From df8a561d02a9cf313368b2e3a995c2c477d10151 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 16:46:44 -0700 Subject: [PATCH 01/11] Add functors used by compute_shared_memory_aggs --- cpp/src/groupby/hash/single_pass_functors.cuh | 85 +++++++++++++++++++ 1 file changed, 85 insertions(+) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 73791b3aa71..e167df2bec4 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -23,6 +23,91 @@ #include namespace cudf::groupby::detail::hash { +// TODO: TO BE REMOVED +template +__device__ constexpr bool is_supported() +{ + return cudf::is_fixed_width() and + ((k == cudf::aggregation::SUM) or (k == cudf::aggregation::SUM_OF_SQUARES) or + (k == cudf::aggregation::MIN) or (k == cudf::aggregation::MAX) or + (k == cudf::aggregation::COUNT_VALID) or (k == cudf::aggregation::COUNT_ALL) or + (k == cudf::aggregation::ARGMIN) or (k == cudf::aggregation::ARGMAX) or + (k == cudf::aggregation::STD) or (k == cudf::aggregation::VARIANCE) or + (k == cudf::aggregation::PRODUCT) and cudf::detail::is_product_supported()); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + using DeviceType = cudf::device_storage_type_t; + return cudf::detail::corresponding_operator_t::template identity(); +} + +template +__device__ std::enable_if_t, void>, T> +identity_from_operator() +{ + CUDF_UNREACHABLE("Unable to get identity/sentinel from device operator"); +} + +template +__device__ T get_identity() +{ + if ((k == cudf::aggregation::ARGMAX) || (k == cudf::aggregation::ARGMIN)) { + if constexpr (cudf::is_timestamp()) + return k == cudf::aggregation::ARGMAX + ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)} + : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)}; + else { + using DeviceType = cudf::device_storage_type_t; + return k == cudf::aggregation::ARGMAX + ? static_cast(cudf::detail::ARGMAX_SENTINEL) + : static_cast(cudf::detail::ARGMIN_SENTINEL); + } + } + return identity_from_operator(); +} + +template +struct initialize_target_element { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type idx) const noexcept + { + CUDF_UNREACHABLE("Invalid source type and aggregation combination."); + } +}; + +template +struct initialize_target_element()>> { + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type idx) const noexcept + { + using DeviceType = cudf::device_storage_type_t; + DeviceType* target_casted = reinterpret_cast(target); + + target_casted[idx] = get_identity(); + + if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) { + target_mask[idx] = true; + } else { + target_mask[idx] = false; + } + } +}; + +struct initialize_shmem { + template + __device__ void operator()(cuda::std::byte* target, + bool* target_mask, + cudf::size_type idx) const noexcept + { + initialize_target_element{}(target, target_mask, idx); + } +}; + /** * @brief Computes single-pass aggregations and store results into a sparse `output_values` table, * and populate `set` with indices of unique keys From ec5b70504a4e04aa4e771b107726337142a258ca Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 23 Oct 2024 16:47:06 -0700 Subject: [PATCH 02/11] Add compute_shared_memory_aggs --- cpp/CMakeLists.txt | 1 + .../hash/compute_shared_memory_aggs.cu | 321 ++++++++++++++++++ .../hash/compute_shared_memory_aggs.hpp | 42 +++ 3 files changed, 364 insertions(+) create mode 100644 cpp/src/groupby/hash/compute_shared_memory_aggs.cu create mode 100644 cpp/src/groupby/hash/compute_shared_memory_aggs.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index e4b9cbf8921..68e15fbeb06 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,6 +369,7 @@ add_library( src/filling/sequence.cu src/groupby/groupby.cu src/groupby/hash/compute_groupby.cu + src/groupby/hash/compute_shared_memory_aggs.cu src/groupby/hash/compute_single_pass_aggs.cu src/groupby/hash/create_sparse_results_table.cu src/groupby/hash/flatten_single_pass_aggs.cpp diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu new file mode 100644 index 00000000000..3e961fa1d76 --- /dev/null +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "compute_shared_memory_aggs.hpp" +#include "global_memory_aggregator.cuh" +#include "helpers.cuh" +#include "shared_memory_aggregator.cuh" +#include "single_pass_functors.cuh" + +#include +#include +#include +#include +#include +#include +#include + +#include + +#include +#include + +namespace cudf::groupby::detail::hash { +namespace { +/// Functor used by type dispatcher returning the size of the underlying C++ type +struct size_of_functor { + template + __device__ constexpr cudf::size_type operator()() + { + return sizeof(T); + } +}; + +// Prepares shared memory data required by each output column, exits if +// no enough memory space to perform the shared memory aggregation for the +// current output column +__device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, + cudf::size_type& col_end, + cudf::mutable_table_device_view output_values, + cudf::size_type output_size, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, + cudf::size_type cardinality, + cudf::size_type total_agg_size) +{ + col_start = col_end; + cudf::size_type bytes_allocated = 0; + + auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + + while (bytes_allocated < total_agg_size && col_end < output_size) { + auto const col_idx = col_end; + auto const next_col_size = + round_to_multiple_of_8(cudf::type_dispatcher( + output_values.column(col_idx).type(), size_of_functor{}) * + cardinality); + auto const next_col_total_size = next_col_size + valid_col_size; + + if (bytes_allocated + next_col_total_size > total_agg_size) { + CUDF_UNREACHABLE("No enough memory space for shared memory aggregations"); + } + + shmem_agg_res_offsets[col_end] = bytes_allocated; + shmem_agg_mask_offsets[col_end] = bytes_allocated + next_col_size; + + bytes_allocated += next_col_total_size; + ++col_end; + } +} + +// Each block initialize its own shared memory aggregation results +__device__ void initialize_shmem_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, + cudf::size_type col_end, + cudf::mutable_table_device_view output_values, + cuda::std::byte* shmem_agg_storage, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, + cudf::size_type cardinality, + cudf::aggregation::Kind const* d_agg_kinds) +{ + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + cuda::std::byte* target = + reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); + bool* target_mask = + reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); + cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), + d_agg_kinds[col_idx], + initialize_shmem{}, + target, + target_mask, + idx); + } + } + block.sync(); +} + +__device__ void compute_pre_aggregrations(cudf::size_type col_start, + cudf::size_type col_end, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::table_device_view source, + cudf::size_type num_input_rows, + cudf::size_type* local_mapping_index, + cuda::std::byte* shmem_agg_storage, + cudf::size_type* shmem_agg_res_offsets, + cudf::size_type* shmem_agg_mask_offsets, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // Aggregates global memory sources to shared memory targets + for (auto source_idx = cudf::detail::grid_1d::global_thread_id(); source_idx < num_input_rows; + source_idx += cudf::detail::grid_1d::grid_stride()) { + if (not skip_rows_with_nulls or cudf::bit_is_set(row_bitmask, source_idx)) { + auto const target_idx = local_mapping_index[source_idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto const source_col = source.column(col_idx); + + cuda::std::byte* target = + reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); + bool* target_mask = + reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); + + cudf::detail::dispatch_type_and_aggregation(source_col.type(), + d_agg_kinds[col_idx], + shmem_element_aggregator{}, + target, + target_mask, + target_idx, + source_col, + source_idx); + } + } + } +} + +__device__ void compute_final_aggregations(cooperative_groups::thread_block const& block, + cudf::size_type col_start, + cudf::size_type col_end, + cudf::table_device_view input_values, + cudf::mutable_table_device_view target, + cudf::size_type cardinality, + cudf::size_type* global_mapping_index, + cuda::std::byte* shmem_agg_storage, + cudf::size_type* agg_res_offsets, + cudf::size_type* agg_mask_offsets, + cudf::aggregation::Kind const* d_agg_kinds) +{ + // Aggregates shared memory sources to global memory targets + for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { + auto const target_idx = + global_mapping_index[block.group_index().x * GROUPBY_SHM_MAX_ELEMENTS + idx]; + for (auto col_idx = col_start; col_idx < col_end; col_idx++) { + auto target_col = target.column(col_idx); + + cuda::std::byte* source = + reinterpret_cast(shmem_agg_storage + agg_res_offsets[col_idx]); + bool* source_mask = reinterpret_cast(shmem_agg_storage + agg_mask_offsets[col_idx]); + + cudf::detail::dispatch_type_and_aggregation(input_values.column(col_idx).type(), + d_agg_kinds[col_idx], + gmem_element_aggregator{}, + target_col, + target_idx, + input_values.column(col_idx), + source, + source_mask, + idx); + } + } + block.sync(); +} + +/* Takes the local_mapping_index and global_mapping_index to compute + * pre (shared) and final (global) aggregates*/ +CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + cudf::size_type total_agg_size, + cudf::size_type offsets_size) +{ + auto const block = cooperative_groups::this_thread_block(); + auto const cardinality = block_cardinality[block.group_index().x]; + if (cardinality >= GROUPBY_CARDINALITY_THRESHOLD) { return; } + + auto const num_cols = output_values.num_columns(); + + __shared__ cudf::size_type col_start; + __shared__ cudf::size_type col_end; + extern __shared__ cuda::std::byte shmem_agg_storage[]; + + cudf::size_type* shmem_agg_res_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size); + cudf::size_type* shmem_agg_mask_offsets = + reinterpret_cast(shmem_agg_storage + total_agg_size + offsets_size); + + if (block.thread_rank() == 0) { + col_start = 0; + col_end = 0; + } + block.sync(); + + while (col_end < num_cols) { + if (block.thread_rank() == 0) { + calculate_columns_to_aggregate(col_start, + col_end, + output_values, + num_cols, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, + cardinality, + total_agg_size); + } + block.sync(); + + initialize_shmem_aggregations(block, + col_start, + col_end, + output_values, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, + cardinality, + d_agg_kinds); + + compute_pre_aggregrations(col_start, + col_end, + row_bitmask, + skip_rows_with_nulls, + input_values, + num_rows, + local_mapping_index, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, + d_agg_kinds); + block.sync(); + + compute_final_aggregations(block, + col_start, + col_end, + input_values, + output_values, + cardinality, + global_mapping_index, + shmem_agg_storage, + shmem_agg_res_offsets, + shmem_agg_mask_offsets, + d_agg_kinds); + } +} + +constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } + +} // namespace + +size_t available_shared_memory_size(cudf::size_type grid_size) +{ + auto const active_blocks_per_sm = + cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); + + size_t dynamic_shmem_size = 0; + CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( + &dynamic_shmem_size, single_pass_shmem_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); + return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); +} + +size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; } + +void compute_shared_memory_aggs(cudf::size_type grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream) +{ + auto const shmem_size = available_shared_memory_size(grid_size); + // For each aggregation, need one offset determining where the aggregation is + // performed, another indicating the validity of the aggregation + auto const offsets_size = shmem_offsets_size(output_values.num_columns()); + // The rest of shmem is utilized for the actual arrays in shmem + CUDF_EXPECTS(shmem_size > offsets_size * 2, "No enough space for shared memory aggregations"); + auto const shmem_agg_size = shmem_size - offsets_size * 2; + single_pass_shmem_aggs_kernel<<>>( + num_input_rows, + row_bitmask, + skip_rows_with_nulls, + local_mapping_index, + global_mapping_index, + block_cardinality, + input_values, + output_values, + d_agg_kinds, + shmem_agg_size, + offsets_size); +} +} // namespace cudf::groupby::detail::hash diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp new file mode 100644 index 00000000000..7dc2b448a60 --- /dev/null +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include + +#include + +namespace cudf::groupby::detail::hash { + +size_t available_shared_memory_size(cudf::size_type grid_size); + +size_t shmem_offsets_size(cudf::size_type num_cols); + +void compute_shared_memory_aggs(cudf::size_type grid_size, + cudf::size_type num_input_rows, + bitmask_type const* row_bitmask, + bool skip_rows_with_nulls, + cudf::size_type* local_mapping_index, + cudf::size_type* global_mapping_index, + cudf::size_type* block_cardinality, + cudf::table_device_view input_values, + cudf::mutable_table_device_view output_values, + cudf::aggregation::Kind const* d_agg_kinds, + rmm::cuda_stream_view stream); + +} // namespace cudf::groupby::detail::hash From 5e85f23a4c0e93c02304497d71bfea25fcd5b888 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Thu, 24 Oct 2024 10:38:08 -0700 Subject: [PATCH 03/11] Update comments --- cpp/src/groupby/hash/single_pass_functors.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index e167df2bec4..1ca387358b8 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -23,7 +23,7 @@ #include namespace cudf::groupby::detail::hash { -// TODO: TO BE REMOVED +// TODO: TO BE REMOVED issue tracked via #17171 template __device__ constexpr bool is_supported() { From 00ed4b28436d9abf84ced95992cea0d3d7ffb3f3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 25 Oct 2024 11:32:08 -0700 Subject: [PATCH 04/11] Update cpp/src/groupby/hash/compute_shared_memory_aggs.cu Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 3e961fa1d76..9e03fef2a56 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -70,7 +70,7 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, auto const next_col_total_size = next_col_size + valid_col_size; if (bytes_allocated + next_col_total_size > total_agg_size) { - CUDF_UNREACHABLE("No enough memory space for shared memory aggregations"); + CUDF_UNREACHABLE("Not enough memory for shared memory aggregations"); } shmem_agg_res_offsets[col_end] = bytes_allocated; From 8c3f1923924d21e81cc76d015a7d26354b4a74b3 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 25 Oct 2024 13:21:32 -0700 Subject: [PATCH 05/11] Leverage existing utilities to eliminate duplication --- .../hash/compute_shared_memory_aggs.cu | 19 +++++++++++-------- cpp/src/groupby/hash/helpers.cuh | 9 --------- 2 files changed, 11 insertions(+), 17 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 9e03fef2a56..9c0044ae7f0 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -44,6 +44,9 @@ struct size_of_functor { } }; +/// Shared memory data alignment +CUDF_HOST_DEVICE cudf::size_type constexpr ALIGNMENT = 8; + // Prepares shared memory data required by each output column, exits if // no enough memory space to perform the shared memory aggregation for the // current output column @@ -59,14 +62,16 @@ __device__ void calculate_columns_to_aggregate(cudf::size_type& col_start, col_start = col_end; cudf::size_type bytes_allocated = 0; - auto const valid_col_size = round_to_multiple_of_8(sizeof(bool) * cardinality); + auto const valid_col_size = + cudf::util::round_up_safe(static_cast(sizeof(bool) * cardinality), ALIGNMENT); while (bytes_allocated < total_agg_size && col_end < output_size) { auto const col_idx = col_end; auto const next_col_size = - round_to_multiple_of_8(cudf::type_dispatcher( - output_values.column(col_idx).type(), size_of_functor{}) * - cardinality); + cudf::util::round_up_safe(cudf::type_dispatcher( + output_values.column(col_idx).type(), size_of_functor{}) * + cardinality, + ALIGNMENT); auto const next_col_total_size = next_col_size + valid_col_size; if (bytes_allocated + next_col_total_size > total_agg_size) { @@ -268,9 +273,6 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, d_agg_kinds); } } - -constexpr size_t get_previous_multiple_of_8(size_t number) { return number / 8 * 8; } - } // namespace size_t available_shared_memory_size(cudf::size_type grid_size) @@ -281,7 +283,8 @@ size_t available_shared_memory_size(cudf::size_type grid_size) size_t dynamic_shmem_size = 0; CUDF_CUDA_TRY(cudaOccupancyAvailableDynamicSMemPerBlock( &dynamic_shmem_size, single_pass_shmem_aggs_kernel, active_blocks_per_sm, GROUPBY_BLOCK_SIZE)); - return get_previous_multiple_of_8(0.5 * dynamic_shmem_size); + return cudf::util::round_down_safe(static_cast(0.5 * dynamic_shmem_size), + ALIGNMENT); } size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; } diff --git a/cpp/src/groupby/hash/helpers.cuh b/cpp/src/groupby/hash/helpers.cuh index 0d117ca35b3..00836567b4f 100644 --- a/cpp/src/groupby/hash/helpers.cuh +++ b/cpp/src/groupby/hash/helpers.cuh @@ -54,15 +54,6 @@ using shmem_extent_t = CUDF_HOST_DEVICE auto constexpr window_extent = cuco::make_window_extent(shmem_extent_t{}); -/** - * @brief Returns the smallest multiple of 8 that is greater than or equal to the given integer. - */ -CUDF_HOST_DEVICE constexpr std::size_t round_to_multiple_of_8(std::size_t num) -{ - std::size_t constexpr base = 8; - return cudf::util::div_rounding_up_safe(num, base) * base; -} - using row_hash_t = cudf::experimental::row::hash::device_row_hasher; From f0d9c5a0c35c9d13435ca6b22538bceb877535cc Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 25 Oct 2024 13:28:36 -0700 Subject: [PATCH 06/11] Minor cleanup --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 9c0044ae7f0..3fb4518a7ce 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -287,8 +287,6 @@ size_t available_shared_memory_size(cudf::size_type grid_size) ALIGNMENT); } -size_t shmem_offsets_size(cudf::size_type num_cols) { return sizeof(cudf::size_type) * num_cols; } - void compute_shared_memory_aggs(cudf::size_type grid_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, @@ -304,7 +302,7 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need one offset determining where the aggregation is // performed, another indicating the validity of the aggregation - auto const offsets_size = shmem_offsets_size(output_values.num_columns()); + auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type); // The rest of shmem is utilized for the actual arrays in shmem CUDF_EXPECTS(shmem_size > offsets_size * 2, "No enough space for shared memory aggregations"); auto const shmem_agg_size = shmem_size - offsets_size * 2; @@ -319,6 +317,6 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, output_values, d_agg_kinds, shmem_agg_size, - offsets_size); + shmem_offsets_size); } } // namespace cudf::groupby::detail::hash From 30decde06b551225306e29e6ca9b6f7984efd934 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 25 Oct 2024 13:31:48 -0700 Subject: [PATCH 07/11] Fix typos --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 3fb4518a7ce..5e42042b5ae 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -304,8 +304,9 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, // performed, another indicating the validity of the aggregation auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type); // The rest of shmem is utilized for the actual arrays in shmem - CUDF_EXPECTS(shmem_size > offsets_size * 2, "No enough space for shared memory aggregations"); - auto const shmem_agg_size = shmem_size - offsets_size * 2; + CUDF_EXPECTS(shmem_size > shmem_offsets_size * 2, + "No enough space for shared memory aggregations"); + auto const shmem_agg_size = shmem_size - shmem_offsets_size * 2; single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, From f11777439580116c1252bf6d6743e932faab3138 Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Fri, 25 Oct 2024 13:38:28 -0700 Subject: [PATCH 08/11] Pass available_shmem_size as argument to avoid redundant invocation --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 10 +++++----- cpp/src/groupby/hash/compute_shared_memory_aggs.hpp | 5 ++--- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index 5e42042b5ae..ca43613db06 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -275,7 +275,7 @@ CUDF_KERNEL void single_pass_shmem_aggs_kernel(cudf::size_type num_rows, } } // namespace -size_t available_shared_memory_size(cudf::size_type grid_size) +std::size_t available_shared_memory_size(cudf::size_type grid_size) { auto const active_blocks_per_sm = cudf::util::div_rounding_up_safe(grid_size, cudf::detail::num_multiprocessors()); @@ -288,6 +288,7 @@ size_t available_shared_memory_size(cudf::size_type grid_size) } void compute_shared_memory_aggs(cudf::size_type grid_size, + std::size_t available_shmem_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, @@ -299,15 +300,14 @@ void compute_shared_memory_aggs(cudf::size_type grid_size, cudf::aggregation::Kind const* d_agg_kinds, rmm::cuda_stream_view stream) { - auto const shmem_size = available_shared_memory_size(grid_size); // For each aggregation, need one offset determining where the aggregation is // performed, another indicating the validity of the aggregation auto const shmem_offsets_size = output_values.num_columns() * sizeof(cudf::size_type); // The rest of shmem is utilized for the actual arrays in shmem - CUDF_EXPECTS(shmem_size > shmem_offsets_size * 2, + CUDF_EXPECTS(available_shmem_size > shmem_offsets_size * 2, "No enough space for shared memory aggregations"); - auto const shmem_agg_size = shmem_size - shmem_offsets_size * 2; - single_pass_shmem_aggs_kernel<<>>( + auto const shmem_agg_size = available_shmem_size - shmem_offsets_size * 2; + single_pass_shmem_aggs_kernel<<>>( num_input_rows, row_bitmask, skip_rows_with_nulls, diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp index 7dc2b448a60..653821fd53b 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.hpp @@ -23,11 +23,10 @@ namespace cudf::groupby::detail::hash { -size_t available_shared_memory_size(cudf::size_type grid_size); - -size_t shmem_offsets_size(cudf::size_type num_cols); +std::size_t available_shared_memory_size(cudf::size_type grid_size); void compute_shared_memory_aggs(cudf::size_type grid_size, + std::size_t available_shmem_size, cudf::size_type num_input_rows, bitmask_type const* row_bitmask, bool skip_rows_with_nulls, From 8afef8ee985ae5e64b1542e05d4ede5416a86f5a Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 30 Oct 2024 14:09:05 -0700 Subject: [PATCH 09/11] Apply suggestions from code review Co-authored-by: David Wendt <45795991+davidwendt@users.noreply.github.com> Co-authored-by: Nghia Truong <7416935+ttnghia@users.noreply.github.com> --- cpp/src/groupby/hash/compute_shared_memory_aggs.cu | 4 ++-- cpp/src/groupby/hash/single_pass_functors.cuh | 10 +++------- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu index ca43613db06..12c02a1865e 100644 --- a/cpp/src/groupby/hash/compute_shared_memory_aggs.cu +++ b/cpp/src/groupby/hash/compute_shared_memory_aggs.cu @@ -99,9 +99,9 @@ __device__ void initialize_shmem_aggregations(cooperative_groups::thread_block c { for (auto col_idx = col_start; col_idx < col_end; col_idx++) { for (auto idx = block.thread_rank(); idx < cardinality; idx += block.num_threads()) { - cuda::std::byte* target = + auto target = reinterpret_cast(shmem_agg_storage + shmem_agg_res_offsets[col_idx]); - bool* target_mask = + auto target_mask = reinterpret_cast(shmem_agg_storage + shmem_agg_mask_offsets[col_idx]); cudf::detail::dispatch_type_and_aggregation(output_values.column(col_idx).type(), d_agg_kinds[col_idx], diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 1ca387358b8..5238ac75541 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -55,11 +55,11 @@ template __device__ T get_identity() { if ((k == cudf::aggregation::ARGMAX) || (k == cudf::aggregation::ARGMIN)) { - if constexpr (cudf::is_timestamp()) + if constexpr (cudf::is_timestamp()) { return k == cudf::aggregation::ARGMAX ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)} : T{typename T::duration(cudf::detail::ARGMIN_SENTINEL)}; - else { + } else { using DeviceType = cudf::device_storage_type_t; return k == cudf::aggregation::ARGMAX ? static_cast(cudf::detail::ARGMAX_SENTINEL) @@ -90,11 +90,7 @@ struct initialize_target_element(); - if (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID) { - target_mask[idx] = true; - } else { - target_mask[idx] = false; - } + target_mask[idx] = (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID); } }; From 73590710c586a58e01fab6c09a133b941ad6d95b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 30 Oct 2024 14:11:07 -0700 Subject: [PATCH 10/11] Formatting --- cpp/src/groupby/hash/single_pass_functors.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 5238ac75541..56e2a646ad8 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -90,7 +90,7 @@ struct initialize_target_element(); - target_mask[idx] = (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID); + target_mask[idx] = (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID); } }; From 7f11406ffbf800760afeb9c8a69ad68aced8755b Mon Sep 17 00:00:00 2001 From: Yunsong Wang Date: Wed, 30 Oct 2024 15:07:58 -0700 Subject: [PATCH 11/11] Apply suggestions from code review Co-authored-by: Bradley Dice --- cpp/src/groupby/hash/single_pass_functors.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/groupby/hash/single_pass_functors.cuh b/cpp/src/groupby/hash/single_pass_functors.cuh index 56e2a646ad8..28a5b578e00 100644 --- a/cpp/src/groupby/hash/single_pass_functors.cuh +++ b/cpp/src/groupby/hash/single_pass_functors.cuh @@ -54,7 +54,7 @@ identity_from_operator() template __device__ T get_identity() { - if ((k == cudf::aggregation::ARGMAX) || (k == cudf::aggregation::ARGMIN)) { + if ((k == cudf::aggregation::ARGMAX) or (k == cudf::aggregation::ARGMIN)) { if constexpr (cudf::is_timestamp()) { return k == cudf::aggregation::ARGMAX ? T{typename T::duration(cudf::detail::ARGMAX_SENTINEL)} @@ -90,7 +90,7 @@ struct initialize_target_element(); - target_mask[idx] = (k == cudf::aggregation::COUNT_ALL || k == cudf::aggregation::COUNT_VALID); + target_mask[idx] = (k == cudf::aggregation::COUNT_ALL) or (k == cudf::aggregation::COUNT_VALID); } };