From a0d1f0186bf3f7ef4cc4a31ffea07128cc3fbec5 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Mon, 15 Jul 2024 14:21:29 -0700
Subject: [PATCH 001/126] add a create_graph_from_edgelist function that takes
 edge list in multiple chunks

---
 cpp/include/cugraph/graph_functions.hpp       |   66 +
 .../create_graph_from_edgelist_impl.cuh       | 1198 +++++++++++++----
 .../create_graph_from_edgelist_mg_v32_e32.cu  |   92 +-
 .../create_graph_from_edgelist_mg_v32_e64.cu  |   91 +-
 .../create_graph_from_edgelist_mg_v64_e64.cu  |   92 +-
 .../create_graph_from_edgelist_sg_v32_e32.cu  |   92 +-
 .../create_graph_from_edgelist_sg_v32_e64.cu  |   92 +-
 .../create_graph_from_edgelist_sg_v64_e64.cu  |   92 +-
 8 files changed, 1541 insertions(+), 274 deletions(-)

diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp
index 79ff576571e..e1364f69991 100644
--- a/cpp/include/cugraph/graph_functions.hpp
+++ b/cpp/include/cugraph/graph_functions.hpp
@@ -730,6 +730,72 @@ create_graph_from_edgelist(raft::handle_t const& handle,
                            bool renumber,
                            bool do_expensive_check = false);
 
+/**
+ * @brief Create a graph from (the optional vertex list and) the given edge list (with optional edge
+ * IDs and types).
+ *
+ * This version takes the edge list in multiple chunks (e.g. edge data from multiple files).
+ *
+ * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type.
+ * @tparam edge_t Type of edge identifiers. Needs to be an integral type.
+ * @tparam weight_t Type of edge weight. Needs to be a floating point type.
+ * @tparam edge_id_t Type of edge id. Needs to be an integral type.
+ * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is
+ * supported.
+ * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if
+ * true) as major indices in storing edges using a 2D sparse matrix.
+ * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false)
+ * or multi-GPU (true).
+ * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and
+ * handles to various CUDA libraries) to run graph algorithms.
+ * @param vertices If valid, part of the entire set of vertices in the graph to be renumbered.
+ * This parameter can be used to include isolated vertices. If @p renumber is false and @p vertices
+ * is valid, @p vertices elements should be consecutive integers starting from 0. If multi-GPU,
+ * applying the compute_gpu_id_from_vertex_t to every vertex should return the local GPU ID for this
+ * function to work (vertices should be pre-shuffled).
+ * @param edgelist_srcs Vectors of edge source vertex IDs. If multi-GPU, applying the
+ * compute_gpu_id_from_ext_edge_endpoints_t to every edge should return the local GPU ID for this
+ * function to work (edges should be pre-shuffled).
+ * @param edgelist_dsts Vectors of edge destination vertex IDs.
+ * @param edgelist_weights Vectors of weight values for edges.
+ * @param edgelist_edge_ids Vectors of edge_id values for edges.
+ * @param edgelist_edge_types Vectors of edge_type values for edges.
+ * @param graph_properties Properties of the graph represented by the input (optional vertex list
+ * and) edge list.
+ * @param renumber Flag indicating whether to renumber vertices or not (must be true if @p multi_gpu
+ * is true).
+ * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`).
+ * @return Tuple of the generated graph and optional edge_property_t objects storing the provided + * edge properties and a renumber map (if @p renumber is true). + */ +template +std::tuple< + graph_t, + std::optional< + edge_property_t, weight_t>>, + std::optional< + edge_property_t, edge_id_t>>, + std::optional< + edge_property_t, edge_type_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check = false); + /** * @brief Find all 2-hop neighbors in the graph * diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 1c15842982b..1a92b9221fe 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -317,206 +317,57 @@ std::enable_if_t< std::optional< edge_property_t, edge_type_t>>, std::optional>>> -create_graph_from_edgelist_impl( +create_graph_from_partitioned_edgelist( raft::handle_t const& handle, std::optional>&& local_vertices, - rmm::device_uvector&& edgelist_srcs, - rmm::device_uvector&& edgelist_dsts, - std::optional>&& edgelist_weights, - std::optional>&& edgelist_edge_ids, - std::optional>&& edgelist_edge_types, + std::vector>&& edge_partition_edgelist_srcs, + std::vector>&& edge_partition_edgelist_dsts, + std::optional>>&& edge_partition_edgelist_weights, + std::optional>>&& edge_partition_edgelist_edge_ids, + std::optional>>&& edge_partition_edgelist_edge_types, + std::vector> const& edgelist_intra_partition_segment_offsets, graph_properties_t graph_properties, - bool renumber, - bool do_expensive_check) + bool renumber) { auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_size = major_comm.get_size(); auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - CUGRAPH_EXPECTS(edgelist_srcs.size() == edgelist_dsts.size(), - "Invalid input arguments: edgelist_srcs.size() != edgelist_dsts.size()."); - CUGRAPH_EXPECTS(!edgelist_weights || (edgelist_srcs.size() == (*edgelist_weights).size()), - "Invalid input arguments: edgelist_srcs.size() != edgelist_weights.size()."); - CUGRAPH_EXPECTS(!edgelist_edge_ids || (edgelist_srcs.size() == (*edgelist_edge_ids).size()), - "Invalid input arguments: edgelist_srcs.size() != " - "std::get<0>((*edgelist_edge_ids)).size()."); - CUGRAPH_EXPECTS(!edgelist_edge_types || (edgelist_srcs.size() == (*edgelist_edge_types).size()), - "Invalid input arguments: edgelist_srcs.size() != " - "std::get<1>((*edgelist_edge_types)).size()."); - CUGRAPH_EXPECTS(renumber, - "Invalid input arguments: renumber should be true if multi_gpu is true."); - - if (do_expensive_check) { - expensive_check_edgelist(handle, - local_vertices, - store_transposed ? edgelist_dsts : edgelist_srcs, - store_transposed ? 
edgelist_srcs : edgelist_dsts, - renumber); - - if (graph_properties.is_symmetric) { - CUGRAPH_EXPECTS( - (check_symmetric( - handle, - raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), - raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " - "not symmetric."); - } - - if (!graph_properties.is_multigraph) { - CUGRAPH_EXPECTS( - check_no_parallel_edge( - handle, - raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), - raft::device_span(edgelist_dsts.data(), edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " - "has parallel edges."); - } - } - - // 1. groupby edges to their target local adjacency matrix partition (and further groupby within - // the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex IDs). - - auto d_edge_counts = cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( - handle, - store_transposed ? edgelist_dsts : edgelist_srcs, - store_transposed ? edgelist_srcs : edgelist_dsts, - edgelist_weights, - edgelist_edge_ids, - edgelist_edge_types, - true); - - std::vector h_edge_counts(d_edge_counts.size()); - raft::update_host( - h_edge_counts.data(), d_edge_counts.data(), d_edge_counts.size(), handle.get_stream()); - handle.sync_stream(); + // 1. renumber std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); - auto edgelist_intra_partition_segment_offsets = - std::make_optional>>( - minor_comm_size, std::vector(major_comm_size + 1, edge_t{0})); - for (int i = 0; i < minor_comm_size; ++i) { - edgelist_edge_counts[i] = std::accumulate(h_edge_counts.begin() + major_comm_size * i, - h_edge_counts.begin() + major_comm_size * (i + 1), - edge_t{0}); - std::partial_sum(h_edge_counts.begin() + major_comm_size * i, - h_edge_counts.begin() + major_comm_size * (i + 1), - (*edgelist_intra_partition_segment_offsets)[i].begin() + 1); - } - std::vector edgelist_displacements(minor_comm_size, edge_t{0}); - std::partial_sum(edgelist_edge_counts.begin(), - edgelist_edge_counts.end() - 1, - edgelist_displacements.begin() + 1); - - // 2. 
split the input edges to local partitions - - std::vector> edge_partition_edgelist_srcs{}; - edge_partition_edgelist_srcs.reserve(minor_comm_size); - for (int i = 0; i < minor_comm_size; ++i) { - rmm::device_uvector tmp_srcs(edgelist_edge_counts[i], handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - edgelist_srcs.begin() + edgelist_displacements[i], - edgelist_srcs.begin() + edgelist_displacements[i] + edgelist_edge_counts[i], - tmp_srcs.begin()); - edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); - } - edgelist_srcs.resize(0, handle.get_stream()); - edgelist_srcs.shrink_to_fit(handle.get_stream()); - - std::vector> edge_partition_edgelist_dsts{}; - edge_partition_edgelist_dsts.reserve(minor_comm_size); - for (int i = 0; i < minor_comm_size; ++i) { - rmm::device_uvector tmp_dsts(edgelist_edge_counts[i], handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - edgelist_dsts.begin() + edgelist_displacements[i], - edgelist_dsts.begin() + edgelist_displacements[i] + edgelist_edge_counts[i], - tmp_dsts.begin()); - edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); - } - edgelist_dsts.resize(0, handle.get_stream()); - edgelist_dsts.shrink_to_fit(handle.get_stream()); - - std::optional>> edge_partition_edgelist_weights{}; - if (edgelist_weights) { - edge_partition_edgelist_weights = std::vector>{}; - (*edge_partition_edgelist_weights).reserve(minor_comm_size); - for (int i = 0; i < minor_comm_size; ++i) { - rmm::device_uvector tmp_weights(edgelist_edge_counts[i], handle.get_stream()); - thrust::copy( - handle.get_thrust_policy(), - (*edgelist_weights).begin() + edgelist_displacements[i], - (*edgelist_weights).begin() + edgelist_displacements[i] + edgelist_edge_counts[i], - tmp_weights.begin()); - (*edge_partition_edgelist_weights).push_back(std::move(tmp_weights)); - } - (*edgelist_weights).resize(0, handle.get_stream()); - (*edgelist_weights).shrink_to_fit(handle.get_stream()); - } - - std::optional>> edge_partition_edgelist_edge_ids{}; - if (edgelist_edge_ids) { - edge_partition_edgelist_edge_ids = std::vector>{}; - (*edge_partition_edgelist_edge_ids).reserve(minor_comm_size); - for (int i = 0; i < minor_comm_size; ++i) { - rmm::device_uvector tmp_edge_ids(edgelist_edge_counts[i], handle.get_stream()); - thrust::copy( - handle.get_thrust_policy(), - (*edgelist_edge_ids).begin() + edgelist_displacements[i], - (*edgelist_edge_ids).begin() + edgelist_displacements[i] + edgelist_edge_counts[i], - tmp_edge_ids.begin()); - (*edge_partition_edgelist_edge_ids).push_back(std::move(tmp_edge_ids)); - } - (*edgelist_edge_ids).resize(0, handle.get_stream()); - (*edgelist_edge_ids).shrink_to_fit(handle.get_stream()); - } - - std::optional>> edge_partition_edgelist_edge_types{}; - if (edgelist_edge_types) { - edge_partition_edgelist_edge_types = std::vector>{}; - (*edge_partition_edgelist_edge_types).reserve(minor_comm_size); - for (int i = 0; i < minor_comm_size; ++i) { - rmm::device_uvector tmp_edge_types(edgelist_edge_counts[i], handle.get_stream()); - thrust::copy( - handle.get_thrust_policy(), - (*edgelist_edge_types).begin() + edgelist_displacements[i], - (*edgelist_edge_types).begin() + edgelist_displacements[i] + edgelist_edge_counts[i], - tmp_edge_types.begin()); - (*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); - } - (*edgelist_edge_types).resize(0, handle.get_stream()); - (*edgelist_edge_types).shrink_to_fit(handle.get_stream()); + for (size_t i = 0; i < edgelist_edge_counts.size(); ++i) { + edgelist_edge_counts[i] = 
static_cast(edge_partition_edgelist_srcs[i].size()); } - // 3. renumber - std::vector src_ptrs(minor_comm_size); std::vector dst_ptrs(src_ptrs.size()); for (int i = 0; i < minor_comm_size; ++i) { src_ptrs[i] = edge_partition_edgelist_srcs[i].begin(); dst_ptrs[i] = edge_partition_edgelist_dsts[i].begin(); } - auto [renumber_map_labels, meta] = cugraph::renumber_edgelist( - handle, - std::move(local_vertices), - src_ptrs, - dst_ptrs, - edgelist_edge_counts, - edgelist_intra_partition_segment_offsets, - store_transposed); + auto [renumber_map_labels, meta] = + cugraph::renumber_edgelist(handle, + std::move(local_vertices), + src_ptrs, + dst_ptrs, + edgelist_edge_counts, + edgelist_intra_partition_segment_offsets, + store_transposed); auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); auto use_dcs = num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2); - // 4. sort and compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid + // 2. sort and compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid auto total_global_mem = handle.get_device_properties().totalGlobalMem; size_t element_size = sizeof(vertex_t) * 2; - if (edgelist_weights) { element_size += sizeof(weight_t); } - if (edgelist_edge_ids) { element_size += sizeof(edge_id_t); } - if (edgelist_edge_types) { element_size += sizeof(edge_type_t); } + if (edge_partition_edgelist_weights) { element_size += sizeof(weight_t); } + if (edge_partition_edgelist_edge_ids) { element_size += sizeof(edge_id_t); } + if (edge_partition_edgelist_edge_types) { element_size += sizeof(edge_type_t); } auto constexpr mem_frugal_ratio = 0.25; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the // total_global_mem, switch to the memory frugal approach @@ -567,9 +418,9 @@ create_graph_from_edgelist_impl( meta.edge_partition_segment_offsets[num_segments_per_vertex_partition * i + detail::num_sparse_segments_per_vertex_partition]) : std::nullopt; - if (edgelist_weights) { - if (edgelist_edge_ids) { - if (edgelist_edge_types) { + if (edge_partition_edgelist_weights) { + if (edge_partition_edgelist_edge_ids) { + if (edge_partition_edgelist_edge_types) { std::forward_as_tuple( offsets, indices, std::tie(weights, edge_ids, edge_types), dcs_nzd_vertices) = detail::sort_and_compress_edgelist( std::move(edge_partition_edgelist_srcs[i]), @@ -715,7 +566,7 @@ create_graph_from_edgelist_impl( } } - // 5. segmented sort neighbors + // 3. segmented sort neighbors for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { if (edge_partition_weights) { @@ -801,46 +652,43 @@ create_graph_from_edgelist_impl( } } - // 6. create a graph and an edge_property_t object. + // 4. create a graph and an edge_property_t object. 
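// The sort-and-compress step (2. above) turns each local COO edge partition into CSR (or CSC)
// before the graph and edge_property_t objects are assembled below. The following is only a
// minimal, conceptual single-partition sketch of that conversion using Thrust; the actual
// detail::sort_and_compress_edgelist also handles the DCSR (hypersparse) segment, attached edge
// properties, and the memory-frugal path selected via mem_frugal_threshold.
//
// #include <thrust/binary_search.h>
// #include <thrust/device_vector.h>
// #include <thrust/sequence.h>
// #include <thrust/sort.h>
// #include <cstdint>
//
// void coo_to_csr_sketch(thrust::device_vector<int32_t>& majors,   // sources if !store_transposed
//                        thrust::device_vector<int32_t>& minors,   // destinations if !store_transposed
//                        int32_t num_vertices,
//                        thrust::device_vector<int32_t>& offsets,  // out: CSR offsets, size num_vertices + 1
//                        thrust::device_vector<int32_t>& indices)  // out: CSR indices, size majors.size()
// {
//   // sort edges by major vertex ID; minors are permuted along with their majors
//   thrust::sort_by_key(majors.begin(), majors.end(), minors.begin());
//
//   // offsets[v] = number of edges whose major ID is < v (vectorized binary search)
//   offsets.resize(num_vertices + 1);
//   thrust::device_vector<int32_t> vertex_ids(num_vertices + 1);
//   thrust::sequence(vertex_ids.begin(), vertex_ids.end(), int32_t{0});
//   thrust::lower_bound(
//     majors.begin(), majors.end(), vertex_ids.begin(), vertex_ids.end(), offsets.begin());
//
//   // the sorted minor vertex IDs become the CSR column indices
//   indices = minors;
// }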
- std::optional< - edge_property_t, weight_t>> + std::optional, weight_t>> edge_weights{std::nullopt}; if (edge_partition_weights) { edge_weights = - edge_property_t, weight_t>( + edge_property_t, weight_t>( std::move(*edge_partition_weights)); } - std::optional< - edge_property_t, edge_id_t>> + std::optional, edge_id_t>> edge_ids{std::nullopt}; if (edge_partition_edge_ids) { - edge_ids = - edge_property_t, edge_id_t>( - std::move(*edge_partition_edge_ids)); + edge_ids = edge_property_t, edge_id_t>( + std::move(*edge_partition_edge_ids)); } std::optional< - edge_property_t, edge_type_t>> + edge_property_t, edge_type_t>> edge_types{std::nullopt}; if (edge_partition_edge_types) { edge_types = - edge_property_t, edge_type_t>( + edge_property_t, edge_type_t>( std::move(*edge_partition_edge_types)); } return std::make_tuple( - cugraph::graph_t( + cugraph::graph_t( handle, std::move(edge_partition_offsets), std::move(edge_partition_indices), std::move(edge_partition_dcs_nzd_vertices), - cugraph::graph_meta_t{meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.edge_partition_segment_offsets}), + cugraph::graph_meta_t{meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.edge_partition_segment_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -855,7 +703,7 @@ template std::enable_if_t< - !multi_gpu, + multi_gpu, std::tuple< cugraph::graph_t, std::optional< @@ -867,7 +715,7 @@ std::enable_if_t< std::optional>>> create_graph_from_edgelist_impl( raft::handle_t const& handle, - std::optional>&& vertices, + std::optional>&& local_vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -877,24 +725,28 @@ create_graph_from_edgelist_impl( bool renumber, bool do_expensive_check) { - CUGRAPH_EXPECTS( - !vertices || ((*vertices).size() < static_cast(std::numeric_limits::max())), - "Invalid input arguments: # unique vertex IDs should be smaller than " - "std::numeric_limits::Max()."); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + CUGRAPH_EXPECTS(edgelist_srcs.size() == edgelist_dsts.size(), "Invalid input arguments: edgelist_srcs.size() != edgelist_dsts.size()."); CUGRAPH_EXPECTS(!edgelist_weights || (edgelist_srcs.size() == (*edgelist_weights).size()), - "Invalid input arguments: edgelist_srcs.size() != edgelist_weights.size()."); + "Invalid input arguments: edgelist_weights.has_value() is true and " + "edgelist_srcs.size() != (*edgelist_weights).size()."); CUGRAPH_EXPECTS(!edgelist_edge_ids || (edgelist_srcs.size() == (*edgelist_edge_ids).size()), - "Invalid input arguments: edgelist_srcs.size() != " - "(*edgelist_edge_ids).size()."); + "Invalid input arguments: edgelist_edge_ids.has_value() is true and " + "edgelist_srcs.size() != (*edgelist_edge_ids).size()."); CUGRAPH_EXPECTS(!edgelist_edge_types || (edgelist_srcs.size() == (*edgelist_edge_types).size()), - "Invalid input arguments: edgelist_srcs.size() != " - "(*edgelist_edge_types).size()."); + "Invalid input arguments: edgelist_edge_types.has_value() is true, " + "edgelist_srcs.size() != (*edgelist_edge_types).size()."); + CUGRAPH_EXPECTS(renumber, + "Invalid input arguments: renumber should be true if multi_gpu is true."); if 
(do_expensive_check) { expensive_check_edgelist(handle, - vertices, + local_vertices, store_transposed ? edgelist_dsts : edgelist_srcs, store_transposed ? edgelist_srcs : edgelist_dsts, renumber); @@ -920,50 +772,660 @@ create_graph_from_edgelist_impl( } } - // renumber + // 1. groupby edges to their target local adjacency matrix partition (and further groupby within + // the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex IDs). - auto renumber_map_labels = - renumber ? std::make_optional>(0, handle.get_stream()) - : std::nullopt; - renumber_meta_t meta{}; - if (renumber) { - std::tie(*renumber_map_labels, meta) = cugraph::renumber_edgelist( - handle, - std::move(vertices), - edgelist_srcs.data(), - edgelist_dsts.data(), - static_cast(edgelist_srcs.size()), - store_transposed); - } + auto d_edge_counts = cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( + handle, + store_transposed ? edgelist_dsts : edgelist_srcs, + store_transposed ? edgelist_srcs : edgelist_dsts, + edgelist_weights, + edgelist_edge_ids, + edgelist_edge_types, + true); - vertex_t num_vertices{}; - if (renumber) { - num_vertices = static_cast((*renumber_map_labels).size()); - } else { - if (vertices) { - num_vertices = (*vertices).size(); - } else { - num_vertices = 1 + cugraph::detail::compute_maximum_vertex_id( - handle.get_stream(), edgelist_srcs, edgelist_dsts); - } - } + std::vector h_edge_counts(d_edge_counts.size()); + raft::update_host( + h_edge_counts.data(), d_edge_counts.data(), d_edge_counts.size(), handle.get_stream()); + handle.sync_stream(); - // convert edge list (COO) to compressed sparse format (CSR or CSC) + std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); + auto edgelist_intra_partition_segment_offsets = std::vector>( + minor_comm_size, std::vector(major_comm_size + 1, edge_t{0})); + for (int i = 0; i < minor_comm_size; ++i) { + edgelist_edge_counts[i] = std::accumulate(h_edge_counts.begin() + major_comm_size * i, + h_edge_counts.begin() + major_comm_size * (i + 1), + edge_t{0}); + std::partial_sum(h_edge_counts.begin() + major_comm_size * i, + h_edge_counts.begin() + major_comm_size * (i + 1), + edgelist_intra_partition_segment_offsets[i].begin() + 1); + } + std::vector edgelist_displacements(minor_comm_size, edge_t{0}); + std::partial_sum(edgelist_edge_counts.begin(), + edgelist_edge_counts.end() - 1, + edgelist_displacements.begin() + 1); - auto total_global_mem = handle.get_device_properties().totalGlobalMem; - size_t element_size = sizeof(vertex_t) * 2; - if (edgelist_weights) { element_size += sizeof(weight_t); } - if (edgelist_edge_ids) { element_size += sizeof(edge_id_t); } - if (edgelist_edge_types) { element_size += sizeof(edge_type_t); } - auto constexpr mem_frugal_ratio = - 0.25; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the - // total_global_mem, switch to the memory frugal approach - auto mem_frugal_threshold = - static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); + // 2. 
split the input edges to local partitions - rmm::device_uvector offsets(size_t{0}, handle.get_stream()); - rmm::device_uvector indices(size_t{0}, handle.get_stream()); - std::optional> weights{std::nullopt}; + std::vector> edge_partition_edgelist_srcs{}; + edge_partition_edgelist_srcs.reserve(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_srcs(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_srcs.begin() + edgelist_displacements[i], + edgelist_srcs.begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_srcs.begin()); + edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); + } + edgelist_srcs.resize(0, handle.get_stream()); + edgelist_srcs.shrink_to_fit(handle.get_stream()); + + std::vector> edge_partition_edgelist_dsts{}; + edge_partition_edgelist_dsts.reserve(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_dsts(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_dsts.begin() + edgelist_displacements[i], + edgelist_dsts.begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_dsts.begin()); + edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); + } + edgelist_dsts.resize(0, handle.get_stream()); + edgelist_dsts.shrink_to_fit(handle.get_stream()); + + std::optional>> edge_partition_edgelist_weights{}; + if (edgelist_weights) { + edge_partition_edgelist_weights = std::vector>{}; + (*edge_partition_edgelist_weights).reserve(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_weights(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), + (*edgelist_weights).begin() + edgelist_displacements[i], + (*edgelist_weights).begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_weights.begin()); + (*edge_partition_edgelist_weights).push_back(std::move(tmp_weights)); + } + (*edgelist_weights).resize(0, handle.get_stream()); + (*edgelist_weights).shrink_to_fit(handle.get_stream()); + } + + std::optional>> edge_partition_edgelist_edge_ids{}; + if (edgelist_edge_ids) { + edge_partition_edgelist_edge_ids = std::vector>{}; + (*edge_partition_edgelist_edge_ids).reserve(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_edge_ids(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), + (*edgelist_edge_ids).begin() + edgelist_displacements[i], + (*edgelist_edge_ids).begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_edge_ids.begin()); + (*edge_partition_edgelist_edge_ids).push_back(std::move(tmp_edge_ids)); + } + (*edgelist_edge_ids).resize(0, handle.get_stream()); + (*edgelist_edge_ids).shrink_to_fit(handle.get_stream()); + } + + std::optional>> edge_partition_edgelist_edge_types{}; + if (edgelist_edge_types) { + edge_partition_edgelist_edge_types = std::vector>{}; + (*edge_partition_edgelist_edge_types).reserve(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_edge_types(edgelist_edge_counts[i], handle.get_stream()); + thrust::copy( + handle.get_thrust_policy(), + (*edgelist_edge_types).begin() + edgelist_displacements[i], + (*edgelist_edge_types).begin() + edgelist_displacements[i] + edgelist_edge_counts[i], + tmp_edge_types.begin()); + (*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); + } + (*edgelist_edge_types).resize(0, 
handle.get_stream()); + (*edgelist_edge_types).shrink_to_fit(handle.get_stream()); + } + + return create_graph_from_partitioned_edgelist( + handle, + std::move(local_vertices), + std::move(edge_partition_edgelist_srcs), + std::move(edge_partition_edgelist_dsts), + std::move(edge_partition_edgelist_weights), + std::move(edge_partition_edgelist_edge_ids), + std::move(edge_partition_edgelist_edge_types), + edgelist_intra_partition_segment_offsets, + graph_properties, + renumber); +} + +template +std::enable_if_t< + multi_gpu, + std::tuple< + cugraph::graph_t, + std::optional< + edge_property_t, weight_t>>, + std::optional< + edge_property_t, edge_id_t>>, + std::optional< + edge_property_t, edge_type_t>>, + std::optional>>> +create_graph_from_edgelist_impl( + raft::handle_t const& handle, + std::optional>&& local_vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check) +{ + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + CUGRAPH_EXPECTS(edgelist_srcs.size() == edgelist_dsts.size(), + "Invalid input arguments: edgelist_srcs.size() != edgelist_dsts.size()."); + CUGRAPH_EXPECTS(!edgelist_weights || (edgelist_srcs.size() == (*edgelist_weights).size()), + "Invalid input arguments: edgelist_weights.has_value() is true and " + "edgelist_srcs.size() != (*edgelist_weights).size()."); + CUGRAPH_EXPECTS(!edgelist_edge_ids || (edgelist_srcs.size() == (*edgelist_edge_ids).size()), + "Invalid input arguments: edgelist_edge_ids.has_value() is true and " + "edgelist_srcs.size() != (*edgelist_edge_ids).size()."); + CUGRAPH_EXPECTS(!edgelist_edge_types || (edgelist_srcs.size() == (*edgelist_edge_types).size()), + "Invalid input arguments: edgelist_edge_types.has_value() is true, " + "edgelist_srcs.size() != (*edgelist_edge_types).size()."); + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + CUGRAPH_EXPECTS(edgelist_srcs[i].size() == edgelist_dsts[i].size(), + "Invalid input arguments: edgelist_srcs[i].size() != edgelist_dsts[i].size()."); + CUGRAPH_EXPECTS(!edgelist_weights || (edgelist_srcs[i].size() == (*edgelist_weights)[i].size()), + "Invalid input arguments: edgelist_weights.has_value() is true and " + "edgelist_srcs[i].size() != (*edgelist_weights)[i].size()."); + CUGRAPH_EXPECTS( + !edgelist_edge_ids || (edgelist_srcs[i].size() == (*edgelist_edge_ids)[i].size()), + "Invalid input arguments: edgelist_edge_ids.has_value() is true and " + "edgelist_srcs[i].size() != (*edgelist_edge_ids)[i].size()."); + CUGRAPH_EXPECTS( + !edgelist_edge_types || (edgelist_srcs[i].size() == (*edgelist_edge_types)[i].size()), + "Invalid input arguments: edgelist_edge_types.has_value() is true, " + "edgelist_srcs[i].size() != (*edgelist_edge_types)[i].size()."); + } + CUGRAPH_EXPECTS(renumber, + "Invalid input arguments: renumber should be true if multi_gpu is true."); + + if (do_expensive_check) { + edge_t aggregate_edge_count{0}; + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + aggregate_edge_count += edgelist_srcs[i].size(); + } + + rmm::device_uvector aggregate_edgelist_srcs(aggregate_edge_count, + handle.get_stream()); + rmm::device_uvector 
aggregate_edgelist_dsts(aggregate_edge_count, + handle.get_stream()); + edge_t output_offset{0}; + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + edgelist_srcs[i].begin(), + edgelist_srcs[i].end(), + aggregate_edgelist_srcs.begin() + output_offset); + thrust::copy(handle.get_thrust_policy(), + edgelist_dsts[i].begin(), + edgelist_dsts[i].end(), + aggregate_edgelist_dsts.begin() + output_offset); + output_offset += edgelist_srcs[i].size(); + } + + expensive_check_edgelist( + handle, + local_vertices, + store_transposed ? aggregate_edgelist_dsts : aggregate_edgelist_srcs, + store_transposed ? aggregate_edgelist_srcs : aggregate_edgelist_dsts, + renumber); + + if (graph_properties.is_symmetric) { + CUGRAPH_EXPECTS( + (check_symmetric( + handle, + raft::device_span(aggregate_edgelist_srcs.data(), + aggregate_edgelist_srcs.size()), + raft::device_span(aggregate_edgelist_dsts.data(), + aggregate_edgelist_dsts.size()))), + "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " + "not symmetric."); + } + + if (!graph_properties.is_multigraph) { + CUGRAPH_EXPECTS( + check_no_parallel_edge(handle, + raft::device_span(aggregate_edgelist_srcs.data(), + aggregate_edgelist_srcs.size()), + raft::device_span(aggregate_edgelist_dsts.data(), + aggregate_edgelist_dsts.size())), + "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "has parallel edges."); + } + } + + // 1. groupby each edge chunks to their target local adjacency matrix partition (and further + // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex + // IDs). + + std::vector>> edgelist_partitioned_srcs( + edgelist_srcs.size()); + std::vector>> edgelist_partitioned_dsts( + edgelist_srcs.size()); + auto edgelist_partitioned_weights = + edgelist_weights ? std::make_optional>>>( + edgelist_srcs.size()) + : std::nullopt; + auto edgelist_partitioned_edge_ids = + edgelist_edge_ids + ? std::make_optional>>>( + edgelist_srcs.size()) + : std::nullopt; + auto edgelist_partitioned_edge_types = + edgelist_edge_types + ? std::make_optional>>>( + edgelist_srcs.size()) + : std::nullopt; + + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { // iterate over input edge chunks + std::optional> this_chunk_weights{std::nullopt}; + if (edgelist_weights) { this_chunk_weights = std::move((*edgelist_weights)[i]); } + std::optional> this_chunk_edge_ids{std::nullopt}; + if (edgelist_edge_ids) { this_chunk_edge_ids = std::move((*edgelist_edge_ids)[i]); } + std::optional> this_chunk_edge_types{std::nullopt}; + if (edgelist_edge_types) { this_chunk_edge_types = std::move((*edgelist_edge_types)[i]); } + auto d_this_chunk_edge_counts = + cugraph::detail::groupby_and_count_edgelist_by_local_partition_id( + handle, + store_transposed ? edgelist_dsts[i] : edgelist_srcs[i], + store_transposed ? 
edgelist_srcs[i] : edgelist_dsts[i], + this_chunk_weights, + this_chunk_edge_ids, + this_chunk_edge_types, + true); + + std::vector h_this_chunk_edge_counts(d_this_chunk_edge_counts.size()); + raft::update_host(h_this_chunk_edge_counts.data(), + d_this_chunk_edge_counts.data(), + d_this_chunk_edge_counts.size(), + handle.get_stream()); + handle.sync_stream(); + std::vector h_this_chunk_edge_displacements(h_this_chunk_edge_counts.size()); + std::exclusive_scan(h_this_chunk_edge_counts.begin(), + h_this_chunk_edge_counts.end(), + h_this_chunk_edge_displacements.begin(), + size_t{0}); + + for (int j = 0; j < minor_comm_size /* # local edge partitions */ * + major_comm_size /* # segments in the local minor range */; + ++j) { + rmm::device_uvector tmp_srcs(h_this_chunk_edge_counts[j], handle.get_stream()); + auto input_first = edgelist_srcs[i].begin() + h_this_chunk_edge_displacements[j]; + thrust::copy( + handle.get_thrust_policy(), input_first, input_first + tmp_srcs.size(), tmp_srcs.begin()); + edgelist_partitioned_srcs[i].push_back(std::move(tmp_srcs)); + } + edgelist_srcs[i].resize(0, handle.get_stream()); + edgelist_srcs[i].shrink_to_fit(handle.get_stream()); + + for (int j = 0; j < minor_comm_size /* # local edge partitions */ * + major_comm_size /* # segments in the local minor range */; + ++j) { + rmm::device_uvector tmp_dsts(h_this_chunk_edge_counts[j], handle.get_stream()); + auto input_first = edgelist_dsts[i].begin() + h_this_chunk_edge_displacements[j]; + thrust::copy( + handle.get_thrust_policy(), input_first, input_first + tmp_dsts.size(), tmp_dsts.begin()); + edgelist_partitioned_dsts[i].push_back(std::move(tmp_dsts)); + } + edgelist_dsts[i].resize(0, handle.get_stream()); + edgelist_dsts[i].shrink_to_fit(handle.get_stream()); + + if (this_chunk_weights) { + for (int j = 0; j < minor_comm_size /* # local edge partitions */ * + major_comm_size /* # segments in the local minor range */; + ++j) { + rmm::device_uvector tmp_weights(h_this_chunk_edge_counts[j], handle.get_stream()); + auto input_first = (*this_chunk_weights).begin() + h_this_chunk_edge_displacements[j]; + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + tmp_weights.size(), + tmp_weights.begin()); + (*edgelist_partitioned_weights)[i].push_back(std::move(tmp_weights)); + } + (*this_chunk_weights).resize(0, handle.get_stream()); + (*this_chunk_weights).shrink_to_fit(handle.get_stream()); + } + + if (this_chunk_edge_ids) { + for (int j = 0; j < minor_comm_size /* # local edge partitions */ * + major_comm_size /* # segments in the local minor range */; + ++j) { + rmm::device_uvector tmp_edge_ids(h_this_chunk_edge_counts[j], + handle.get_stream()); + auto input_first = (*this_chunk_edge_ids).begin() + h_this_chunk_edge_displacements[j]; + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + tmp_edge_ids.size(), + tmp_edge_ids.begin()); + (*edgelist_partitioned_edge_ids)[i].push_back(std::move(tmp_edge_ids)); + } + (*this_chunk_edge_ids).resize(0, handle.get_stream()); + (*this_chunk_edge_ids).shrink_to_fit(handle.get_stream()); + } + + if (this_chunk_edge_types) { + for (int j = 0; j < minor_comm_size /* # local edge partitions */ * + major_comm_size /* # segments in the local minor range */; + ++j) { + rmm::device_uvector tmp_edge_types(h_this_chunk_edge_counts[j], + handle.get_stream()); + auto input_first = (*this_chunk_edge_types).begin() + h_this_chunk_edge_displacements[j]; + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + tmp_edge_types.size(), + 
tmp_edge_types.begin()); + (*edgelist_partitioned_edge_types)[i].push_back(std::move(tmp_edge_types)); + } + (*this_chunk_edge_types).resize(0, handle.get_stream()); + (*this_chunk_edge_types).shrink_to_fit(handle.get_stream()); + } + } + edgelist_srcs.clear(); + edgelist_dsts.clear(); + if (edgelist_weights) { (*edgelist_weights).clear(); } + if (edgelist_edge_ids) { (*edgelist_edge_ids).clear(); } + if (edgelist_edge_types) { (*edgelist_edge_types).clear(); } + + // 2. split the grouped edge chunks to local partitions + + auto edgelist_intra_partition_segment_offsets = std::vector>(minor_comm_size); + + std::vector> edge_partition_edgelist_srcs{}; + edge_partition_edgelist_srcs.reserve(minor_comm_size); + std::vector> edge_partition_edgelist_dsts{}; + edge_partition_edgelist_dsts.reserve(minor_comm_size); + auto edge_partition_edgelist_weights = + edgelist_partitioned_weights ? std::make_optional>>() + : std::nullopt; + if (edgelist_partitioned_weights) { (*edge_partition_edgelist_weights).reserve(minor_comm_size); } + auto edge_partition_edgelist_edge_ids = + edgelist_partitioned_edge_ids + ? std::make_optional>>() + : std::nullopt; + if (edgelist_partitioned_edge_ids) { + (*edge_partition_edgelist_edge_ids).reserve(minor_comm_size); + } + auto edge_partition_edgelist_edge_types = + edgelist_partitioned_edge_types + ? std::make_optional>>() + : std::nullopt; + if (edgelist_partitioned_edge_types) { + (*edge_partition_edgelist_edge_types).reserve(minor_comm_size); + } + + for (int i = 0; i < minor_comm_size; ++i) { // iterate over local edge partitions + edge_t edge_count{0}; + std::vector intra_partition_segment_sizes(major_comm_size, 0); + std::vector intra_segment_copy_output_displacements(major_comm_size * + edgelist_partitioned_srcs.size()); + for (int j = 0; j < major_comm_size /* # segments in the local minor range */; ++j) { + edge_t displacement{0}; + for (size_t k = 0; k < edgelist_partitioned_srcs.size() /* # input edge chunks */; ++k) { + auto segment_size = edgelist_partitioned_srcs[k][i * major_comm_size + j].size(); + edge_count += segment_size; + intra_partition_segment_sizes[j] += segment_size; + intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k] = + displacement; + displacement += segment_size; + } + } + std::vector intra_partition_segment_offsets(major_comm_size + 1, 0); + std::inclusive_scan(intra_partition_segment_sizes.begin(), + intra_partition_segment_sizes.end(), + intra_partition_segment_offsets.begin() + 1); + + rmm::device_uvector tmp_srcs(edge_count, handle.get_stream()); + for (int j = 0; j < major_comm_size; ++j) { + for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) { + auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j]; + thrust::copy( + handle.get_thrust_policy(), + input_buffer.begin(), + input_buffer.end(), + tmp_srcs.begin() + intra_partition_segment_offsets[j] + + intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k]); + input_buffer.resize(0, handle.get_stream()); + input_buffer.shrink_to_fit(handle.get_stream()); + } + } + edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); + + rmm::device_uvector tmp_dsts(edge_count, handle.get_stream()); + for (int j = 0; j < major_comm_size; ++j) { + for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { + auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j]; + thrust::copy( + handle.get_thrust_policy(), + input_buffer.begin(), + input_buffer.end(), + tmp_dsts.begin() + 
intra_partition_segment_offsets[j] + + intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); + input_buffer.resize(0, handle.get_stream()); + input_buffer.shrink_to_fit(handle.get_stream()); + } + } + edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); + + if (edge_partition_edgelist_weights) { + rmm::device_uvector tmp_weights(edge_count, handle.get_stream()); + for (int j = 0; j < major_comm_size; ++j) { + for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { + auto& input_buffer = (*edgelist_partitioned_weights)[k][i * major_comm_size + j]; + thrust::copy( + handle.get_thrust_policy(), + input_buffer.begin(), + input_buffer.end(), + tmp_weights.begin() + intra_partition_segment_offsets[j] + + intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); + input_buffer.resize(0, handle.get_stream()); + input_buffer.shrink_to_fit(handle.get_stream()); + } + } + (*edge_partition_edgelist_weights).push_back(std::move(tmp_weights)); + } + + if (edge_partition_edgelist_edge_ids) { + rmm::device_uvector tmp_edge_ids(edge_count, handle.get_stream()); + for (int j = 0; j < major_comm_size; ++j) { + for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { + auto& input_buffer = (*edgelist_partitioned_edge_ids)[k][i * major_comm_size + j]; + thrust::copy( + handle.get_thrust_policy(), + input_buffer.begin(), + input_buffer.end(), + tmp_edge_ids.begin() + intra_partition_segment_offsets[j] + + intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); + input_buffer.resize(0, handle.get_stream()); + input_buffer.shrink_to_fit(handle.get_stream()); + } + } + (*edge_partition_edgelist_edge_ids).push_back(std::move(tmp_edge_ids)); + } + + if (edge_partition_edgelist_edge_types) { + rmm::device_uvector tmp_edge_types(edge_count, handle.get_stream()); + for (int j = 0; j < major_comm_size; ++j) { + for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { + auto& input_buffer = (*edgelist_partitioned_edge_types)[k][i * major_comm_size + j]; + thrust::copy( + handle.get_thrust_policy(), + input_buffer.begin(), + input_buffer.end(), + tmp_edge_types.begin() + intra_partition_segment_offsets[j] + + intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); + input_buffer.resize(0, handle.get_stream()); + input_buffer.shrink_to_fit(handle.get_stream()); + } + } + (*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); + } + + edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets); + } + + return create_graph_from_partitioned_edgelist( + handle, + std::move(local_vertices), + std::move(edge_partition_edgelist_srcs), + std::move(edge_partition_edgelist_dsts), + std::move(edge_partition_edgelist_weights), + std::move(edge_partition_edgelist_edge_ids), + std::move(edge_partition_edgelist_edge_types), + edgelist_intra_partition_segment_offsets, + graph_properties, + renumber); +} + +template +std::enable_if_t< + !multi_gpu, + std::tuple< + cugraph::graph_t, + std::optional< + edge_property_t, weight_t>>, + std::optional< + edge_property_t, edge_id_t>>, + std::optional< + edge_property_t, edge_type_t>>, + std::optional>>> +create_graph_from_edgelist_impl( + raft::handle_t const& handle, + std::optional>&& vertices, + rmm::device_uvector&& edgelist_srcs, + rmm::device_uvector&& edgelist_dsts, + std::optional>&& edgelist_weights, + std::optional>&& edgelist_edge_ids, + std::optional>&& edgelist_edge_types, + graph_properties_t 
graph_properties, + bool renumber, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS( + !vertices || ((*vertices).size() < static_cast(std::numeric_limits::max())), + "Invalid input arguments: # unique vertex IDs should be smaller than " + "std::numeric_limits::Max()."); + CUGRAPH_EXPECTS(edgelist_srcs.size() == edgelist_dsts.size(), + "Invalid input arguments: edgelist_srcs.size() != edgelist_dsts.size()."); + CUGRAPH_EXPECTS(!edgelist_weights || (edgelist_srcs.size() == (*edgelist_weights).size()), + "Invalid input arguments: edgelist_srcs.size() != edgelist_weights.size()."); + CUGRAPH_EXPECTS(!edgelist_edge_ids || (edgelist_srcs.size() == (*edgelist_edge_ids).size()), + "Invalid input arguments: edgelist_srcs.size() != " + "(*edgelist_edge_ids).size()."); + CUGRAPH_EXPECTS(!edgelist_edge_types || (edgelist_srcs.size() == (*edgelist_edge_types).size()), + "Invalid input arguments: edgelist_srcs.size() != " + "(*edgelist_edge_types).size()."); + + if (do_expensive_check) { + expensive_check_edgelist(handle, + vertices, + store_transposed ? edgelist_dsts : edgelist_srcs, + store_transposed ? edgelist_srcs : edgelist_dsts, + renumber); + + if (graph_properties.is_symmetric) { + CUGRAPH_EXPECTS( + (check_symmetric( + handle, + raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), + raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()))), + "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " + "not symmetric."); + } + + if (!graph_properties.is_multigraph) { + CUGRAPH_EXPECTS( + check_no_parallel_edge( + handle, + raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), + raft::device_span(edgelist_dsts.data(), edgelist_dsts.size())), + "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "has parallel edges."); + } + } + + // 1. renumber + + auto renumber_map_labels = + renumber ? std::make_optional>(0, handle.get_stream()) + : std::nullopt; + renumber_meta_t meta{}; + if (renumber) { + std::tie(*renumber_map_labels, meta) = cugraph::renumber_edgelist( + handle, + std::move(vertices), + edgelist_srcs.data(), + edgelist_dsts.data(), + static_cast(edgelist_srcs.size()), + store_transposed); + } + + vertex_t num_vertices{}; + if (renumber) { + num_vertices = static_cast((*renumber_map_labels).size()); + } else { + if (vertices) { + num_vertices = (*vertices).size(); + } else { + num_vertices = 1 + cugraph::detail::compute_maximum_vertex_id( + handle.get_stream(), edgelist_srcs, edgelist_dsts); + } + } + + // 2. 
convert edge list (COO) to compressed sparse format (CSR or CSC) + + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + size_t element_size = sizeof(vertex_t) * 2; + if (edgelist_weights) { element_size += sizeof(weight_t); } + if (edgelist_edge_ids) { element_size += sizeof(edge_id_t); } + if (edgelist_edge_types) { element_size += sizeof(edge_type_t); } + auto constexpr mem_frugal_ratio = + 0.25; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the + // total_global_mem, switch to the memory frugal approach + auto mem_frugal_threshold = + static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); + + rmm::device_uvector offsets(size_t{0}, handle.get_stream()); + rmm::device_uvector indices(size_t{0}, handle.get_stream()); + std::optional> weights{std::nullopt}; std::optional> ids{std::nullopt}; std::optional> types{std::nullopt}; @@ -1098,7 +1560,7 @@ create_graph_from_edgelist_impl( } } - // create a graph and an edge_property_t object. + // 3. create a graph and an edge_property_t object. std::optional< edge_property_t, weight_t>> @@ -1133,7 +1595,7 @@ create_graph_from_edgelist_impl( std::move(buffers)); } - // graph_t constructor + // 4. graph_t constructor return std::make_tuple( cugraph::graph_t( @@ -1150,6 +1612,196 @@ create_graph_from_edgelist_impl( std::move(renumber_map_labels)); } +template +std::enable_if_t< + !multi_gpu, + std::tuple< + cugraph::graph_t, + std::optional< + edge_property_t, weight_t>>, + std::optional< + edge_property_t, edge_id_t>>, + std::optional< + edge_property_t, edge_type_t>>, + std::optional>>> +create_graph_from_edgelist_impl( + raft::handle_t const& handle, + std::optional>&& local_vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check) +{ + CUGRAPH_EXPECTS(edgelist_srcs.size() == edgelist_dsts.size(), + "Invalid input arguments: edgelist_srcs.size() != edgelist_dsts.size()."); + CUGRAPH_EXPECTS(!edgelist_weights || (edgelist_srcs.size() == (*edgelist_weights).size()), + "Invalid input arguments: edgelist_weights.has_value() is true and " + "edgelist_srcs.size() != (*edgelist_weights).size()."); + CUGRAPH_EXPECTS(!edgelist_edge_ids || (edgelist_srcs.size() == (*edgelist_edge_ids).size()), + "Invalid input arguments: edgelist_edge_ids.has_value() is true and " + "edgelist_srcs.size() != (*edgelist_edge_ids).size()."); + CUGRAPH_EXPECTS(!edgelist_edge_types || (edgelist_srcs.size() == (*edgelist_edge_types).size()), + "Invalid input arguments: edgelist_edge_types.has_value() is true, " + "edgelist_srcs.size() != (*edgelist_edge_types).size()."); + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + CUGRAPH_EXPECTS(edgelist_srcs[i].size() == edgelist_dsts[i].size(), + "Invalid input arguments: edgelist_srcs[i].size() != edgelist_dsts[i].size()."); + CUGRAPH_EXPECTS(!edgelist_weights || (edgelist_srcs[i].size() == (*edgelist_weights)[i].size()), + "Invalid input arguments: edgelist_weights.has_value() is true and " + "edgelist_srcs[i].size() != (*edgelist_weights)[i].size()."); + CUGRAPH_EXPECTS( + !edgelist_edge_ids || (edgelist_srcs[i].size() == (*edgelist_edge_ids)[i].size()), + "Invalid input arguments: edgelist_edge_ids.has_value() is true and " + "edgelist_srcs[i].size() != (*edgelist_edge_ids)[i].size()."); + CUGRAPH_EXPECTS( + !edgelist_edge_types || 
(edgelist_srcs[i].size() == (*edgelist_edge_types)[i].size()), + "Invalid input arguments: edgelist_edge_types.has_value() is true, " + "edgelist_srcs[i].size() != (*edgelist_edge_types)[i].size()."); + } + CUGRAPH_EXPECTS(renumber, + "Invalid input arguments: renumber should be true if multi_gpu is true."); + + std::vector chunk_edge_counts(edgelist_srcs.size()); + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + chunk_edge_counts[i] = edgelist_srcs[i].size(); + } + std::vector chunk_edge_displacements(chunk_edge_counts.size()); + std::exclusive_scan(chunk_edge_counts.begin(), + chunk_edge_counts.end(), + chunk_edge_displacements.begin(), + edge_t{0}); + auto aggregate_edge_count = chunk_edge_displacements.back() + chunk_edge_counts.back(); + + rmm::device_uvector aggregate_edgelist_srcs(aggregate_edge_count, handle.get_stream()); + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + edgelist_srcs[i].begin(), + edgelist_srcs[i].end(), + aggregate_edgelist_srcs.begin() + chunk_edge_displacements[i]); + edgelist_srcs[i].resize(0, handle.get_stream()); + edgelist_srcs[i].shrink_to_fit(handle.get_stream()); + } + edgelist_srcs.clear(); + + rmm::device_uvector aggregate_edgelist_dsts(aggregate_edge_count, handle.get_stream()); + for (size_t i = 0; i < edgelist_dsts.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + edgelist_dsts[i].begin(), + edgelist_dsts[i].end(), + aggregate_edgelist_dsts.begin() + chunk_edge_displacements[i]); + edgelist_dsts[i].resize(0, handle.get_stream()); + edgelist_dsts[i].shrink_to_fit(handle.get_stream()); + } + edgelist_dsts.clear(); + + auto aggregate_edgelist_weights = + edgelist_weights + ? std::make_optional>(aggregate_edge_count, handle.get_stream()) + : std::nullopt; + if (aggregate_edgelist_weights) { + for (size_t i = 0; i < (*edgelist_weights).size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + (*edgelist_weights)[i].begin(), + (*edgelist_weights)[i].end(), + (*aggregate_edgelist_weights).begin() + chunk_edge_displacements[i]); + (*edgelist_weights)[i].resize(0, handle.get_stream()); + (*edgelist_weights)[i].shrink_to_fit(handle.get_stream()); + } + (*edgelist_weights).clear(); + } + + auto aggregate_edgelist_edge_ids = edgelist_edge_ids + ? std::make_optional>( + aggregate_edge_count, handle.get_stream()) + : std::nullopt; + if (aggregate_edgelist_edge_ids) { + for (size_t i = 0; i < (*edgelist_edge_ids).size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + (*edgelist_edge_ids)[i].begin(), + (*edgelist_edge_ids)[i].end(), + (*aggregate_edgelist_edge_ids).begin() + chunk_edge_displacements[i]); + (*edgelist_edge_ids)[i].resize(0, handle.get_stream()); + (*edgelist_edge_ids)[i].shrink_to_fit(handle.get_stream()); + } + (*edgelist_edge_ids).clear(); + } + + auto aggregate_edgelist_edge_types = edgelist_edge_types + ? std::make_optional>( + aggregate_edge_count, handle.get_stream()) + : std::nullopt; + if (aggregate_edgelist_edge_types) { + for (size_t i = 0; i < (*edgelist_edge_types).size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + (*edgelist_edge_types)[i].begin(), + (*edgelist_edge_types)[i].end(), + (*aggregate_edgelist_edge_types).begin() + chunk_edge_displacements[i]); + (*edgelist_edge_types)[i].resize(0, handle.get_stream()); + (*edgelist_edge_types)[i].shrink_to_fit(handle.get_stream()); + } + (*edgelist_edge_types).clear(); + } + + if (do_expensive_check) { + expensive_check_edgelist( + handle, + local_vertices, + store_transposed ? 
aggregate_edgelist_dsts : aggregate_edgelist_srcs, + store_transposed ? aggregate_edgelist_srcs : aggregate_edgelist_dsts, + renumber); + + if (graph_properties.is_symmetric) { + CUGRAPH_EXPECTS( + (check_symmetric( + handle, + raft::device_span(aggregate_edgelist_srcs.data(), + aggregate_edgelist_srcs.size()), + raft::device_span(aggregate_edgelist_dsts.data(), + aggregate_edgelist_dsts.size()))), + "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " + "not symmetric."); + } + + if (!graph_properties.is_multigraph) { + CUGRAPH_EXPECTS( + check_no_parallel_edge(handle, + raft::device_span(aggregate_edgelist_srcs.data(), + aggregate_edgelist_srcs.size()), + raft::device_span(aggregate_edgelist_dsts.data(), + aggregate_edgelist_dsts.size())), + "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "has parallel edges."); + } + } + + return create_graph_from_edgelist_impl(handle, + std::move(local_vertices), + std::move(aggregate_edgelist_srcs), + std::move(aggregate_edgelist_dsts), + std::move(aggregate_edgelist_weights), + std::move(aggregate_edgelist_edge_ids), + std::move(aggregate_edgelist_edge_types), + graph_properties, + renumber, + do_expensive_check); +} + } // namespace template +std::tuple< + cugraph::graph_t, + std::optional< + edge_property_t, weight_t>>, + std::optional< + edge_property_t, edge_id_t>>, + std::optional< + edge_property_t, edge_type_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check) +{ + return create_graph_from_edgelist_impl(handle, + std::move(vertices), + std::move(edgelist_srcs), + std::move(edgelist_dsts), + std::move(edgelist_weights), + std::move(edgelist_edge_ids), + std::move(edgelist_edge_types), + graph_properties, + renumber, + do_expensive_check); +} + } // namespace cugraph diff --git a/cpp/src/structure/create_graph_from_edgelist_mg_v32_e32.cu b/cpp/src/structure/create_graph_from_edgelist_mg_v32_e32.cu index 62eb4ccbd96..d0e41734365 100644 --- a/cpp/src/structure/create_graph_from_edgelist_mg_v32_e32.cu +++ b/cpp/src/structure/create_graph_from_edgelist_mg_v32_e32.cu @@ -30,7 +30,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -51,7 +51,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -72,7 +72,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -93,7 +93,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, 
std::optional>&& edgelist_weights, @@ -103,4 +103,88 @@ create_graph_from_edgelist, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/structure/create_graph_from_edgelist_mg_v32_e64.cu b/cpp/src/structure/create_graph_from_edgelist_mg_v32_e64.cu index efbe8058c41..380d3474292 100644 --- a/cpp/src/structure/create_graph_from_edgelist_mg_v32_e64.cu +++ b/cpp/src/structure/create_graph_from_edgelist_mg_v32_e64.cu @@ -30,7 +30,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -51,7 +51,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -72,7 +72,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, 
@@ -93,7 +93,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -103,4 +103,87 @@ create_graph_from_edgelist, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); } // namespace cugraph diff --git a/cpp/src/structure/create_graph_from_edgelist_mg_v64_e64.cu b/cpp/src/structure/create_graph_from_edgelist_mg_v64_e64.cu index cc62166a7af..cbbaf025856 100644 --- a/cpp/src/structure/create_graph_from_edgelist_mg_v64_e64.cu +++ b/cpp/src/structure/create_graph_from_edgelist_mg_v64_e64.cu @@ -30,7 +30,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -51,7 +51,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -72,7 +72,7 @@ template std::tuple< 
std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -93,7 +93,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -103,4 +103,88 @@ create_graph_from_edgelist, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/structure/create_graph_from_edgelist_sg_v32_e32.cu b/cpp/src/structure/create_graph_from_edgelist_sg_v32_e32.cu index 34c91494c8b..28dc3befd8d 100644 --- a/cpp/src/structure/create_graph_from_edgelist_sg_v32_e32.cu +++ b/cpp/src/structure/create_graph_from_edgelist_sg_v32_e32.cu @@ -30,7 +30,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -51,7 +51,7 @@ template std::tuple< std::optional>> 
create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -72,7 +72,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -93,7 +93,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -103,4 +103,88 @@ create_graph_from_edgelist, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/structure/create_graph_from_edgelist_sg_v32_e64.cu b/cpp/src/structure/create_graph_from_edgelist_sg_v32_e64.cu index d09e50e0f81..71bd74c1a44 100644 --- a/cpp/src/structure/create_graph_from_edgelist_sg_v32_e64.cu +++ b/cpp/src/structure/create_graph_from_edgelist_sg_v32_e64.cu @@ -30,7 +30,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( 
raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -51,7 +51,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -72,7 +72,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -93,7 +93,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -103,4 +103,88 @@ create_graph_from_edgelist, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/structure/create_graph_from_edgelist_sg_v64_e64.cu b/cpp/src/structure/create_graph_from_edgelist_sg_v64_e64.cu 
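For reference, a minimal sketch of how the new chunked overload can be invoked on a single GPU, assuming the chunks were produced elsewhere (for example one chunk per input file). The names srcs_from_file0/1 and dsts_from_file0/1 are hypothetical placeholders, and the graph is built without weights, edge IDs, or edge types:

// Hedged usage sketch only; assumes an initialized raft::handle_t named `handle`
// and pre-loaded rmm::device_uvector<int32_t> chunks (hypothetical names).
std::vector<rmm::device_uvector<int32_t>> src_chunks{};
std::vector<rmm::device_uvector<int32_t>> dst_chunks{};
src_chunks.push_back(std::move(srcs_from_file0));
src_chunks.push_back(std::move(srcs_from_file1));
dst_chunks.push_back(std::move(dsts_from_file0));
dst_chunks.push_back(std::move(dsts_from_file1));

cugraph::graph_t<int32_t, int32_t, false, false> graph(handle);
std::optional<rmm::device_uvector<int32_t>> renumber_map{std::nullopt};
std::tie(graph, std::ignore, std::ignore, std::ignore, renumber_map) =
  cugraph::create_graph_from_edgelist<int32_t, int32_t, float, int32_t, int32_t, false, false>(
    handle,
    std::nullopt,  // no explicit vertex list
    std::move(src_chunks),
    std::move(dst_chunks),
    std::nullopt,  // no weights
    std::nullopt,  // no edge IDs
    std::nullopt,  // no edge types
    cugraph::graph_properties_t{false /* is_symmetric */, true /* is_multigraph */},
    true /* renumber */);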
index 099c6801f0d..7db38452c72 100644 --- a/cpp/src/structure/create_graph_from_edgelist_sg_v64_e64.cu +++ b/cpp/src/structure/create_graph_from_edgelist_sg_v64_e64.cu @@ -30,7 +30,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -51,7 +51,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -72,7 +72,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -93,7 +93,7 @@ template std::tuple< std::optional>> create_graph_from_edgelist( raft::handle_t const& handle, - std::optional>&& vertex_span, + std::optional>&& vertices, rmm::device_uvector&& edgelist_srcs, rmm::device_uvector&& edgelist_dsts, std::optional>&& edgelist_weights, @@ -103,4 +103,88 @@ create_graph_from_edgelist, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, float>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + +template std::tuple< + cugraph::graph_t, + std::optional< + cugraph::edge_property_t, double>>, + std::optional< + cugraph::edge_property_t, int64_t>>, + std::optional< + cugraph::edge_property_t, int32_t>>, + std::optional>> +create_graph_from_edgelist( + raft::handle_t const& handle, + std::optional>&& vertices, + std::vector>&& edgelist_srcs, + std::vector>&& edgelist_dsts, + std::optional>>&& edgelist_weights, + std::optional>>&& edgelist_edge_ids, + std::optional>>&& 
edgelist_edge_types, + graph_properties_t graph_properties, + bool renumber, + bool do_expensive_check); + } // namespace cugraph From 55513ae35e6705aabfc69a3f4dc8077b660f3ffd Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 15 Jul 2024 14:24:31 -0700 Subject: [PATCH 002/126] update R-mat graph generators to generate edge list in multiple chunks --- cpp/tests/utilities/test_graphs.hpp | 272 +++++++++++++++++----------- 1 file changed, 167 insertions(+), 105 deletions(-) diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 0f3224bfc52..4059f2003e4 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -131,9 +131,9 @@ class File_Usecase : public detail::TranslateGraph_Usecase { } template - std::tuple, - rmm::device_uvector, - std::optional>, + std::tuple>, + std::vector>, + std::optional>>, std::optional>, bool> construct_edgelist(raft::handle_t const& handle, @@ -159,8 +159,20 @@ class File_Usecase : public detail::TranslateGraph_Usecase { translate(handle, srcs, dsts); if (vertices) { translate(handle, *vertices); } - return std::make_tuple( - std::move(srcs), std::move(dsts), std::move(weights), std::move(vertices), is_symmetric); + std::vector> edge_src_chunks{}; + edge_src_chunks.push_back(std::move(srcs)); + std::vector> edge_dst_chunks{}; + edge_dst_chunks.push_back(std::move(dsts)); + std::optional>> edge_weight_chunks{std::nullopt}; + if (weights) { + edge_weight_chunks = std::vector>{}; + (*edge_weight_chunks).push_back(std::move(*weights)); + } + return std::make_tuple(std::move(edge_src_chunks), + std::move(edge_dst_chunks), + std::move(edge_weight_chunks), + std::move(vertices), + is_symmetric); } private: @@ -193,9 +205,9 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } template - std::tuple, - rmm::device_uvector, - std::optional>, + std::tuple>, + std::vector>, + std::optional>>, std::optional>, bool> construct_edgelist(raft::handle_t const& handle, @@ -213,7 +225,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { // cuMemAddressReserve // (https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management), we // can reduce the temporary memory requirement to (1 / num_partitions) * (original data size) - size_t constexpr num_partitions_per_gpu = 2; + size_t constexpr num_partitions_per_gpu = 4; size_t num_partitions = num_partitions_per_gpu * static_cast(multi_gpu ? handle.get_comms().get_size() : 1); @@ -253,14 +265,14 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { raft::random::RngState rng_state{ base_seed_ + static_cast(multi_gpu ? handle.get_comms().get_rank() : 0)}; - std::vector> src_partitions{}; - std::vector> dst_partitions{}; - auto weight_partitions = test_weighted - ? std::make_optional>>() - : std::nullopt; - src_partitions.reserve(num_partitions_per_gpu); - dst_partitions.reserve(num_partitions_per_gpu); - if (weight_partitions) { (*weight_partitions).reserve(num_partitions_per_gpu); } + std::vector> edge_src_chunks{}; + std::vector> edge_dst_chunks{}; + auto edge_weight_chunks = test_weighted + ? 
std::make_optional>>() + : std::nullopt; + edge_src_chunks.reserve(num_partitions_per_gpu); + edge_dst_chunks.reserve(num_partitions_per_gpu); + if (edge_weight_chunks) { (*edge_weight_chunks).reserve(num_partitions_per_gpu); } for (size_t i = 0; i < num_partitions_per_gpu; ++i) { auto [tmp_src_v, tmp_dst_v] = cugraph::generate_rmat_edgelist(handle, @@ -277,7 +289,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { } std::optional> tmp_weights_v{std::nullopt}; - if (weight_partitions) { + if (edge_weight_chunks) { tmp_weights_v = std::make_optional>(tmp_src_v.size(), handle.get_stream()); @@ -315,27 +327,9 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { std::nullopt); } - src_partitions.push_back(std::move(tmp_src_v)); - dst_partitions.push_back(std::move(tmp_dst_v)); - if (weight_partitions) { (*weight_partitions).push_back(std::move(*tmp_weights_v)); } - } - - size_t tot_edge_counts{0}; - for (size_t i = 0; i < src_partitions.size(); ++i) { - tot_edge_counts += src_partitions[i].size(); - } - - // detail::concatenate uses a host buffer to store input vectors if initial device memory - // allocation for the return vector fails. This does not improve peak memory usage and is not - // helpful with the rmm_mode = cuda. However, if rmm_mode = pool, memory allocation can fail - // even when the aggregate free memory size far exceeds the requested size. This heuristic is - // helpful in this case. - - auto src_v = detail::concatenate(handle, std::move(src_partitions)); - auto dst_v = detail::concatenate(handle, std::move(dst_partitions)); - std::optional> weight_v{std::nullopt}; - if (weight_partitions) { - weight_v = detail::concatenate(handle, std::move(*weight_partitions)); + edge_src_chunks.push_back(std::move(tmp_src_v)); + edge_dst_chunks.push_back(std::move(tmp_dst_v)); + if (edge_weight_chunks) { (*edge_weight_chunks).push_back(std::move(*tmp_weights_v)); } } // 3. generate vertices @@ -364,8 +358,11 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { handle, std::move(vertex_v)); } - return std::make_tuple( - std::move(src_v), std::move(dst_v), std::move(weight_v), std::move(vertex_v), undirected_); + return std::make_tuple(std::move(edge_src_chunks), + std::move(edge_dst_chunks), + std::move(edge_weight_chunks), + std::move(vertex_v), + undirected_); } void set_scale(size_t scale) { scale_ = scale; } @@ -388,17 +385,15 @@ class PathGraph_Usecase { public: PathGraph_Usecase() = delete; - PathGraph_Usecase(std::vector> parms, - bool weighted = false, - bool scramble = false) - : parms_(parms), weighted_(weighted) + PathGraph_Usecase(std::vector> parms, bool scramble = false) + : parms_(parms) { } template - std::tuple, - rmm::device_uvector, - std::optional>, + std::tuple>, + std::vector>, + std::optional>>, std::optional>, bool> construct_edgelist(raft::handle_t const& handle, @@ -415,21 +410,44 @@ class PathGraph_Usecase { static_cast(std::get<1>(p))); }); - auto [src_v, dst_v] = cugraph::generate_path_graph_edgelist(handle, converted_parms); - std::tie(src_v, dst_v, std::ignore) = + auto [srcs, dsts] = cugraph::generate_path_graph_edgelist(handle, converted_parms); + + raft::random::RngState rng_state{ + base_seed_ + static_cast(multi_gpu ? 
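The generator-side pattern used here is to emit each partition's edges as an independent chunk instead of concatenating them, so the temporary memory overhead stays at roughly one chunk rather than a second full copy of the edge list. A minimal sketch, with generate_chunk standing in for the real per-partition generator (e.g. cugraph::generate_rmat_edgelist) and an initialized raft::handle_t named handle assumed:

// Illustration only: per-partition generation into independent chunks.
size_t constexpr num_partitions_per_gpu = 4;
std::vector<rmm::device_uvector<int32_t>> edge_src_chunks{};
std::vector<rmm::device_uvector<int32_t>> edge_dst_chunks{};
edge_src_chunks.reserve(num_partitions_per_gpu);
edge_dst_chunks.reserve(num_partitions_per_gpu);
for (size_t i = 0; i < num_partitions_per_gpu; ++i) {
  auto [tmp_srcs, tmp_dsts] = generate_chunk(handle, i);  // hypothetical helper
  edge_src_chunks.push_back(std::move(tmp_srcs));
  edge_dst_chunks.push_back(std::move(tmp_dsts));
}
// The chunk vectors are returned as-is; no concatenation step is required anymore.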
handle.get_comms().get_rank() : 0)}; + + std::optional> weights{std::nullopt}; + if (test_weighted) { + weights = std::make_optional>(srcs.size(), handle.get_stream()); + + cugraph::detail::uniform_random_fill(handle.get_stream(), + weights->data(), + weights->size(), + weight_t{0.0}, + weight_t{1.0}, + rng_state); + } + + std::tie(srcs, dsts, weights) = cugraph::symmetrize_edgelist_from_triangular( - handle, std::move(src_v), std::move(dst_v), std::nullopt); + handle, std::move(srcs), std::move(dsts), std::move(weights)); rmm::device_uvector d_vertices(num_vertices_, handle.get_stream()); cugraph::detail::sequence_fill( handle.get_stream(), d_vertices.data(), num_vertices_, vertex_t{0}); handle.sync_stream(); - return std::make_tuple(std::move(src_v), - std::move(dst_v), - test_weighted ? std::make_optional>( - src_v.size(), handle.get_stream()) - : std::nullopt, + std::vector> edge_src_chunks{}; + edge_src_chunks.push_back(std::move(srcs)); + std::vector> edge_dst_chunks{}; + edge_dst_chunks.push_back(std::move(dsts)); + std::optional>> edge_weight_chunks{std::nullopt}; + if (weights) { + edge_weight_chunks = std::vector>{}; + (*edge_weight_chunks).push_back(std::move(*weights)); + } + return std::make_tuple(std::move(edge_src_chunks), + std::move(edge_dst_chunks), + std::move(edge_weight_chunks), std::move(d_vertices), symmetric); } @@ -437,7 +455,7 @@ class PathGraph_Usecase { private: std::vector> parms_{}; size_t num_vertices_{0}; - bool weighted_{false}; + uint64_t base_seed_{}; }; class Mesh2DGraph_Usecase { @@ -450,9 +468,9 @@ class Mesh2DGraph_Usecase { } template - std::tuple, - rmm::device_uvector, - std::optional>, + std::tuple>, + std::vector>, + std::optional>>, std::optional>, bool> construct_edgelist(raft::handle_t const& handle, @@ -476,9 +494,9 @@ class Mesh3DGraph_Usecase { } template - std::tuple, - rmm::device_uvector, - std::optional>, + std::tuple>, + std::vector>, + std::optional>>, std::optional>, bool> construct_edgelist(raft::handle_t const& handle, @@ -501,9 +519,9 @@ class CompleteGraph_Usecase { } template - std::tuple, - rmm::device_uvector, - std::optional>, + std::tuple>, + std::vector>, + std::optional>>, std::optional>, bool> construct_edgelist(raft::handle_t const& handle, @@ -573,10 +591,10 @@ class CombinedGenerator_Usecase { CombinedGenerator_Usecase(generator_tuple_t const& tuple) : generator_tuple_(tuple) {} template - std::tuple, - rmm::device_uvector, - std::optional>, - rmm::device_uvector, + std::tuple>, + std::vector>, + std::optional>>, + std::optional>, vertex_t, bool> construct_edgelist(raft::handle_t const& handle, @@ -593,6 +611,13 @@ class CombinedGenerator_Usecase { // Need to combine elements. 
We have a vector of tuples, we want to combine // the elements of each component of the tuple CUGRAPH_FAIL("not implemented"); + + std::vector> edge_src_chunks{}; + edge_src_chunks.push_back(rmm::device_uvector(0, handle.get_stream())); + std::vector> edge_dst_chunks{}; + edge_dst_chunks.push_back(rmm::device_uvector(0, handle.get_stream())); + return std::make_tuple( + std::move(edge_src_chunks), std::move(edge_dst_chunks), std::nullopt, std::nullopt, false); } private: @@ -617,32 +642,49 @@ construct_graph(raft::handle_t const& handle, bool drop_self_loops = false, bool drop_multi_edges = false) { - auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = + auto [edge_src_chunks, edge_dst_chunks, edge_weight_chunks, d_vertices_v, is_symmetric] = input_usecase.template construct_edgelist( handle, test_weighted, store_transposed, multi_gpu); - CUGRAPH_EXPECTS(d_src_v.size() <= static_cast(std::numeric_limits::max()), + size_t num_edges{0}; + for (size_t i = 0; i < edge_src_chunks.size(); ++i) { + num_edges += edge_src_chunks[i].size(); + } + CUGRAPH_EXPECTS(num_edges <= static_cast(std::numeric_limits::max()), "Invalid template parameter: edge_t overflow."); if (drop_self_loops) { - std::tie(d_src_v, d_dst_v, d_weights_v, std::ignore, std::ignore) = - cugraph::remove_self_loops(handle, - std::move(d_src_v), - std::move(d_dst_v), - std::move(d_weights_v), - std::nullopt, - std::nullopt); + for (size_t i = 0; i < edge_src_chunks.size(); ++i) { + std::optional> tmp_weights{std::nullopt}; + std::tie(edge_src_chunks[i], edge_dst_chunks[i], tmp_weights, std::ignore, std::ignore) = + cugraph::remove_self_loops( + handle, + std::move(edge_src_chunks[i]), + std::move(edge_dst_chunks[i]), + edge_weight_chunks + ? std::make_optional>(std::move((*edge_weight_chunks)[i])) + : std::nullopt, + std::nullopt, + std::nullopt); + if (tmp_weights) { (*edge_weight_chunks)[i] = std::move(*tmp_weights); } + } } if (drop_multi_edges) { - std::tie(d_src_v, d_dst_v, d_weights_v, std::ignore, std::ignore) = - cugraph::remove_multi_edges( - handle, - std::move(d_src_v), - std::move(d_dst_v), - std::move(d_weights_v), - std::nullopt, - std::nullopt, - is_symmetric ? true /* keep minimum weight edges to maintain symmetry */ : false); + for (size_t i = 0; i < edge_src_chunks.size(); ++i) { + std::optional> tmp_weights{std::nullopt}; + std::tie(edge_src_chunks[i], edge_dst_chunks[i], tmp_weights, std::ignore, std::ignore) = + cugraph::remove_multi_edges( + handle, + std::move(edge_src_chunks[i]), + std::move(edge_dst_chunks[i]), + edge_weight_chunks + ? std::make_optional>(std::move((*edge_weight_chunks)[i])) + : std::nullopt, + std::nullopt, + std::nullopt, + is_symmetric ? true /* keep minimum weight edges to maintain symmetry */ : false); + if (tmp_weights) { (*edge_weight_chunks)[i] = std::move(*tmp_weights); } + } } graph_t graph(handle); @@ -650,23 +692,43 @@ construct_graph(raft::handle_t const& handle, edge_property_t, weight_t>> edge_weights{std::nullopt}; std::optional> renumber_map{std::nullopt}; - std::tie(graph, edge_weights, std::ignore, std::ignore, renumber_map) = - cugraph::create_graph_from_edgelist( - handle, - std::move(d_vertices_v), - std::move(d_src_v), - std::move(d_dst_v), - std::move(d_weights_v), - std::nullopt, - std::nullopt, - cugraph::graph_properties_t{is_symmetric, drop_multi_edges ? 
false : true}, - renumber); + if (edge_src_chunks.size() == 1) { + std::tie(graph, edge_weights, std::ignore, std::ignore, renumber_map) = + cugraph::create_graph_from_edgelist( + handle, + std::move(d_vertices_v), + std::move(edge_src_chunks[0]), + std::move(edge_dst_chunks[0]), + edge_weight_chunks ? std::make_optional(std::move((*edge_weight_chunks)[0])) : std::nullopt, + std::nullopt, + std::nullopt, + cugraph::graph_properties_t{is_symmetric, drop_multi_edges ? false : true}, + renumber); + } else { + std::tie(graph, edge_weights, std::ignore, std::ignore, renumber_map) = + cugraph::create_graph_from_edgelist( + handle, + std::move(d_vertices_v), + std::move(edge_src_chunks), + std::move(edge_dst_chunks), + std::move(edge_weight_chunks), + std::nullopt, + std::nullopt, + cugraph::graph_properties_t{is_symmetric, drop_multi_edges ? false : true}, + renumber); + } return std::make_tuple(std::move(graph), std::move(edge_weights), std::move(renumber_map)); } From a9dfb92fbd9cb2ada16b84906728685c50a5f89d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 15 Jul 2024 16:00:49 -0700 Subject: [PATCH 003/126] fix build error --- cpp/tests/structure/renumbering_test.cpp | 26 +++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/cpp/tests/structure/renumbering_test.cpp b/cpp/tests/structure/renumbering_test.cpp index ffb51943660..0d9d8dfd5d7 100644 --- a/cpp/tests/structure/renumbering_test.cpp +++ b/cpp/tests/structure/renumbering_test.cpp @@ -69,10 +69,29 @@ class Tests_Renumbering rmm::device_uvector src_v(0, handle.get_stream()); rmm::device_uvector dst_v(0, handle.get_stream()); - rmm::device_uvector renumber_map_labels_v(0, handle.get_stream()); - std::tie(src_v, dst_v, std::ignore, std::ignore, std::ignore) = - input_usecase.template construct_edgelist(handle, false, false, false); + { + std::vector> src_chunks{}; + std::vector> dst_chunks{}; + std::tie(src_chunks, dst_chunks, std::ignore, std::ignore, std::ignore) = + input_usecase.template construct_edgelist(handle, false, false, false); + + edge_t edge_count{0}; + for (size_t i = 0; i < src_chunks.size(); ++i) { + edge_count += static_cast(src_chunks[i].size()); + } + src_v.resize(edge_count, handle.get_stream()); + dst_v.resize(src_v.size(), handle.get_stream()); + + edge_t offset{0}; + for (size_t i = 0; i < src_chunks.size(); ++i) { + raft::copy( + src_v.data() + offset, src_chunks[i].data(), src_chunks[i].size(), handle.get_stream()); + raft::copy( + dst_v.data() + offset, dst_chunks[i].data(), dst_chunks[i].size(), handle.get_stream()); + offset += static_cast(src_chunks[i].size()); + } + } if (renumbering_usecase.check_correctness) { h_original_src_v = cugraph::test::to_host(handle, src_v); @@ -84,6 +103,7 @@ class Tests_Renumbering hr_timer.start("Renumbering"); } + rmm::device_uvector renumber_map_labels_v(0, handle.get_stream()); std::tie(renumber_map_labels_v, std::ignore) = cugraph::renumber_edgelist( handle, std::nullopt, src_v.begin(), dst_v.begin(), src_v.size(), false); From e7b33ca701eaefe003e4233f304e29087401f32d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 15 Jul 2024 16:01:34 -0700 Subject: [PATCH 004/126] delete unused functions --- cpp/tests/utilities/test_graphs.hpp | 47 ----------------------------- 1 file changed, 47 deletions(-) diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 4059f2003e4..9791b5448cf 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -35,53 +35,6 @@ namespace 
test { namespace detail { -template -std::optional> try_allocate(raft::handle_t const& handle, size_t size) -{ - try { - return std::make_optional>(size, handle.get_stream()); - } catch (std::exception const& e) { - return std::nullopt; - } -} - -// use host memory as temporary buffer if memroy allocation on device fails -template -rmm::device_uvector concatenate(raft::handle_t const& handle, - std::vector>&& inputs) -{ - size_t tot_count{0}; - for (size_t i = 0; i < inputs.size(); ++i) { - tot_count += inputs[i].size(); - } - - auto output = try_allocate(handle, tot_count); - if (output) { - size_t offset{0}; - for (size_t i = 0; i < inputs.size(); ++i) { - raft::copy( - (*output).data() + offset, inputs[i].data(), inputs[i].size(), handle.get_stream()); - offset += inputs[i].size(); - } - inputs.clear(); - inputs.shrink_to_fit(); - } else { - std::vector h_buffer(tot_count); - size_t offset{0}; - for (size_t i = 0; i < inputs.size(); ++i) { - raft::update_host( - h_buffer.data() + offset, inputs[i].data(), inputs[i].size(), handle.get_stream()); - offset += inputs[i].size(); - } - inputs.clear(); - inputs.shrink_to_fit(); - output = rmm::device_uvector(tot_count, handle.get_stream()); - raft::update_device((*output).data(), h_buffer.data(), h_buffer.size(), handle.get_stream()); - } - - return std::move(*output); -} - class TranslateGraph_Usecase { public: TranslateGraph_Usecase() = delete; From 27ea550a0df320b8483572c27e88bd9005977c50 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 16 Jul 2024 16:04:57 -0700 Subject: [PATCH 005/126] fix build errors --- cpp/tests/mtmg/multi_node_threaded_test.cu | 19 +++++- cpp/tests/mtmg/threaded_test.cu | 19 +++++- cpp/tests/mtmg/threaded_test_jaccard.cu | 19 +++++- cpp/tests/mtmg/threaded_test_louvain.cu | 19 +++++- cpp/tests/structure/renumbering_test.cpp | 18 +----- cpp/tests/utilities/test_graphs.hpp | 68 ++++++++++++++++++++++ 6 files changed, 135 insertions(+), 27 deletions(-) diff --git a/cpp/tests/mtmg/multi_node_threaded_test.cu b/cpp/tests/mtmg/multi_node_threaded_test.cu index 24852562b86..06ccd4a7fa1 100644 --- a/cpp/tests/mtmg/multi_node_threaded_test.cu +++ b/cpp/tests/mtmg/multi_node_threaded_test.cu @@ -150,9 +150,22 @@ class Tests_Multithreaded instance_manager->reset_threads(); // Load SG edge list - auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = - input_usecase.template construct_edgelist( - handle, multithreaded_usecase.test_weighted, false, false); + rmm::device_uvector d_src_v(0, handle.get_stream()); + rmm::device_uvector d_dst_v(0, handle.get_stream()); + std::optional> d_weights_v{std::nullopt}; + std::optional> d_vertices_v{std::nullopt}; + bool is_symmetric{}; + { + std::vector> src_chunks{}; + std::vector> dst_chunks{}; + std::optional>> weight_chunks{std::nullopt}; + std::tie(src_chunks, dst_chunks, weight_chunks, d_vertices_v, is_symmetric) = + input_usecase.template construct_edgelist( + handle, multithreaded_usecase.test_weighted, false, false); + + std::tie(d_src_v, d_dst_v, d_weights_v) = cugraph::test::detail::concatenate_edge_chunks( + handle, std::move(src_chunks), std::move(dst_chunks), std::move(weight_chunks)); + } auto h_src_v = cugraph::test::to_host(handle, d_src_v); auto h_dst_v = cugraph::test::to_host(handle, d_dst_v); diff --git a/cpp/tests/mtmg/threaded_test.cu b/cpp/tests/mtmg/threaded_test.cu index df5a9e079df..a288ef63da9 100644 --- a/cpp/tests/mtmg/threaded_test.cu +++ b/cpp/tests/mtmg/threaded_test.cu @@ -151,9 +151,22 @@ class Tests_Multithreaded 
instance_manager->reset_threads(); // Load SG edge list - auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = - input_usecase.template construct_edgelist( - handle, multithreaded_usecase.test_weighted, false, false); + rmm::device_uvector d_src_v(0, handle.get_stream()); + rmm::device_uvector d_dst_v(0, handle.get_stream()); + std::optional> d_weights_v{std::nullopt}; + std::optional> d_vertices_v{std::nullopt}; + bool is_symmetric{}; + { + std::vector> src_chunks{}; + std::vector> dst_chunks{}; + std::optional>> weight_chunks{std::nullopt}; + std::tie(src_chunks, dst_chunks, weight_chunks, d_vertices_v, is_symmetric) = + input_usecase.template construct_edgelist( + handle, multithreaded_usecase.test_weighted, false, false); + + std::tie(d_src_v, d_dst_v, d_weights_v) = cugraph::test::detail::concatenate_edge_chunks( + handle, std::move(src_chunks), std::move(dst_chunks), std::move(weight_chunks)); + } rmm::device_uvector d_unique_vertices(2 * d_src_v.size(), handle.get_stream()); thrust::copy( diff --git a/cpp/tests/mtmg/threaded_test_jaccard.cu b/cpp/tests/mtmg/threaded_test_jaccard.cu index 0f531796cff..bed2f193130 100644 --- a/cpp/tests/mtmg/threaded_test_jaccard.cu +++ b/cpp/tests/mtmg/threaded_test_jaccard.cu @@ -144,9 +144,22 @@ class Tests_Multithreaded instance_manager->reset_threads(); // Load SG edge list - auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = - input_usecase.template construct_edgelist( - handle, test_weighted, store_transposed, false); + rmm::device_uvector d_src_v(0, handle.get_stream()); + rmm::device_uvector d_dst_v(0, handle.get_stream()); + std::optional> d_weights_v{std::nullopt}; + std::optional> d_vertices_v{std::nullopt}; + bool is_symmetric{}; + { + std::vector> src_chunks{}; + std::vector> dst_chunks{}; + std::optional>> weight_chunks{std::nullopt}; + std::tie(src_chunks, dst_chunks, weight_chunks, d_vertices_v, is_symmetric) = + input_usecase.template construct_edgelist( + handle, test_weighted, store_transposed, false); + + std::tie(d_src_v, d_dst_v, d_weights_v) = cugraph::test::detail::concatenate_edge_chunks( + handle, std::move(src_chunks), std::move(dst_chunks), std::move(weight_chunks)); + } rmm::device_uvector d_unique_vertices(2 * d_src_v.size(), handle.get_stream()); thrust::copy( diff --git a/cpp/tests/mtmg/threaded_test_louvain.cu b/cpp/tests/mtmg/threaded_test_louvain.cu index b9c8f621ab8..a2ec9244b38 100644 --- a/cpp/tests/mtmg/threaded_test_louvain.cu +++ b/cpp/tests/mtmg/threaded_test_louvain.cu @@ -151,9 +151,22 @@ class Tests_Multithreaded instance_manager->reset_threads(); // Load SG edge list - auto [d_src_v, d_dst_v, d_weights_v, d_vertices_v, is_symmetric] = - input_usecase.template construct_edgelist( - handle, multithreaded_usecase.test_weighted, false, false); + rmm::device_uvector d_src_v(0, handle.get_stream()); + rmm::device_uvector d_dst_v(0, handle.get_stream()); + std::optional> d_weights_v{std::nullopt}; + std::optional> d_vertices_v{std::nullopt}; + bool is_symmetric{}; + { + std::vector> src_chunks{}; + std::vector> dst_chunks{}; + std::optional>> weight_chunks{std::nullopt}; + std::tie(src_chunks, dst_chunks, weight_chunks, d_vertices_v, is_symmetric) = + input_usecase.template construct_edgelist( + handle, multithreaded_usecase.test_weighted, false, false); + + std::tie(d_src_v, d_dst_v, d_weights_v) = cugraph::test::detail::concatenate_edge_chunks( + handle, std::move(src_chunks), std::move(dst_chunks), std::move(weight_chunks)); + } rmm::device_uvector d_unique_vertices(2 * 
d_src_v.size(), handle.get_stream()); thrust::copy( diff --git a/cpp/tests/structure/renumbering_test.cpp b/cpp/tests/structure/renumbering_test.cpp index 0d9d8dfd5d7..d5829be6fe3 100644 --- a/cpp/tests/structure/renumbering_test.cpp +++ b/cpp/tests/structure/renumbering_test.cpp @@ -76,21 +76,9 @@ class Tests_Renumbering std::tie(src_chunks, dst_chunks, std::ignore, std::ignore, std::ignore) = input_usecase.template construct_edgelist(handle, false, false, false); - edge_t edge_count{0}; - for (size_t i = 0; i < src_chunks.size(); ++i) { - edge_count += static_cast(src_chunks[i].size()); - } - src_v.resize(edge_count, handle.get_stream()); - dst_v.resize(src_v.size(), handle.get_stream()); - - edge_t offset{0}; - for (size_t i = 0; i < src_chunks.size(); ++i) { - raft::copy( - src_v.data() + offset, src_chunks[i].data(), src_chunks[i].size(), handle.get_stream()); - raft::copy( - dst_v.data() + offset, dst_chunks[i].data(), dst_chunks[i].size(), handle.get_stream()); - offset += static_cast(src_chunks[i].size()); - } + std::tie(src_v, dst_v, std::ignore) = + cugraph::test::detail::concatenate_edge_chunks( + handle, std::move(src_chunks), std::move(dst_chunks), std::nullopt); } if (renumbering_usecase.check_correctness) { diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 9791b5448cf..813187f338c 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -35,6 +35,74 @@ namespace test { namespace detail { +template +std::tuple, + rmm::device_uvector, + std::optional>> +concatenate_edge_chunks(raft::handle_t const& handle, + std::vector>&& src_chunks, + std::vector>&& dst_chunks, + std::optional>> weight_chunks) +{ + if (src_chunks.size() == 1) { + return std::make_tuple(std::move(src_chunks[0]), + std::move(dst_chunks[0]), + weight_chunks ? std::make_optional>( + std::move((*weight_chunks)[0])) + : std::nullopt); + } else { + size_t edge_count{0}; + for (size_t i = 0; i < src_chunks.size(); ++i) { + edge_count += src_chunks[i].size(); + } + + rmm::device_uvector srcs(edge_count, handle.get_stream()); + { + size_t offset{0}; + for (size_t i = 0; i < src_chunks.size(); ++i) { + raft::copy( + srcs.data() + offset, src_chunks[i].data(), src_chunks[i].size(), handle.get_stream()); + offset += src_chunks[i].size(); + src_chunks[i].resize(0, handle.get_stream()); + src_chunks[i].shrink_to_fit(handle.get_stream()); + } + src_chunks.clear(); + } + + rmm::device_uvector dsts(edge_count, handle.get_stream()); + { + size_t offset{0}; + for (size_t i = 0; i < dst_chunks.size(); ++i) { + raft::copy( + dsts.data() + offset, dst_chunks[i].data(), dst_chunks[i].size(), handle.get_stream()); + offset += dst_chunks[i].size(); + dst_chunks[i].resize(0, handle.get_stream()); + dst_chunks[i].shrink_to_fit(handle.get_stream()); + } + dst_chunks.clear(); + } + + auto weights = weight_chunks ? 
std::make_optional>( + edge_count, handle.get_stream()) + : std::nullopt; + if (weights) { + size_t offset{0}; + for (size_t i = 0; i < (*weight_chunks).size(); ++i) { + raft::copy((*weights).data() + offset, + (*weight_chunks)[i].data(), + (*weight_chunks)[i].size(), + handle.get_stream()); + offset += (*weight_chunks)[i].size(); + (*weight_chunks)[i].resize(0, handle.get_stream()); + (*weight_chunks)[i].shrink_to_fit(handle.get_stream()); + } + (*weight_chunks).clear(); + } + + return std::make_tuple(std::move(srcs), std::move(dsts), std::move(weights)); + } +} + class TranslateGraph_Usecase { public: TranslateGraph_Usecase() = delete; From e5e825732af5447a657c7df2ec6bede382aedcd3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 18 Jul 2024 23:02:27 -0700 Subject: [PATCH 006/126] add temporary performance measurement code --- ...educe_v_frontier_outgoing_e_by_src_dst.cuh | 25 ++++ cpp/src/traversal/bfs_impl.cuh | 122 +++++++++++++++++- 2 files changed, 144 insertions(+), 3 deletions(-) diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh index e58ab08fa97..dc2c6a2aa27 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh @@ -178,6 +178,10 @@ auto sort_and_reduce_buffer_elements( return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); } +#if 1 // FIXME: delete +#define TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT +#endif + template ( handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time2 = std::chrono::steady_clock::now(); +#endif if constexpr (GraphViewType::is_multi_gpu) { // FIXME: this step is unnecessary if major_comm_size== 1 auto& comm = handle.get_comms(); @@ -315,6 +331,15 @@ transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, detail::sort_and_reduce_buffer_elements( handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); } +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time3 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = time1 - time0; + std::chrono::duration dur1 = time2 - time1; + std::chrono::duration dur2 = time3 - time2; + std::cout << "\tprim took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" + << std::endl; +#endif if constexpr (!std::is_same_v) { return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 8a18dedd2ab..07ca6a6f994 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -88,6 +88,10 @@ struct bottomup_e_op_t { namespace detail { +#if 1 // FIXME: delete +#define BFS_PERFORMANCE_MEASUREMENT +#endif + template void bfs(raft::handle_t const& handle, GraphViewType const& graph_view, @@ -107,6 +111,10 @@ void bfs(raft::handle_t const& handle, static_assert(!GraphViewType::is_storage_transposed, "GraphViewType should support the push model."); +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep0 = std::chrono::steady_clock::now(); +#endif // direction optimizing BFS implementation is based on "S. Beamer, K. Asanovic, D. 
Patterson, // Direction-Optimizing Breadth-First Search, 2012" @@ -244,6 +252,12 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur).end(), prev_dst_visited_flags.mutable_view(), true); +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep1 = std::chrono::steady_clock::now(); + std::chrono::duration dur = prep1 - prep0; + std::cout << "prep took " << dur.count() << " s." << std::endl; +#endif // 4. BFS iteration vertex_t depth{0}; @@ -253,6 +267,10 @@ void bfs(raft::handle_t const& handle, while (true) { vertex_t next_aggregate_vertex_frontier_size{}; if (top_down) { +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown0 = std::chrono::steady_clock::now(); +#endif topdown_e_op_t e_op{}; e_op.prev_visited_flags = detail::edge_partition_endpoint_property_device_view_t( @@ -271,6 +289,10 @@ void bfs(raft::handle_t const& handle, edge_dummy_property_t{}.view(), e_op, reduce_op::any()); +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown1 = std::chrono::steady_clock::now(); +#endif auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), predecessor_buffer.begin()); @@ -285,10 +307,28 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_next) = key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown2 = std::chrono::steady_clock::now(); +#endif next_aggregate_vertex_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_next).aggregate_size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown3 = std::chrono::steady_clock::now(); +#endif + if (next_aggregate_vertex_frontier_size == 0) { +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + std::chrono::duration dur0 = topdown1 - topdown0; + std::chrono::duration dur1 = topdown2 - topdown1; + std::chrono::duration dur2 = topdown3 - topdown2; + std::chrono::duration dur = topdown3 - topdown0; + std::cout << "topdown took " << dur.count() << " (" << dur0.count() << "," << dur1.count() + << "," << dur2.count() << ") s." 
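All of the temporary measurements in this commit follow the same sync-then-timestamp idiom: drain outstanding GPU work, record a steady_clock time point around each phase, and print the per-phase durations once the phase completes. A condensed sketch of that idiom (the timestamp lambda is illustrative only, not part of the patch):

// Sketch of the measurement idiom; assumes RAFT_CUDA_TRY and a current CUDA device.
auto timestamp = [] {
  RAFT_CUDA_TRY(cudaDeviceSynchronize());  // make sure pending kernels are included
  return std::chrono::steady_clock::now();
};

auto t0 = timestamp();
// ... phase 1 (e.g. the frontier prim call) ...
auto t1 = timestamp();
// ... phase 2 (e.g. updating distances/predecessors) ...
auto t2 = timestamp();

std::chrono::duration<double> d0 = t1 - t0;
std::chrono::duration<double> d1 = t2 - t1;
std::cout << "phase took (" << d0.count() << "," << d1.count() << ") s." << std::endl;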
<< std::endl; +#endif + break; + } fill_edge_dst_property(handle, graph_view, @@ -296,6 +336,10 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_next).end(), prev_dst_visited_flags.mutable_view(), true); +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown4 = std::chrono::steady_clock::now(); +#endif if (direction_optimizing) { auto m_f = thrust::transform_reduce( @@ -355,6 +399,10 @@ void bfs(raft::handle_t const& handle, top_down = false; } } +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown5 = std::chrono::steady_clock::now(); +#endif if (top_down) { // staying in top-down vertex_frontier.bucket(bucket_idx_cur) = @@ -369,7 +417,25 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_next) = key_bucket_t(handle); } - } else { // bottom up +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto topdown6 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = topdown1 - topdown0; + std::chrono::duration dur1 = topdown2 - topdown1; + std::chrono::duration dur2 = topdown3 - topdown2; + std::chrono::duration dur3 = topdown4 - topdown3; + std::chrono::duration dur4 = topdown5 - topdown4; + std::chrono::duration dur5 = topdown6 - topdown5; + std::chrono::duration dur = topdown6 - topdown0; + std::cout << "topdown took " << dur.count() << " (" << dur0.count() << "," << dur1.count() + << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << "," + << dur5.count() << ") s." << std::endl; +#endif + } else { // bottom up +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup0 = std::chrono::steady_clock::now(); +#endif bottomup_e_op_t e_op{}; e_op.prev_visited_flags = detail::edge_partition_endpoint_property_device_view_t( @@ -384,6 +450,10 @@ void bfs(raft::handle_t const& handle, edge_dummy_property_t{}.view(), e_op, reduce_op::any()); +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup1 = std::chrono::steady_clock::now(); +#endif auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), predecessor_buffer.begin()); @@ -397,6 +467,10 @@ void bfs(raft::handle_t const& handle, thrust::make_zip_iterator(distances, predecessor_first)); assert(direction_optimizing); +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup2 = std::chrono::steady_clock::now(); +#endif { rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), @@ -412,6 +486,10 @@ void bfs(raft::handle_t const& handle, handle.get_stream()); nzd_unvisited_vertices = std::move(tmp_vertices); } +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup3 = std::chrono::steady_clock::now(); +#endif next_aggregate_vertex_frontier_size = GraphViewType::is_multi_gpu @@ -420,7 +498,22 @@ void bfs(raft::handle_t const& handle, raft::comms::op_t::SUM, handle.get_stream()) : static_cast(new_frontier_vertex_buffer.size()); - if (next_aggregate_vertex_frontier_size == 0) { break; } +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup4 = std::chrono::steady_clock::now(); +#endif + if (next_aggregate_vertex_frontier_size == 0) { +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + std::chrono::duration dur0 = bottomup1 - 
bottomup0; + std::chrono::duration dur1 = bottomup2 - bottomup1; + std::chrono::duration dur2 = bottomup3 - bottomup2; + std::chrono::duration dur3 = bottomup4 - bottomup3; + std::chrono::duration dur = bottomup4 - bottomup0; + std::cout << "bottomup took " << dur.count() << " (" << dur0.count() << "," << dur1.count() + << "," << dur2.count() << "," << dur3.count() << " s." << std::endl; +#endif + break; + } fill_edge_dst_property(handle, graph_view, @@ -428,6 +521,10 @@ void bfs(raft::handle_t const& handle, new_frontier_vertex_buffer.end(), prev_dst_visited_flags.mutable_view(), true); +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup5 = std::chrono::steady_clock::now(); +#endif auto aggregate_nzd_unvisted_vertices = GraphViewType::is_multi_gpu @@ -442,6 +539,10 @@ void bfs(raft::handle_t const& handle, (next_aggregate_vertex_frontier_size < cur_aggregate_vertex_frontier_size)) { top_down = true; } +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup6 = std::chrono::steady_clock::now(); +#endif if (top_down) { // swithcing to top-down vertex_frontier.bucket(bucket_idx_cur) = @@ -454,6 +555,21 @@ void bfs(raft::handle_t const& handle, raft::device_span((*nzd_unvisited_vertices).data(), (*nzd_unvisited_vertices).size())); } +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto bottomup7 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = bottomup1 - bottomup0; + std::chrono::duration dur1 = bottomup2 - bottomup1; + std::chrono::duration dur2 = bottomup3 - bottomup2; + std::chrono::duration dur3 = bottomup4 - bottomup3; + std::chrono::duration dur4 = bottomup5 - bottomup4; + std::chrono::duration dur5 = bottomup6 - bottomup5; + std::chrono::duration dur6 = bottomup7 - bottomup6; + std::chrono::duration dur = bottomup7 - bottomup0; + std::cout << "bottomup took " << dur.count() << " (" << dur0.count() << "," << dur1.count() + << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << "," + << dur5.count() << "," << dur6.count() << ") s." << std::endl; +#endif } cur_aggregate_vertex_frontier_size = next_aggregate_vertex_frontier_size; From 7ec5b088fd8ebbb5d461ee170e686bd82aecb9a5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 18 Jul 2024 23:05:23 -0700 Subject: [PATCH 007/126] add code to broadcast frontier using a bitmap --- .../detail/extract_transform_v_frontier_e.cuh | 247 ++++++++++++++++-- 1 file changed, 221 insertions(+), 26 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 177c79ace87..e242fc8d593 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -579,6 +579,8 @@ __global__ static void extract_transform_v_frontier_e_high_degree( } } +#define EXTRACT_PERFORMANCE_MEASUREMENT + template (size_t{0}, handle.get_stream()); @@ -689,13 +697,32 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, frontier_key_last = get_dataframe_buffer_end(frontier_keys); } - // 1. 
fill the buffers + { // drop zero degree vertices + size_t partition_idx{0}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + partition_idx = static_cast(minor_comm.get_rank()); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - auto key_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - auto value_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + if (segment_offsets) { + auto v_threshold = + graph_view.local_vertex_partition_range_first() + *((*segment_offsets).rbegin() + 1); + if constexpr (std::is_same_v) { + frontier_key_last = thrust::lower_bound( + handle.get_thrust_policy(), frontier_key_first, frontier_key_last, v_threshold); + } else { + key_t key_threshold{}; + thrust::get<0>(key_threshold) = v_threshold; + frontier_key_last = thrust::lower_bound( + handle.get_thrust_policy(), + frontier_key_first, + frontier_key_last, + key_threshold, + [] __device__(auto lhs, auto rhs) { return thrust::get<0>(lhs) < thrust::get<0>(rhs); }); + } + } + } std::vector local_frontier_sizes{}; if constexpr (GraphViewType::is_multi_gpu) { @@ -709,9 +736,107 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; } + // update frontier bitmap (used to reduce broadcast bandwidth size) + + std::conditional_t, + std::optional>, + std::byte /* dummy */> + frontier_bitmap{}; + std::conditional_t, + std::vector, + std::byte /* dummy */> + use_bitmap_flags{}; + if constexpr (GraphViewType::is_multi_gpu && std::is_same_v) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + frontier_bitmap = std::nullopt; + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + auto threshold_ratio = 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + use_bitmap_flags = std::vector(minor_comm_size, false); + + size_t this_bool_size{0}; + if constexpr (VertexFrontierBucketType::is_sorted_unique) { + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : edge_partition.major_range_size(); + if (i == static_cast(minor_comm_rank)) { this_bool_size = bool_size; } + if (local_frontier_sizes[i] > static_cast(bool_size * threshold_ratio)) { + use_bitmap_flags[i] = true; + } + } + } else { + auto segment_offsets = + graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); + auto bool_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(); + this_bool_size = bool_size; + bool use_bitmap_flag{false}; + if (local_frontier_sizes[minor_comm_rank] > + static_cast(bool_size * threshold_ratio)) { + auto num_uniques = static_cast(thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(local_frontier_sizes[minor_comm_rank]), + cugraph::detail::is_first_in_run_t{frontier_key_first})); + if (num_uniques == local_frontier_sizes[minor_comm_rank]) { use_bitmap_flag = true; } + } + auto tmp_flags = host_scalar_allgather( + minor_comm, use_bitmap_flag ? uint8_t{1} : uint8_t{0}, handle.get_stream()); + std::transform(tmp_flags.begin(), + tmp_flags.end(), + use_bitmap_flags.begin(), + [] __device__(uint8_t flag) { return flag == 1; }); + } + + if (use_bitmap_flags[minor_comm_rank]) { + frontier_bitmap = + rmm::device_uvector(packed_bool_size(this_bool_size), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + (*frontier_bitmap).begin(), + (*frontier_bitmap).end(), + packed_bool_empty_mask()); + thrust::for_each( + handle.get_thrust_policy(), + frontier_key_first, + frontier_key_last, + [bitmap = + raft::device_span((*frontier_bitmap).data(), (*frontier_bitmap).size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(cugraph::packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + } + } + } + + // 2. fill the buffers + + auto key_buffer = + allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); + auto value_buffer = + allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); + rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + auto edge_mask_view = graph_view.edge_mask_view(); +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time1 = std::chrono::steady_clock::now(); +#endif for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime0 = std::chrono::steady_clock::now(); +#endif auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); @@ -724,27 +849,73 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto edge_partition_frontier_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - vertex_t edge_partition_frontier_size = static_cast(local_frontier_sizes[i]); + vertex_t edge_partition_frontier_size = static_cast(local_frontier_sizes[i]); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + auto edge_partition_frontier_key_first = frontier_key_first; auto edge_partition_frontier_key_last = frontier_key_last; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - - resize_dataframe_buffer( - edge_partition_frontier_key_buffer, edge_partition_frontier_size, handle.get_stream()); - - device_bcast(minor_comm, - frontier_key_first, - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - edge_partition_frontier_size, - static_cast(i), - handle.get_stream()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = 
minor_comm.get_rank(); + + resize_dataframe_buffer( + edge_partition_frontier_key_buffer, edge_partition_frontier_size, handle.get_stream()); + + if constexpr (std::is_same_v) { + if (use_bitmap_flags[i]) { + auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : edge_partition.major_range_size(); + rmm::device_uvector edge_partition_bitmap(packed_bool_size(bool_size), + handle.get_stream()); + device_bcast(minor_comm, + (*frontier_bitmap).data(), + edge_partition_bitmap.data(), + edge_partition_bitmap.size(), + static_cast(i), + handle.get_stream()); + auto it = thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(edge_partition.major_range_first()), + thrust::make_counting_iterator(edge_partition.major_range_first()) + bool_size, + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + edge_partition_bitmap.data(), + edge_partition_bitmap.size())] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & packed_bool_mask(v_offset)) != + packed_bool_empty_mask()); + })), + get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), + thrust::identity{}); + std::cout << "size=" + << thrust::distance( + get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), it) + << std::endl; + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), + edge_partition_frontier_size, + static_cast(i), + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), + edge_partition_frontier_size, + static_cast(i), + handle.get_stream()); + } - edge_partition_frontier_key_first = - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer); - edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_frontier_key_buffer); + edge_partition_frontier_key_first = + get_dataframe_buffer_begin(edge_partition_frontier_key_buffer); + edge_partition_frontier_key_last = + get_dataframe_buffer_end(edge_partition_frontier_key_buffer); + } } auto edge_partition_frontier_major_first = @@ -754,9 +925,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust_tuple_get_or_identity( edge_partition_frontier_key_last); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto max_pushes = max_one_e_per_frontier_key ? edge_partition_frontier_size - : edge_partition.compute_number_of_edges( + auto max_pushes = max_one_e_per_frontier_key ? 
edge_partition_frontier_size + : edge_partition.compute_number_of_edges( edge_partition_frontier_major_first, edge_partition_frontier_major_last, handle.get_stream()); @@ -780,6 +950,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime1 = std::chrono::steady_clock::now(); +#endif if (segment_offsets) { static_assert(num_sparse_segments_per_vertex_partition == 3); std::vector h_thresholds(num_sparse_segments_per_vertex_partition + @@ -905,9 +1079,21 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, e_op); } } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime2 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = subtime1 - subtime0; + std::chrono::duration subdur1 = subtime2 - subtime1; + std::cout << "\t\t\tdetail::extract i=" << i << " took (" << subdur0.count() << "," + << subdur1.count() << ")" << std::endl; +#endif } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time2 = std::chrono::steady_clock::now(); +#endif - // 2. resize and return the buffers + // 3. resize and return the buffers auto new_buffer_size = buffer_idx.value(handle.get_stream()); @@ -917,6 +1103,15 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, resize_optional_dataframe_buffer( value_buffer, new_buffer_size, handle.get_stream()); shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time3 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = time1 - time0; + std::chrono::duration dur1 = time2 - time1; + std::chrono::duration dur2 = time3 - time2; + std::cout << "\t\tdetail::extract took (" << dur0.count() << "," << dur1.count() << "," + << dur2.count() << ")" << std::endl; +#endif return std::make_tuple(std::move(key_buffer), std::move(value_buffer)); } From 81f51c15a09aa2f00c244ca4b7281ffd8c9bada6 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 18 Jul 2024 23:11:33 -0700 Subject: [PATCH 008/126] fix build error --- cpp/src/prims/detail/extract_transform_v_frontier_e.cuh | 2 +- .../prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh | 2 +- cpp/src/traversal/bfs_impl.cuh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index e242fc8d593..5a4bf206d7c 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -579,7 +579,7 @@ __global__ static void extract_transform_v_frontier_e_high_degree( } } -#define EXTRACT_PERFORMANCE_MEASUREMENT +#define EXTRACT_PERFORMANCE_MEASUREMENT 1 template From 69cb4f97b2a7393b320cf914643918860fae615d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 21 Jul 2024 14:01:43 -0700 Subject: [PATCH 009/126] update dataframe buffer utilities --- .../cugraph/utilities/dataframe_buffer.hpp | 34 +--- .../detail/optional_dataframe_buffer.hpp | 174 ++++++++---------- 2 files changed, 84 insertions(+), 124 deletions(-) diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp index 
a20613c65ef..55f3e8ac360 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.hpp +++ b/cpp/include/cugraph/utilities/dataframe_buffer.hpp @@ -82,6 +82,14 @@ auto allocate_dataframe_buffer(size_t buffer_size, rmm::cuda_stream_view stream_ std::make_index_sequence(), buffer_size, stream_view); } +template +struct dataframe_buffer_type { + using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; + +template +using dataframe_buffer_type_t = typename dataframe_buffer_type::type; + template void reserve_dataframe_buffer(BufferType& buffer, size_t new_buffer_capacity, @@ -206,30 +214,4 @@ auto get_dataframe_buffer_cend(BufferType& buffer) std::make_index_sequence::value>(), buffer); } -template -struct dataframe_buffer_value_type { - using type = void; -}; - -template -struct dataframe_buffer_value_type> { - using type = T; -}; - -template -struct dataframe_buffer_value_type...>> { - using type = thrust::tuple; -}; - -template -using dataframe_buffer_value_type_t = typename dataframe_buffer_value_type::type; - -template -struct dataframe_buffer_type { - using type = decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); -}; - -template -using dataframe_buffer_type_t = typename dataframe_buffer_type::type; - } // namespace cugraph diff --git a/cpp/src/prims/detail/optional_dataframe_buffer.hpp b/cpp/src/prims/detail/optional_dataframe_buffer.hpp index 87c095f8e81..6657b91f13b 100644 --- a/cpp/src/prims/detail/optional_dataframe_buffer.hpp +++ b/cpp/src/prims/detail/optional_dataframe_buffer.hpp @@ -26,152 +26,130 @@ namespace detail { // we cannot use thrust::iterator_traits::value_type if Iterator is void* (reference to // void is not allowed) template -struct optional_dataframe_buffer_value_type_t; +struct optional_dataframe_buffer_iterator_value_type_t; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = typename thrust::iterator_traits::value_type; }; template -struct optional_dataframe_buffer_value_type_t>> { +struct optional_dataframe_buffer_iterator_value_type_t< + Iterator, + std::enable_if_t>> { using value = void; }; -template >* = nullptr> -std::byte allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) -{ - return std::byte{0}; // dummy -} - -template >* = nullptr> +template auto allocate_optional_dataframe_buffer(size_t size, rmm::cuda_stream_view stream) { - return allocate_dataframe_buffer(size, stream); + if constexpr (std::is_same_v) { + return std::byte{0}; // dummy + } else { + return allocate_dataframe_buffer(size, stream); + } } -template >* = nullptr> -void* get_optional_dataframe_buffer_begin(std::byte& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} +template +struct optional_dataframe_buffer_type { + using type = decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})); +}; -template >* = nullptr> -auto get_optional_dataframe_buffer_begin( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_begin(optional_dataframe_buffer); -} +template +using optional_dataframe_buffer_type_t = typename optional_dataframe_buffer_type::type; -template >* = nullptr> -void* get_optional_dataframe_buffer_end(std::byte& optional_dataframe_buffer) +template +auto get_optional_dataframe_buffer_begin( + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return static_cast(nullptr); + if 
constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_begin(optional_dataframe_buffer); + } } -template >* = nullptr> +template auto get_optional_dataframe_buffer_end( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return get_dataframe_buffer_end(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_end(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cbegin(std::byte const& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cbegin( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cbegin(optional_dataframe_buffer); + } } -template >* = nullptr> -void const* get_optional_dataframe_buffer_cend(std::byte const& optional_dataframe_buffer) -{ - return static_cast(nullptr); -} - -template >* = nullptr> +template auto get_optional_dataframe_buffer_cend( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) -{ - return get_dataframe_buffer_cend(optional_dataframe_buffer); -} - -template >* = nullptr> -void reserve_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_capacity, - rmm::cuda_stream_view stream_view) + optional_dataframe_buffer_type_t const& optional_dataframe_buffer) { - return; + if constexpr (std::is_same_v) { + return static_cast(nullptr); + } else { + return get_dataframe_buffer_cend(optional_dataframe_buffer); + } } -template >* = nullptr> +template void reserve_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_capacity, rmm::cuda_stream_view stream_view) { - return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); -} - -template >* = nullptr> -void resize_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - size_t new_buffer_size, - rmm::cuda_stream_view stream_view) -{ - return; + if constexpr (std::is_same_v) { + return; + } else { + return reserve_dataframe_buffer(optional_dataframe_buffer, new_buffer_capacity, stream_view); + } } -template >* = nullptr> +template void resize_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, + optional_dataframe_buffer_type_t& optional_dataframe_buffer, size_t new_buffer_size, rmm::cuda_stream_view stream_view) { - return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + if constexpr (std::is_same_v) { + return; + } else { + return resize_dataframe_buffer(optional_dataframe_buffer, new_buffer_size, stream_view); + } } -template >* = nullptr> -void shrink_to_fit_optional_dataframe_buffer(std::byte& optional_dataframe_buffer, - rmm::cuda_stream_view stream_view) -{ - return; -} - -template >* = nullptr> +template void shrink_to_fit_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))>& - optional_dataframe_buffer, - rmm::cuda_stream_view 
stream_view) -{ - return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); -} - -template >* = nullptr> -size_t size_optional_dataframe_buffer(std::byte const& optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer, rmm::cuda_stream_view stream_view) { - return size_t{0}; + if constexpr (std::is_same_v) { + return; + } else { + return shrink_to_fit_dataframe_buffer(optional_dataframe_buffer, stream_view); + } } -template >* = nullptr> +template size_t size_optional_dataframe_buffer( - std::decay_t(size_t{0}, rmm::cuda_stream_view{}))> const& - optional_dataframe_buffer) + optional_dataframe_buffer_type_t& optional_dataframe_buffer) { - return size_dataframe_buffer(optional_dataframe_buffer); + if constexpr (std::is_same_v) { + return size_t{0}; + } else { + return size_dataframe_buffer(optional_dataframe_buffer); + } } } // namespace detail From 6adcccb6ef4660c364b2a531d142eaf7d1db656c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 21 Jul 2024 14:02:28 -0700 Subject: [PATCH 010/126] reduce # resizes --- .../detail/extract_transform_v_frontier_e.cuh | 128 ++++++++++++------ 1 file changed, 89 insertions(+), 39 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 5a4bf206d7c..237a050bece 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -72,9 +72,9 @@ __device__ void push_buffer_element(BufferKeyOutputIterator buffer_key_output_fi e_op_result_t e_op_result) { using output_key_t = - typename optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; using output_value_t = - typename optional_dataframe_buffer_value_type_t::value; + typename optional_dataframe_buffer_iterator_value_type_t::value; assert(e_op_result.has_value()); @@ -595,9 +595,8 @@ template -std::tuple< - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{})), - decltype(allocate_optional_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> +std::tuple, + optional_dataframe_buffer_type_t> extract_transform_v_frontier_e(raft::handle_t const& handle, GraphViewType const& graph_view, VertexFrontierBucketType const& frontier, @@ -820,11 +819,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, // 2. 
fill the buffers - auto key_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - auto value_buffer = - allocate_optional_dataframe_buffer(size_t{0}, handle.get_stream()); - rmm::device_scalar buffer_idx(size_t{0}, handle.get_stream()); + std::vector> key_buffers{}; + std::vector> value_buffers{}; + key_buffers.reserve(graph_view.number_of_local_edge_partitions()); + value_buffers.reserve(graph_view.number_of_local_edge_partitions()); auto edge_mask_view = graph_view.edge_mask_view(); @@ -931,11 +929,11 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_frontier_major_last, handle.get_stream()); - auto new_buffer_size = buffer_idx.value(handle.get_stream()) + max_pushes; - resize_optional_dataframe_buffer( - key_buffer, new_buffer_size, handle.get_stream()); - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); + auto tmp_key_buffer = + allocate_optional_dataframe_buffer(max_pushes, handle.get_stream()); + auto tmp_value_buffer = + allocate_optional_dataframe_buffer(max_pushes, handle.get_stream()); + rmm::device_scalar tmp_buffer_idx(size_t{0}, handle.get_stream()); edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -993,9 +991,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_dst_value_input, edge_partition_e_value_input, edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), + get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_value_buffer), + tmp_buffer_idx.data(), e_op); } if (h_offsets[1] - h_offsets[0] > 0) { @@ -1011,9 +1009,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_dst_value_input, edge_partition_e_value_input, edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), + get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_value_buffer), + tmp_buffer_idx.data(), e_op); } if (h_offsets[2] - h_offsets[1] > 0) { @@ -1031,9 +1029,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_dst_value_input, edge_partition_e_value_input, edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), + get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_value_buffer), + tmp_buffer_idx.data(), e_op); } if (edge_partition.dcs_nzd_vertex_count() && (h_offsets[3] - h_offsets[2] > 0)) { @@ -1051,9 +1049,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_dst_value_input, edge_partition_e_value_input, edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), + get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_value_buffer), + tmp_buffer_idx.data(), e_op); } } else { @@ -1073,19 +1071,38 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_dst_value_input, edge_partition_e_value_input, edge_partition_e_mask, - get_optional_dataframe_buffer_begin(key_buffer), - get_optional_dataframe_buffer_begin(value_buffer), - buffer_idx.data(), + 
get_optional_dataframe_buffer_begin(tmp_key_buffer),
+ get_optional_dataframe_buffer_begin(tmp_value_buffer),
+ tmp_buffer_idx.data(),
 e_op);
 }
 }
#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete
 RAFT_CUDA_TRY(cudaDeviceSynchronize());
- auto subtime2 = std::chrono::steady_clock::now();
+ auto subtime2 = std::chrono::steady_clock::now();
+#endif
+
+ auto tmp_buffer_size = tmp_buffer_idx.value(handle.get_stream());
+
+ resize_optional_dataframe_buffer(
+ tmp_key_buffer, tmp_buffer_size, handle.get_stream());
+ shrink_to_fit_optional_dataframe_buffer(tmp_key_buffer, handle.get_stream());
+
+ resize_optional_dataframe_buffer(
+ tmp_value_buffer, tmp_buffer_size, handle.get_stream());
+ shrink_to_fit_optional_dataframe_buffer(tmp_value_buffer, handle.get_stream());
+
+ key_buffers.push_back(std::move(tmp_key_buffer));
+ value_buffers.push_back(std::move(tmp_value_buffer));
+
+#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete
+ RAFT_CUDA_TRY(cudaDeviceSynchronize());
+ auto subtime3 = std::chrono::steady_clock::now();
 std::chrono::duration subdur0 = subtime1 - subtime0;
 std::chrono::duration subdur1 = subtime2 - subtime1;
+ std::chrono::duration subdur2 = subtime3 - subtime2;
 std::cout << "\t\t\tdetail::extract i=" << i << " took (" << subdur0.count() << ","
- << subdur1.count() << ")" << std::endl;
+ << subdur1.count() << "," << subdur2.count() << ")" << std::endl;
 #endif
 }
#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete
 RAFT_CUDA_TRY(cudaDeviceSynchronize());
 auto time2 = std::chrono::steady_clock::now();
#endif
- // 3. resize and return the buffers
+ // 3. concatenate and return the buffers
- auto new_buffer_size = buffer_idx.value(handle.get_stream());
+ auto key_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream());
+ auto value_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream());
+ if (key_buffers.size() == 1) {
+ key_buffer = std::move(key_buffers[0]);
+ value_buffer = std::move(value_buffers[0]);
+ } else {
+ std::vector buffer_sizes(key_buffers.size());
+ static_assert(!std::is_same_v || !std::is_same_v);
+ for (size_t i = 0; i < key_buffers.size(); ++i) {
+ if constexpr (!std::is_same_v) {
+ buffer_sizes[i] = size_optional_dataframe_buffer(key_buffers[i]);
+ } else {
+ buffer_sizes[i] = size_optional_dataframe_buffer(value_buffers[i]);
+ }
+ }
+ auto buffer_size = std::reduce(buffer_sizes.begin(), buffer_sizes.end());
+ resize_optional_dataframe_buffer(key_buffer, buffer_size, handle.get_stream());
+ resize_optional_dataframe_buffer(
+ value_buffer, buffer_size, handle.get_stream());
+ std::vector buffer_displacements(buffer_sizes.size());
+ std::exclusive_scan(
+ buffer_sizes.begin(), buffer_sizes.end(), buffer_displacements.begin(), size_t{0});
+ for (size_t i = 0; i < key_buffers.size(); ++i) {
+ if constexpr (!std::is_same_v) {
+ thrust::copy(
+ handle.get_thrust_policy(),
+ get_optional_dataframe_buffer_cbegin(key_buffers[i]),
+ get_optional_dataframe_buffer_cend(key_buffers[i]),
+ get_optional_dataframe_buffer_begin(key_buffer) + buffer_displacements[i]);
+ }
- resize_optional_dataframe_buffer(key_buffer, new_buffer_size, handle.get_stream());
- shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream());
+ if constexpr (!std::is_same_v) {
+ thrust::copy(handle.get_thrust_policy(),
+ get_optional_dataframe_buffer_cbegin(value_buffers[i]),
+ get_optional_dataframe_buffer_cend(value_buffers[i]),
+ get_optional_dataframe_buffer_begin(value_buffer) +
+ buffer_displacements[i]);
+ }
+ } 
+ } - resize_optional_dataframe_buffer( - value_buffer, new_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time3 = std::chrono::steady_clock::now(); From bfe21fc79448d7eb20f718c6e2b060864b1ac472 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 22 Jul 2024 16:02:31 -0700 Subject: [PATCH 011/126] remove debug statement --- cpp/src/prims/detail/extract_transform_v_frontier_e.cuh | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 237a050bece..f42e830c6e6 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -873,7 +873,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_bitmap.size(), static_cast(i), handle.get_stream()); - auto it = thrust::copy_if( + thrust::copy_if( handle.get_thrust_policy(), thrust::make_counting_iterator(edge_partition.major_range_first()), thrust::make_counting_iterator(edge_partition.major_range_first()) + bool_size, @@ -888,10 +888,6 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, })), get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), thrust::identity{}); - std::cout << "size=" - << thrust::distance( - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), it) - << std::endl; } else { device_bcast(minor_comm, frontier_key_first, From 446435bed1572f5160ae3d3efc9e58827cda8ae5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 22 Jul 2024 16:29:10 -0700 Subject: [PATCH 012/126] rename VertexFrontierBucketType to KeyBucketType --- .../detail/extract_transform_v_frontier_e.cuh | 10 +- ...xtract_transform_v_frontier_outgoing_e.cuh | 8 +- ...r_v_random_select_transform_outgoing_e.cuh | 162 +++++++++--------- ...educe_v_frontier_outgoing_e_by_src_dst.cuh | 51 +++--- 4 files changed, 114 insertions(+), 117 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index f42e830c6e6..590747ada18 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -590,7 +590,7 @@ template , optional_dataframe_buffer_type_t> extract_transform_v_frontier_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& frontier, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -612,7 +612,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, #endif using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using output_key_t = OutputKeyT; using output_value_t = OutputValueT; @@ -683,7 +683,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto frontier_key_first = frontier.begin(); auto frontier_key_last = frontier.end(); auto frontier_keys = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - if constexpr (!VertexFrontierBucketType::is_sorted_unique) { + if constexpr (!KeyBucketType::is_sorted_unique) { 
resize_dataframe_buffer(frontier_keys, frontier.size(), handle.get_stream()); thrust::copy(handle.get_thrust_policy(), frontier_key_first, @@ -757,7 +757,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, use_bitmap_flags = std::vector(minor_comm_size, false); size_t this_bool_size{0}; - if constexpr (VertexFrontierBucketType::is_sorted_unique) { + if constexpr (KeyBucketType::is_sorted_unique) { for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh index 7ad033b93c2..413f46aeb57 100644 --- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh +++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh @@ -64,13 +64,13 @@ namespace cugraph { * @return Dataframe buffer object storing extracted and accumulated valid @p e_op return values. */ template decltype(allocate_dataframe_buffer< - typename detail::edge_op_result_type>, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -240,7 +240,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, #ifndef NO_CUGRAPH_OPS using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using key_buffer_t = dataframe_buffer_type_t; using edge_partition_src_input_device_view_t = std::conditional_t< @@ -289,15 +289,15 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (do_expensive_check) { // FIXME: better re-factor this check function? 
- auto frontier_vertex_first = - thrust_tuple_get_or_identity(frontier.begin()); - auto frontier_vertex_last = - thrust_tuple_get_or_identity(frontier.end()); + auto key_list_vertex_first = + thrust_tuple_get_or_identity(key_list.begin()); + auto key_list_vertex_last = + thrust_tuple_get_or_identity(key_list.end()); auto num_invalid_keys = - frontier.size() - + key_list.size() - thrust::count_if(handle.get_thrust_policy(), - frontier_vertex_first, - frontier_vertex_last, + key_list_vertex_first, + key_list_vertex_last, check_in_range_t{graph_view.local_vertex_partition_range_first(), graph_view.local_vertex_partition_range_last()}); if constexpr (GraphViewType::is_multi_gpu) { @@ -305,35 +305,35 @@ per_v_random_select_transform_e(raft::handle_t const& handle, handle.get_comms(), num_invalid_keys, raft::comms::op_t::SUM, handle.get_stream()); } CUGRAPH_EXPECTS(num_invalid_keys == size_t{0}, - "Invalid input argument: frontier includes out-of-range keys."); + "Invalid input argument: key_list includes out-of-range keys."); } - std::vector local_frontier_sizes{}; + std::vector local_key_list_sizes{}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather(minor_comm, frontier.size(), handle.get_stream()); + local_key_list_sizes = host_scalar_allgather(minor_comm, key_list.size(), handle.get_stream()); } else { - local_frontier_sizes = std::vector{frontier.size()}; + local_key_list_sizes = std::vector{key_list.size()}; } - std::vector local_frontier_displacements(local_frontier_sizes.size()); - std::exclusive_scan(local_frontier_sizes.begin(), - local_frontier_sizes.end(), - local_frontier_displacements.begin(), + std::vector local_key_list_displacements(local_key_list_sizes.size()); + std::exclusive_scan(local_key_list_sizes.begin(), + local_key_list_sizes.end(), + local_key_list_displacements.begin(), size_t{0}); - // 1. aggregate frontier + // 1. aggregate key_list - std::optional aggregate_local_frontier{std::nullopt}; + std::optional aggregate_local_key_list{std::nullopt}; if (minor_comm_size > 1) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - aggregate_local_frontier = allocate_dataframe_buffer( - local_frontier_displacements.back() + local_frontier_sizes.back(), handle.get_stream()); + aggregate_local_key_list = allocate_dataframe_buffer( + local_key_list_displacements.back() + local_key_list_sizes.back(), handle.get_stream()); device_allgatherv(minor_comm, - frontier.begin(), - get_dataframe_buffer_begin(*aggregate_local_frontier), - local_frontier_sizes, - local_frontier_displacements, + key_list.begin(), + get_dataframe_buffer_begin(*aggregate_local_key_list), + local_key_list_sizes, + local_key_list_displacements, handle.get_stream()); } @@ -342,66 +342,66 @@ per_v_random_select_transform_e(raft::handle_t const& handle, rmm::device_uvector sample_local_nbr_indices(0, handle.get_stream()); std::optional> sample_key_indices{std::nullopt}; - std::vector local_frontier_sample_offsets{}; + std::vector local_key_list_sample_offsets{}; if constexpr (std::is_same_v>) { - std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = uniform_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? 
get_dataframe_buffer_begin(*aggregate_local_frontier) - : frontier.begin(), - local_frontier_displacements, - local_frontier_sizes, + (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_key_list) + : key_list.begin(), + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement); } else { - std::tie(sample_local_nbr_indices, sample_key_indices, local_frontier_sample_offsets) = + std::tie(sample_local_nbr_indices, sample_key_indices, local_key_list_sample_offsets) = biased_sample_and_compute_local_nbr_indices( handle, graph_view, - (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier) - : frontier.begin(), + (minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_key_list) + : key_list.begin(), edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, e_bias_op, - local_frontier_displacements, - local_frontier_sizes, + local_key_list_displacements, + local_key_list_sizes, rng_state, K, with_replacement, do_expensive_check); } - std::vector local_frontier_sample_counts(minor_comm_size); - std::adjacent_difference(local_frontier_sample_offsets.begin() + 1, - local_frontier_sample_offsets.end(), - local_frontier_sample_counts.begin()); + std::vector local_key_list_sample_counts(minor_comm_size); + std::adjacent_difference(local_key_list_sample_offsets.begin() + 1, + local_key_list_sample_offsets.end(), + local_key_list_sample_counts.begin()); // 3. transform auto sample_e_op_results = - allocate_dataframe_buffer(local_frontier_sample_offsets.back(), handle.get_stream()); + allocate_dataframe_buffer(local_key_list_sample_offsets.back(), handle.get_stream()); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); - auto edge_partition_frontier_key_first = - ((minor_comm_size > 1) ? get_dataframe_buffer_begin(*aggregate_local_frontier) - : frontier.begin()) + - local_frontier_displacements[i]; + auto edge_partition_key_list_first = + ((minor_comm_size > 1) ? 
get_dataframe_buffer_begin(*aggregate_local_key_list) + : key_list.begin()) + + local_key_list_displacements[i]; auto edge_partition_sample_local_nbr_index_first = - sample_local_nbr_indices.begin() + local_frontier_sample_offsets[i]; + sample_local_nbr_indices.begin() + local_key_list_sample_offsets[i]; auto edge_partition_sample_e_op_result_first = - get_dataframe_buffer_begin(sample_e_op_results) + local_frontier_sample_offsets[i]; + get_dataframe_buffer_begin(sample_e_op_results) + local_key_list_sample_offsets[i]; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -418,14 +418,14 @@ per_v_random_select_transform_e(raft::handle_t const& handle, if (sample_key_indices) { auto edge_partition_sample_key_index_first = - (*sample_key_indices).begin() + local_frontier_sample_offsets[i]; + (*sample_key_indices).begin() + local_key_list_sample_offsets[i]; thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(local_frontier_sample_counts[i]), + thrust::make_counting_iterator(local_key_list_sample_counts[i]), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{ edge_partition, thrust::make_optional(edge_partition_sample_key_index_first), - edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -447,10 +447,10 @@ per_v_random_select_transform_e(raft::handle_t const& handle, thrust::transform( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(frontier.size() * K), + thrust::make_counting_iterator(key_list.size() * K), edge_partition_sample_e_op_result_first, transform_local_nbr_indices_t{edge_partition, thrust::nullopt, - edge_partition_frontier_key_first, + edge_partition_key_list_first, edge_partition_sample_local_nbr_index_first, edge_partition_src_value_input, edge_partition_dst_value_input, @@ -469,13 +469,13 @@ per_v_random_select_transform_e(raft::handle_t const& handle, K}); } } - aggregate_local_frontier = std::nullopt; + aggregate_local_key_list = std::nullopt; // 4. shuffle randomly selected & transformed results and update sample_offsets auto sample_offsets = invalid_value ? 
std::nullopt : std::make_optional>( - frontier.size() + 1, handle.get_stream()); + key_list.size() + 1, handle.get_stream()); assert(K <= std::numeric_limits::max()); if (minor_comm_size > 1) { sample_local_nbr_indices.resize(0, handle.get_stream()); @@ -486,12 +486,12 @@ per_v_random_select_transform_e(raft::handle_t const& handle, std::tie(sample_e_op_results, std::ignore) = shuffle_values(minor_comm, get_dataframe_buffer_begin(sample_e_op_results), - local_frontier_sample_counts, + local_key_list_sample_counts, handle.get_stream()); std::tie(sample_key_indices, std::ignore) = shuffle_values( - minor_comm, (*sample_key_indices).begin(), local_frontier_sample_counts, handle.get_stream()); + minor_comm, (*sample_key_indices).begin(), local_key_list_sample_counts, handle.get_stream()); - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::fill( handle.get_thrust_policy(), sample_counts.begin(), sample_counts.end(), int32_t{0}); auto sample_intra_partition_displacements = @@ -507,7 +507,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_counts.resize(0, handle.get_stream()); sample_counts.shrink_to_fit(handle.get_stream()); - resize_dataframe_buffer(tmp_sample_e_op_results, frontier.size() * K, handle.get_stream()); + resize_dataframe_buffer(tmp_sample_e_op_results, key_list.size() * K, handle.get_stream()); thrust::fill(handle.get_thrust_policy(), get_dataframe_buffer_begin(tmp_sample_e_op_results), get_dataframe_buffer_end(tmp_sample_e_op_results), @@ -556,7 +556,7 @@ per_v_random_select_transform_e(raft::handle_t const& handle, sample_e_op_results = std::move(tmp_sample_e_op_results); } else { if (!invalid_value) { - rmm::device_uvector sample_counts(frontier.size(), handle.get_stream()); + rmm::device_uvector sample_counts(key_list.size(), handle.get_stream()); thrust::tabulate( handle.get_thrust_policy(), sample_counts.begin(), @@ -605,8 +605,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @brief Randomly select and transform the input (tagged-)vertices' outgoing edges with biases. * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -617,8 +617,8 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). 
Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -655,11 +655,11 @@ per_v_random_select_transform_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). */ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeBiasSrcValueInputWrapper edge_bias_src_value_input, EdgeBiasDstValueInputWrapper edge_bias_dst_value_input, EdgeBiasValueInputWrapper edge_bias_value_input, @@ -690,7 +690,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, { return detail::per_v_random_select_transform_e(handle, graph_view, - frontier, + key_list, edge_bias_src_value_input, edge_bias_dst_value_input, edge_bias_value_input, @@ -713,8 +713,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * (uniform neighbor sampling). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. @@ -723,8 +723,8 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object to store the (tagged-)vertex list to sample - * outgoing edges. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to sample outgoing + * edges. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -755,11 +755,11 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, * @return std::tuple Tuple of an optional offset vector of type * std::optional> and a dataframe buffer storing the output values of * type @p T from the selected edges. If @p invalid_value is std::nullopt, the offset vector is - * valid and has the size of @p frontier.size() + 1. 
If @p invalid_value.has_value() is true, - * std::nullopt is returned (the dataframe buffer will store @p frontier.size() * @p K elements). + * valid and has the size of @p key_list.size() + 1. If @p invalid_value.has_value() is true, + * std::nullopt is returned (the dataframe buffer will store @p key_list.size() * @p K elements). */ template >, decltype(allocate_dataframe_buffer(size_t{0}, rmm::cuda_stream_view{}))> per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& key_list, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -783,7 +783,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, return detail::per_v_random_select_transform_e( handle, graph_view, - frontier, + key_list, edge_src_dummy_property_t{}.view(), edge_dst_dummy_property_t{}.view(), edge_dummy_property_t{}.view(), @@ -791,7 +791,7 @@ per_v_random_select_transform_outgoing_e(raft::handle_t const& handle, detail::edge_endpoint_dummy_property_view_t, detail::edge_endpoint_dummy_property_view_t, edge_dummy_property_view_t, - typename VertexFrontierBucketType::key_type>{}, + typename KeyBucketType::key_type>{}, edge_src_value_input, edge_dst_value_input, edge_value_input, diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh index 3fee9bd8fde..5a87f8c8f33 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh @@ -184,7 +184,7 @@ auto sort_and_reduce_buffer_elements( template std::conditional_t< !std::is_same_v, - std::tuple( + std::tuple( 0, rmm::cuda_stream_view{})), decltype(detail::allocate_optional_dataframe_buffer( 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& frontier, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -213,7 +212,7 @@ transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; using payload_t = typename ReduceOp::value_type; if (do_expensive_check) { @@ -350,17 +349,17 @@ transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, } // namespace detail -template +template size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier) + KeyBucketType const& frontier) { static_assert(!GraphViewType::is_storage_transposed, "GraphViewType should support the push model."); using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; - using key_t = typename VertexFrontierBucketType::key_type; + using key_t = typename KeyBucketType::key_type; size_t ret{0}; @@ -436,11 +435,11 @@ size_t 
compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, * outputs by (tagged-)source ID. * * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). + * assumed to be tagged if KeyBucketType::key_type is a tuple of a vertex type and a tag + * type (KeyBucketType::key_type is identical to a vertex type otherwise). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the + * @tparam KeyBucketType Type of the vertex frontier bucket class which abstracts the * current (tagged-)vertex frontier. * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. @@ -450,7 +449,7 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object for the current vertex frontier. + * @param frontier KeyBucketType class object for the current vertex frontier. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -484,7 +483,7 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. */ template std::conditional_t< !std::is_same_v, - std::tuple( + std::tuple( 0, rmm::cuda_stream_view{})), decltype(detail::allocate_optional_dataframe_buffer( 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& frontier, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -524,11 +522,11 @@ transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, * outputs by (tagged-)destination ID. * * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if VertexFrontierBucketType::key_type is a tuple of a vertex type and a tag - * type (VertexFrontierBucketType::key_type is identical to a vertex type otherwise). + * assumed to be tagged if KeyBucketType::key_type is a tuple of a vertex type and a tag + * type (KeyBucketType::key_type is identical to a vertex type otherwise). * * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam VertexFrontierBucketType Type of the vertex frontier bucket class which abstracts the + * @tparam KeyBucketType Type of the vertex frontier bucket class which abstracts the * current (tagged-)vertex frontier. 
* @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. @@ -538,7 +536,7 @@ transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param frontier VertexFrontierBucketType class object for the current vertex frontier. + * @param frontier KeyBucketType class object for the current vertex frontier. * @param edge_src_value_input Wrapper used to access source input property values (for the edge * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() @@ -572,7 +570,7 @@ transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. */ template std::conditional_t< !std::is_same_v, - std::tuple( + std::tuple( 0, rmm::cuda_stream_view{})), decltype(detail::allocate_optional_dataframe_buffer( 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer( - 0, rmm::cuda_stream_view{}))> + decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexFrontierBucketType const& frontier, + KeyBucketType const& frontier, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, From df463cec1a1a8e10d1995b8b2197219b79b945f8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 24 Jul 2024 23:55:38 -0700 Subject: [PATCH 013/126] update per_v_transform_reduce_incoming|outgoing_e to support reduce_op::any --- ...v_transform_reduce_incoming_outgoing_e.cuh | 1085 ++++++++++++++--- cpp/src/prims/pred_op.cuh | 28 + 2 files changed, 927 insertions(+), 186 deletions(-) create mode 100644 cpp/src/prims/pred_op.cuh diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 027ef1f662d..ff6db8ef45c 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -18,6 +18,7 @@ #include "detail/graph_partition_utils.cuh" #include "prims/detail/prim_functors.cuh" #include "prims/fill_edge_src_dst_property.cuh" +#include "prims/pred_op.cuh" #include "prims/property_op_utils.cuh" #include "prims/reduce_op.cuh" @@ -41,6 +42,7 @@ #include #include +#include #include #include #include @@ -70,18 +72,19 @@ template struct transform_and_atomic_reduce_t { edge_partition_device_view_t const& edge_partition{}; - result_t identity_element{}; vertex_t const* indices{nullptr}; TransformOp const& transform_op{}; + PredOp const& pred_op{}; ResultValueOutputIteratorOrWrapper& result_value_output{}; __device__ void operator()(edge_t i) const { - auto e_op_result = transform_op(i); - if (e_op_result != identity_element) { + if (pred_op(i)) { + auto e_op_result = transform_op(i); auto minor = indices[i]; auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); if constexpr (multi_gpu) { @@ -100,6 +103,7 @@ template __device__ void update_result_value_output( 
edge_partition_device_view_t const& edge_partition, @@ -108,31 +112,61 @@ __device__ void update_result_value_output( TransformOp const& transform_op, result_t init, ReduceOp const& reduce_op, + PredOp const& pred_op, size_t output_idx /* relevent only when update_major === true */, - result_t identity_element, ResultValueOutputIteratorOrWrapper& result_value_output) { if constexpr (update_major) { - *(result_value_output + output_idx) = - thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); + result_t val{}; + if constexpr (std::is_same_v>) { + if constexpr (std::is_same_v>) { // init is selected only when no + // edges return a valid value + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + auto tmp = transform_op(i); + val = tmp; + break; + } + } else { + val = thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + reduce_op); + } + } else { + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + if (pred_op(i)) { + auto tmp = transform_op(i); + if constexpr (std::is_same_v>) { // init is selected only when + // no edges return a valid + // value + val = tmp; + break; + } else { + val = reduce_op(val, tmp); + } + } + } + } + *(result_value_output + output_idx) = val; } else { - thrust::for_each( - thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_and_atomic_reduce_t{ - edge_partition, identity_element, indices, transform_op, result_value_output}); + thrust::for_each(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_and_atomic_reduce_t{ + edge_partition, indices, transform_op, pred_op, result_value_output}); } } @@ -160,7 +194,6 @@ __global__ static void per_v_transform_reduce_e_hypersparse( ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -205,24 +238,18 @@ __global__ static void per_v_transform_reduce_e_hypersparse( edge_offset}; if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - } else { - return identity_element; - } - }; - - update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - major - *(edge_partition).major_hypersparse_first(), - identity_element, - result_value_output); + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, edge_offset] __device__(edge_t i) { + return (*edge_partition_e_mask).get(edge_offset + i); + }, + major - *(edge_partition).major_hypersparse_first(), + result_value_output); } else { update_result_value_output(edge_partition, indices, @@ -230,8 +257,8 @@ __global__ static void per_v_transform_reduce_e_hypersparse( call_e_op, init, reduce_op, + pred_op::const_true{}, major - *(edge_partition).major_hypersparse_first(), - identity_element, result_value_output); } idx += gridDim.x * blockDim.x; @@ -264,7 +291,6 @@ __global__ static void 
per_v_transform_reduce_e_low_degree( ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -304,24 +330,18 @@ __global__ static void per_v_transform_reduce_e_low_degree( edge_offset}; if (edge_partition_e_mask) { - auto transform_op = - [&edge_partition_e_mask, &call_e_op, identity_element, edge_offset] __device__(auto i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_e_op(i); - } else { - return identity_element; - } - }; - - update_result_value_output(edge_partition, - indices, - local_degree, - transform_op, - init, - reduce_op, - idx, - identity_element, - result_value_output); + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, edge_offset] __device__(edge_t i) { + return (*edge_partition_e_mask).get(edge_offset + i); + }, + idx, + result_value_output); } else { update_result_value_output(edge_partition, indices, @@ -329,8 +349,8 @@ __global__ static void per_v_transform_reduce_e_low_degree( call_e_op, init, reduce_op, + pred_op::const_true{}, idx, - identity_element, result_value_output); } idx += gridDim.x * blockDim.x; @@ -363,7 +383,9 @@ __global__ static void per_v_transform_reduce_e_mid_degree( ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true && !std::is_same_v> */ + , ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -381,10 +403,12 @@ __global__ static void per_v_transform_reduce_e_mid_degree( static_cast(major_range_first - edge_partition.major_range_first()); auto idx = static_cast(tid / raft::warp_size()); - using WarpReduce = cub::WarpReduce; - [[maybe_unused]] __shared__ typename WarpReduce::TempStorage - temp_storage[per_v_transform_reduce_e_kernel_block_size / - raft::warp_size()]; // relevant only if update_major == true + using WarpReduce = cub::WarpReduce< + std::conditional_t>, int32_t, e_op_result_t>>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage[update_major ? (per_v_transform_reduce_e_kernel_block_size / raft::warp_size()) + : int32_t{1} /* dummy */]; while (idx < static_cast(major_range_last - major_range_first)) { auto major_offset = static_cast(major_start_offset + idx); @@ -409,11 +433,67 @@ __global__ static void per_v_transform_reduce_e_mid_degree( indices, edge_offset}; - [[maybe_unused]] auto reduced_e_op_result = - lane_id == 0 ? init : identity_element; // relevant only if update_major == true + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_lane_id{}; + if constexpr (update_major) { reduced_e_op_result = (lane_id == 0) ? 
init : identity_element; } + if constexpr (update_major && std::is_same_v>) { + first_valid_lane_id = raft::warp_size(); + } + if (edge_partition_e_mask) { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if (i < static_cast(local_degree) && + (*edge_partition_e_mask).get(edge_offset + i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { auto e_op_result = call_e_op(i); if constexpr (update_major) { reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); @@ -427,26 +507,18 @@ __global__ static void per_v_transform_reduce_e_mid_degree( } } } - } else { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } } if constexpr (update_major) { - reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(reduced_e_op_result, reduce_op); - if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } + if constexpr (std::is_same_v>) { + if (lane_id == ((first_valid_lane_id == raft::warp_size()) ? 0 : first_valid_lane_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(reduced_e_op_result, reduce_op); + if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } } idx += gridDim.x * (blockDim.x / raft::warp_size()); @@ -479,7 +551,9 @@ __global__ static void per_v_transform_reduce_e_high_degree( ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true && !std::is_same_v> */ + , ReduceOp reduce_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< @@ -494,9 +568,17 @@ __global__ static void per_v_transform_reduce_e_high_degree( static_cast(major_range_first - edge_partition.major_range_first()); auto idx = static_cast(blockIdx.x); - using BlockReduce = cub::BlockReduce; + using BlockReduce = cub::BlockReduce< + std::conditional_t>, int32_t, e_op_result_t>, + per_v_transform_reduce_e_kernel_block_size>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage; [[maybe_unused]] __shared__ - typename BlockReduce::TempStorage temp_storage; // relevant only if update_major == true + std::conditional_t>, + int32_t, + std::byte /* dummy */> + output_thread_id; while (idx < static_cast(major_range_last - major_range_first)) { auto major_offset = static_cast(major_start_offset + idx); @@ -521,11 +603,78 @@ __global__ static void per_v_transform_reduce_e_high_degree( indices, edge_offset}; - [[maybe_unused]] auto reduced_e_op_result = - threadIdx.x == 0 ? init : identity_element; // relevant only if update_major == true + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_thread_id{}; + if constexpr (update_major) { + reduced_e_op_result = threadIdx.x == 0 ? 
init : identity_element; + } + if constexpr (update_major && std::is_same_v>) { + first_valid_thread_id = per_v_transform_reduce_e_kernel_block_size; + } + if (edge_partition_e_mask) { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (per_v_transform_reduce_e_kernel_block_size - 1)) / + per_v_transform_reduce_e_kernel_block_size) * + per_v_transform_reduce_e_kernel_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if (i < static_cast(local_degree) && + ((*edge_partition_e_mask).get_(edge_offset + i))) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result ? threadIdx.x : per_v_transform_reduce_e_kernel_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (per_v_transform_reduce_e_kernel_block_size - 1)) / + per_v_transform_reduce_e_kernel_block_size) * + per_v_transform_reduce_e_kernel_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result ? 
threadIdx.x : per_v_transform_reduce_e_kernel_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + if (threadIdx.x == output_thread_id) { reduced_e_op_result = *e_op_result; } + if (output_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { auto e_op_result = call_e_op(i); if constexpr (update_major) { reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); @@ -539,31 +688,222 @@ __global__ static void per_v_transform_reduce_e_high_degree( } } } - } else { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } } if constexpr (update_major) { - reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); - if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } + if constexpr (std::is_same_v>) { + if (threadIdx.x == ((first_valid_thread_id == per_v_transform_reduce_e_kernel_block_size) + ? 0 + : first_valid_thread_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); + if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } } idx += gridDim.x; } } +template +__host__ __device__ int rank_to_priority( + int rank, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffic */) +{ + if (rank == root) { // no need for communication (priority 0) + return int{0}; + } else if (rank / subgroup_size == + root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in + // [1, subgroup_size) + int modulo = subgroup_size - 1; + return int{1} + static_cast((static_cast(rank) + offset) % modulo); + } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) + int modulo = comm_size - subgroup_size; + return subgroup_size + static_cast((static_cast(rank) + offset) % modulo); + } +} + +template +__host__ __device__ int priority_to_rank( + int priority, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffict */) +{ + if (priority == int{0}) { + return root; + } else if (priority < subgroup_size) { + int modulo = subgroup_size - int{1}; + return static_cast( + (static_cast(priority - int{1}) + (modulo - static_cast(offset % modulo))) % + modulo); + } else { + int modulo = comm_size - subgroup_size; + return static_cast((static_cast(priority - subgroup_size) + + (modulo - static_cast(offset % modulo))) % + modulo); + } +} + +template +rmm::device_uvector compute_keep_flags( + raft::comms::comms_t const& comm, + ValueIterator value_first, + ValueIterator value_last, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); 
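As an illustration of the priority ranges produced by the scheme above, here is a minimal host-only sketch (not part of the patch) that replicates the rank_to_priority formula for an assumed 8-rank communicator with subgroups of 4 and root 0: the root maps to priority 0, the remaining ranks of the root's subgroup map into [1, subgroup_size), and all other ranks map into [subgroup_size, comm_size).

#include <cstdio>

// Host-only copy of the formula above; all parameter values below are arbitrary examples.
int rank_to_priority_host(int rank, int root, int subgroup_size, int comm_size, long long offset)
{
  if (rank == root) {
    return 0;  // no communication needed
  } else if (rank / subgroup_size == root / subgroup_size) {
    int modulo = subgroup_size - 1;  // intra-subgroup: priorities in [1, subgroup_size)
    return 1 + static_cast<int>((static_cast<unsigned long long>(rank) + offset) % modulo);
  } else {
    int modulo = comm_size - subgroup_size;  // inter-subgroup: priorities in [subgroup_size, comm_size)
    return subgroup_size + static_cast<int>((static_cast<unsigned long long>(rank) + offset) % modulo);
  }
}

int main()
{
  int const comm_size = 8, subgroup_size = 4, root = 0;
  for (int rank = 0; rank < comm_size; ++rank) {
    std::printf("rank %d -> priority %d\n",
                rank,
                rank_to_priority_host(rank, root, subgroup_size, comm_size, /* offset = */ 0));
  }
  return 0;
}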
+ auto const comm_size = comm.get_size(); + + // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are + // more than one, the GPU with (comm_rank == root) has the highest priority, the GPUs in the same + // DGX node should be the next) + + rmm::device_uvector priorities(thrust::distance(value_first, value_last), + stream_view); + thrust::tabulate( + rmm::exec_policy(stream_view), + priorities.begin(), + priorities.end(), + [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { + auto val = *(value_first + offset); + return (val != init) + ? rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)) + : std::numeric_limits::max(); // lowest priority + }); + device_allreduce(comm, + priorities.data(), + priorities.data(), + priorities.size(), + raft::comms::op_t::MIN, + root, + stream_view); + + rmm::device_uvector keep_flags(priorities.size()); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::transform(rmm::exec_policy(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + priorities.size(), + keep_flags.begin(), + [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = + priority_to_rank(priority, root, subgroup_size, comm_size, offset); + return (rank == comm_rank); + }); + + return keep_flags; +} + +template +std::tuple, + dataframe_buffer_type_t::value_type>> +compute_offset_value_pairs(raft::comms::comms_t const& comm, + ValueIterator value_first, + ValueIterator value_last, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector keep_flags(0, stream_view); + if (comm_size <= std::numeric_limits::max()) { // priority == uint8_t + keep_flags = compute_keep_flags( + comm, value_first, value_last, root, subgroup_size, init, stream_view); + } else if (comm_size <= std::numeric_limits::max()) { // priority == uint16_t + keep_flags = compute_keep_flags( + comm, value_first, value_last, root, subgroup_size, init, stream_view); + } else { // priority_t == uint32_t + keep_flags = compute_keep_flags( + comm, value_first, value_last, root, subgroup_size, init, stream_view); + } + + auto copy_size = thrust::count_if( + rmm::exec_policy(stream_view), keep_flags.begin(), keep_flags.end(), thrust::identity{}); + + rmm::device_uvector offsets(copy_size, stream_view); + auto values = allocate_dataframe_buffer(copy_size, stream_view); + auto offset_value_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), value_first); + thrust::copy_if(rmm::exec_policy(stream_view), + offset_value_pair_first, + offset_value_pair_first + keep_flags.size(), + keep_flags.begin(), + thrust::make_zip_iterator(offsets.begin(), dataframe_buffer_begin(values)), + thrust::identity{}); + + return std::make_tuple(std::move(offsets), std::move(values)); +} + +template +void gather_offset_value_pairs_and_update_vertex_value_output( + raft::comms::comms_t const& comm, + rmm::device_uvector&& offsets, + dataframe_buffer_type_t&& values, + VertexValueOutputIterator vertex_value_output_first, + int root, + 
rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + + auto rx_sizes = host_scalar_gather(comm, offsets.size(), root, stream_view); + std::vector rx_displs{}; + rmm::device_uvector rx_offsets(0, stream_view); + if (comm_rank == root) { + rx_displs.resize(rx_sizes.size()); + std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); + rx_offsets.resize(rx_displs.back() + rx_sizes.back(), stream_view); + } + + device_gatherv(comm, + offsets.begin(), + rx_offsets.begin(), + offsets.size(), + rx_sizes, + rx_displs, + root, + stream_view); + offsets.resize(0, stream_view); + offsets.shrink_to_fit(stream_view); + + auto rx_values = allocate_dataframe_buffer(rx_offsets.size(), stream_view); + device_gatherv(comm, + get_dataframe_buffer_begin(values), + get_dataframe_buffer_begin(rx_values), + values.size(), + rx_sizes, + rx_displs, + root, + stream_view); + resize_dataframe_buffer(values, 0, stream_view); + shrink_to_fit_dataframe_buffer(values, stream_view); + + if (comm_rank == root) { + thrust::scatter(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(rx_values), + get_dataframe_buffer_end(rx_values), + rx_offsets.begin(), + vertex_value_output_first); + } +} + template && - reduce_op::has_identity_element_v); // current restriction, to support - // general reduction, we may need to - // take a less efficient code path - constexpr auto update_major = (incoming == GraphViewType::is_storage_transposed); + + static_assert( + ReduceOp::pure_function && + ((reduce_op::has_compatible_raft_comms_op_v && + reduce_op::has_identity_element_v) || + (update_major && + std::is_same_v>))); // current restriction, to support general + // reduction, we may need to take a less + // efficient code path + [[maybe_unused]] constexpr auto max_segments = detail::num_sparse_segments_per_vertex_partition + size_t{1}; using vertex_t = typename GraphViewType::vertex_type; @@ -619,6 +964,21 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + [[maybe_unused]] std::conditional_t>, + int, + std::byte /* dummy */> + subgroup_size{}; + if constexpr (update_major && std::is_same_v>) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? 
std::max(num_gpus_per_node / minor_comm_size, int{1}) + : std::min(minor_comm_size, num_gpus_per_node); + } + using minor_tmp_buffer_type = std::conditional_t, edge_dst_property_t>; @@ -652,8 +1012,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { auto minor_init = init; auto view = minor_tmp_buffer->view(); - if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may not - // store values for the entire minor range + if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may + // not store values for the entire minor range minor_init = ReduceOp::identity_element; } else { auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); @@ -711,6 +1071,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } if (num_streams >= max_segments) { + assert((num_streams % max_segments) == 0); stream_pool_indices = std::vector(num_streams); std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); handle.sync_stream(); @@ -718,8 +1079,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - std::vector(0, rmm::cuda_stream_view{}))> - major_tmp_buffers{}; + std::vector> major_tmp_buffers{}; if constexpr (GraphViewType::is_multi_gpu && update_major) { std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), size_t{0}); @@ -758,6 +1118,26 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); } + std::conditional_t>, + std::vector>, + std::byte /* dummy */> + offset_vectors{}; + std::conditional_t>, + std::vector>, + std::byte /* dummy */> + value_vectors{}; + if constexpr (update_major && std::is_same_v>) { + auto capacity = graph_view.number_of_local_edge_partitions() * + (graph_view.local_edge_partition_segment_offsets(0) ? max_segments : 1); + offset_vectors.reserve(capacity); + value_vectors.reserve(capacity); + + for (size_t i = 0; i < capacity; ++i) { + offset_vectors.emplace_back(0, handle.get_stream()); + value_vectors.emplace_back(0, handle.get_stream()); + } + } + if (stream_pool_indices) { handle.sync_stream(); } auto edge_mask_view = graph_view.edge_mask_view(); @@ -778,7 +1158,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); - major_init = (static_cast(i) == minor_comm_rank) ? init : ReduceOp::identity_element; + if constexpr (std::is_same_v>) { + major_init = init; // init is selected only when no edges return a valid value + } else { + major_init = (static_cast(i) == minor_comm_rank) ? init : ReduceOp::identity_element; + } } else { major_init = init; } @@ -823,7 +1207,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // FIXME: we may further improve performance by 1) individually tuning block sizes for // different segments; and 2) adding one more segment for very high degree vertices and // running segmented reduction - if (edge_partition.dcs_nzd_vertex_count()) { + if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { auto exec_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) @@ -853,7 +1237,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, segment_output_buffer, e_op, major_init, - ReduceOp::identity_element, reduce_op); } } @@ -879,7 +1262,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, segment_output_buffer, e_op, major_init, - ReduceOp::identity_element, reduce_op); } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { @@ -947,81 +1329,222 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, output_buffer, e_op, major_init, - ReduceOp::identity_element, reduce_op); } } if constexpr (GraphViewType::is_multi_gpu && update_major) { - auto& comm = handle.get_comms(); auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); if (segment_offsets && stream_pool_indices) { - if (edge_partition.dcs_nzd_vertex_count()) { - device_reduce( - minor_comm, - major_buffer_first + (*segment_offsets)[3], - vertex_value_output_first + (*segment_offsets)[3], - (*segment_offsets)[4] - (*segment_offsets)[3], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size())); + if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[3]; + auto segment_size = (*segment_offsets)[4] - (*segment_offsets)[3]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = compute_offset_value_pairs( + minor_comm, + major_buffer_first + segment_offset, + major_buffer_first + (segment_offset + segment_size), + static_cast(i), + subgroup_size, + init, + segment_stream); + offset_vectors[i * max_segments + 3] = std::move(offsets); + value_vectors[i * max_segments + 3] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first + segment_offset, + vertex_value_output_first + segment_offset, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + segment_stream); + } } if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - device_reduce(minor_comm, - major_buffer_first + (*segment_offsets)[2], - vertex_value_output_first + (*segment_offsets)[2], - (*segment_offsets)[3] - (*segment_offsets)[2], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size())); + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % + (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[2]; + auto segment_size = (*segment_offsets)[3] - (*segment_offsets)[2]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = compute_offset_value_pairs( + minor_comm, + major_buffer_first + segment_offset, + major_buffer_first + (segment_offset + segment_size), + static_cast(i), + subgroup_size, + init, + segment_stream); + offset_vectors[i * max_segments + 2] = std::move(offsets); + value_vectors[i * max_segments + 2] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first + segment_offset, + vertex_value_output_first + segment_offset, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + segment_stream); + } } if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - device_reduce(minor_comm, - 
major_buffer_first + (*segment_offsets)[1], - vertex_value_output_first + (*segment_offsets)[1], - (*segment_offsets)[2] - (*segment_offsets)[1], - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size())); + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % + (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[1]; + auto segment_size = (*segment_offsets)[2] - (*segment_offsets)[1]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = compute_offset_value_pairs( + minor_comm, + major_buffer_first + segment_offset, + major_buffer_first + (segment_offset + segment_size), + static_cast(i), + subgroup_size, + init, + segment_stream); + offset_vectors[i * max_segments + 1] = std::move(offsets); + value_vectors[i * max_segments + 1] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first + segment_offset, + vertex_value_output_first + segment_offset, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + segment_stream); + } } if ((*segment_offsets)[1] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % + (*stream_pool_indices).size()); + auto segment_size = (*segment_offsets)[1]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = + compute_offset_value_pairs(minor_comm, + major_buffer_first, + major_buffer_first + segment_size, + static_cast(i), + subgroup_size, + init, + segment_stream); + offset_vectors[i * max_segments] = std::move(offsets); + value_vectors[i * max_segments] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first, + vertex_value_output_first, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + segment_stream); + } + } + } else { + size_t reduction_size = static_cast( + segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size()); + if constexpr (std::is_same_v>) { + auto [offsets, values] = + compute_offset_value_pairs(minor_comm, + major_buffer_first, + major_buffer_first + reduction_size, + static_cast(i), + subgroup_size, + init, + handle.get_stream()); + offset_vectors[i] = std::move(offsets); + value_vectors[i] = std::move(values); + } else { device_reduce(minor_comm, major_buffer_first, vertex_value_output_first, - (*segment_offsets)[1], + reduction_size, ReduceOp::compatible_raft_comms_op, static_cast(i), - handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size())); + handle.get_stream()); } - } else { - size_t reduction_size = static_cast( - segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size()); - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - reduction_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream()); } } if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { handle.sync_stream_pool( - *stream_pool_indices); // to prevent buffer over-write (this can happen as *segment_offsets - // do not necessarily coincide in different edge partitions). + *stream_pool_indices); // to prevent buffer over-write (this can happen as + // *segment_offsets do not necessarily coincide in different edge + // partitions). 
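The reduce_op::any communication path set up above can be pictured with a small host-side analogue (an illustrative sketch only; std::vector stands in for device buffers and the gather step is simulated by concatenation): each rank keeps only the (offset, value) pairs it won via the priority selection, the root collects them, and the values are scattered into the output at the recorded offsets.

#include <cstddef>
#include <cstdio>
#include <utility>
#include <vector>

int main()
{
  // Assumed example: a vertex partition of size 8 whose reduced values are owned by the root,
  // and the (offset, value) pairs each of three ranks kept after the priority-based selection.
  std::vector<std::vector<std::pair<int, float>>> per_rank_pairs = {
    {{0, 1.0f}, {3, 4.0f}},  // kept on rank 0 (the root)
    {{1, 2.5f}, {5, 7.0f}},  // kept on rank 1
    {{6, 9.0f}}};            // kept on rank 2

  std::vector<float> output(8, -1.0f);  // -1 plays the role of init (vertices with no valid edge value)

  // "Gather" to the root (concatenation suffices since each offset is claimed by exactly one rank),
  // then scatter every gathered value to its recorded offset, mirroring the thrust::scatter call.
  for (auto const& pairs : per_rank_pairs) {
    for (auto const& [offset, value] : pairs) { output[offset] = value; }
  }

  for (std::size_t i = 0; i < output.size(); ++i) {
    std::printf("vertex offset %zu -> %g\n", i, output[i]);
  }
  return 0;
}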
} } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if constexpr (update_major && std::is_same_v>) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + + if (segment_offsets && stream_pool_indices) { + if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[3]; + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i * max_segments + 3]), + std::move(value_vectors[i * max_segments + 3]), + vertex_value_output_first + segment_offset, + static_cast(i), + segment_stream); + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % + (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[2]; + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i * max_segments + 2]), + std::move(value_vectors[i * max_segments + 2]), + vertex_value_output_first + segment_offset, + static_cast(i), + segment_stream); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % + (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[1]; + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i * max_segments + 1]), + std::move(value_vectors[i * max_segments + 1]), + vertex_value_output_first + segment_offset, + static_cast(i), + segment_stream); + } + if ((*segment_offsets)[1] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % + (*stream_pool_indices).size()); + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i * max_segments]), + std::move(value_vectors[i * max_segments]), + vertex_value_output_first, + static_cast(i), + segment_stream); + } + } else { + gather_offset_value_pairs_and_update_vertex_value_output(minor_comm, + std::move(offset_vectors[i]), + std::move(value_vectors[i]), + vertex_value_output_first, + static_cast(i), + handle.get_stream()); + } + } + } + if constexpr (GraphViewType::is_multi_gpu && !update_major) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -1145,6 +1668,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, * @param e_op Quinary operator takes edge source, edge destination, property values for the source, * destination, and edge and returns a value to be reduced. * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. * There are pre-defined reduction operators in src/prims/reduce_op.cuh. 
It is * recommended to use the pre-defined reduction operators whenever possible as the current (and @@ -1180,15 +1705,108 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, // currently, nothing to do } - detail::per_v_transform_reduce_e(handle, - graph_view, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the incoming + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 incoming edges. 
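A tiny host-side analogue makes the statement above concrete (an illustrative sketch; std::optional stands in for thrust::optional): with an any-style reduction, the first valid per-edge result wins and init is returned only when no edge produces a valid value.

#include <cstdio>
#include <optional>
#include <vector>

// First valid per-edge result, or init if every edge is masked out / returns nullopt.
float reduce_any(std::vector<std::optional<float>> const& edge_results, float init)
{
  for (auto const& r : edge_results) {
    if (r) { return *r; }  // "any": a single valid value suffices, remaining edges are skipped
  }
  return init;  // reached only when there is no valid edge result at all
}

int main()
{
  std::printf("%g\n", reduce_any({std::nullopt, 3.0f, 5.0f}, /* init = */ -1.0f));  // prints 3
  std::printf("%g\n", reduce_any({}, /* init = */ -1.0f));                          // prints -1
  return 0;
}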
+ * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(GraphViewType::is_storage_transposed); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + vertex_value_output_first); } /** @@ -1224,6 +1842,8 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, * @param e_op Quinary operator takes edge source, edge destination, property values for the source, * destination, and edge and returns a value to be reduced. * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is * recommended to use the pre-defined reduction operators whenever possible as the current (and @@ -1259,15 +1879,108 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, // currently, nothing to do } - detail::per_v_transform_reduce_e(handle, - graph_view, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. 
+ * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
+ */ +template +void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + vertex_value_output_first); } } // namespace cugraph diff --git a/cpp/src/prims/pred_op.cuh b/cpp/src/prims/pred_op.cuh new file mode 100644 index 00000000000..a93755346f8 --- /dev/null +++ b/cpp/src/prims/pred_op.cuh @@ -0,0 +1,28 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace cugraph { +namespace pred_op { + +template +struct const_true { + __host__ __device__ constexpr bool operator()(index_t i) const { return true; } +}; + +} // namespace pred_op +} // namespace cugraph From 222148dd40b395aa5020b253a110fc44e15aae0a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 25 Jul 2024 17:10:14 -0700 Subject: [PATCH 014/126] update kernels to take KeyIterator key_first & key_last --- ...v_transform_reduce_incoming_outgoing_e.cuh | 235 +++++++++++------- 1 file changed, 148 insertions(+), 87 deletions(-) diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index ff6db8ef45c..2482f0cc4c5 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -66,6 +66,25 @@ namespace detail { int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +template +struct iterator_value_type_or_default_t; + +template +struct iterator_value_type_or_default_t>> { + using value_type = default_t; // if Iterator is invalid (void*), value_type = default_t +}; + +template +struct iterator_value_type_or_default_t>> { + using value_type = typename thrust::iterator_traits< + Iterator>::value_type; // if iterator is valid, value_type = typename + // thrust::iterator_traits::value_type +}; + template edge_partition, + OptionalKeyIterator key_first, + OptionalKeyIterator key_last, EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, @@ -199,67 +221,91 @@ __global__ static void per_v_transform_reduce_e_hypersparse( static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< ReduceOp>); // atomic_reduce is defined only when // has_compatible_raft_comms_op_t is 
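The void*-as-sentinel trick used by iterator_value_type_or_default_t above is a general pattern; a minimal standalone sketch of the same idea (names are illustrative, not cuGraph's) looks like this:

#include <iterator>
#include <type_traits>
#include <vector>

// Primary template: use the iterator's value_type.
template <typename Iterator, typename default_t>
struct value_type_or_default {
  using type = typename std::iterator_traits<Iterator>::value_type;
};

// Specialization for the void* sentinel ("no iterator provided"): fall back to default_t.
template <typename default_t>
struct value_type_or_default<void*, default_t> {
  using type = default_t;
};

static_assert(std::is_same_v<typename value_type_or_default<void*, int>::type, int>);
static_assert(
  std::is_same_v<typename value_type_or_default<std::vector<float>::iterator, int>::type, float>);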
true + static_assert(update_major || std::is_same_v); using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; auto const tid = threadIdx.x + blockIdx.x * blockDim.x; auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - - edge_partition.major_range_first()); + edge_partition.major_range_first()); // SK auto idx = static_cast(tid); - auto dcs_nzd_vertex_count = *(edge_partition.dcs_nzd_vertex_count()); - - while (idx < static_cast(dcs_nzd_vertex_count)) { - auto major = - *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - auto major_idx = - major_start_offset + idx; // major_offset != major_idx in the hypersparse region - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(major_idx)); + size_t key_count{}; + if constexpr (std::is_same_v) { + key_count = *(edge_partition.dcs_nzd_vertex_count()); + } else { + key_count = static_cast(thrust::distance(key_first, key_last)); + } - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - major, - major_offset, - indices, - edge_offset}; + while (idx < key_count) { + key_t key{}; + vertex_t major{}; + thrust::optional major_idx{}; + if constexpr (std::is_same_v) { + key = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + major = key; + major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region + } else { + key = *(key_first + idx); + major = thrust_tuple_get_or_identity(key); + major_idx = edge_partition.major_idx_from_major_nocheck(major); + } - if (edge_partition_e_mask) { - update_result_value_output( - edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - [&edge_partition_e_mask, edge_offset] __device__(edge_t i) { - return (*edge_partition_e_mask).get(edge_offset + i); - }, - major - *(edge_partition).major_hypersparse_first(), - result_value_output); + size_t output_idx = std::is_same_v + ? 
(major - *(edge_partition).major_hypersparse_first()) + : idx; + if (major_idx) { + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(*major_idx)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, edge_offset] __device__(edge_t i) { + return (*edge_partition_e_mask).get(edge_offset + i); + }, + output_idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + pred_op::const_true{}, + output_idx, + result_value_output); + } } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - pred_op::const_true{}, - major - *(edge_partition).major_hypersparse_first(), - result_value_output); + if constexpr (update_major) { *(result_value_output + output_idx) = init; } } idx += gridDim.x * blockDim.x; } @@ -267,6 +313,7 @@ __global__ static void per_v_transform_reduce_e_hypersparse( template edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, + KeyIterator key_first, + KeyIterator key_last, EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, @@ -299,15 +346,16 @@ __global__ static void per_v_transform_reduce_e_low_degree( using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid); + auto idx = static_cast(tid); - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; @@ -315,7 +363,7 @@ __global__ static void per_v_transform_reduce_e_low_degree( edge_partition.local_edges(static_cast(major_offset)); auto call_e_op = call_e_op_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, + KeyIterator key_first, + KeyIterator key_last, EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, @@ -395,13 +444,12 @@ __global__ static void per_v_transform_reduce_e_mid_degree( using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; using 
e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; auto const tid = threadIdx.x + blockIdx.x * blockDim.x; static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); auto const lane_id = tid % raft::warp_size(); - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); - auto idx = static_cast(tid / raft::warp_size()); + auto idx = static_cast(tid / raft::warp_size()); using WarpReduce = cub::WarpReduce< std::conditional_t>, int32_t, e_op_result_t>>; @@ -410,16 +458,18 @@ __global__ static void per_v_transform_reduce_e_mid_degree( temp_storage[update_major ? (per_v_transform_reduce_e_kernel_block_size / raft::warp_size()) : int32_t{1} /* dummy */]; - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); auto call_e_op = call_e_op_t edge_partition, - typename GraphViewType::vertex_type major_range_first, - typename GraphViewType::vertex_type major_range_last, + KeyIterator key_first, + KeyIterator key_last, EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, @@ -563,9 +614,8 @@ __global__ static void per_v_transform_reduce_e_high_degree( using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; - auto major_start_offset = - static_cast(major_range_first - edge_partition.major_range_first()); auto idx = static_cast(blockIdx.x); using BlockReduce = cub::BlockReduce< @@ -580,16 +630,18 @@ __global__ static void per_v_transform_reduce_e_high_degree( std::byte /* dummy */> output_thread_id; - while (idx < static_cast(major_range_last - major_range_first)) { - auto major_offset = static_cast(major_start_offset + idx); - auto major = edge_partition.major_from_major_offset_nocheck(major_offset); + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); vertex_t const* indices{nullptr}; edge_t edge_offset{}; edge_t local_degree{}; thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); auto call_e_op = call_e_op_t <<>>( edge_partition, + static_cast(nullptr), + static_cast(nullptr), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1253,8 +1307,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, detail::per_v_transform_reduce_e_low_degree <<>>( edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[2], - edge_partition.major_range_first() + (*segment_offsets)[3], + thrust::make_counting_iterator(edge_partition.major_range_first() + + (*segment_offsets)[2]), + 
thrust::make_counting_iterator(edge_partition.major_range_first() + + (*segment_offsets)[3]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1277,8 +1333,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, detail::per_v_transform_reduce_e_mid_degree <<>>( edge_partition, - edge_partition.major_range_first() + (*segment_offsets)[1], - edge_partition.major_range_first() + (*segment_offsets)[2], + thrust::make_counting_iterator(edge_partition.major_range_first() + + (*segment_offsets)[1]), + thrust::make_counting_iterator(edge_partition.major_range_first() + + (*segment_offsets)[2]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1300,8 +1358,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, detail::per_v_transform_reduce_e_high_degree <<>>( edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_first() + (*segment_offsets)[1], + thrust::make_counting_iterator(edge_partition.major_range_first()), + thrust::make_counting_iterator(edge_partition.major_range_first() + + (*segment_offsets)[1]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1320,8 +1379,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, detail::per_v_transform_reduce_e_low_degree <<>>( edge_partition, - edge_partition.major_range_first(), - edge_partition.major_range_last(), + thrust::make_counting_iterator(edge_partition.major_range_first() + + (*segment_offsets)[2]), + thrust::make_counting_iterator(edge_partition.major_range_first() + + (*segment_offsets)[3]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, From effc69c949742008cd9fe68a27ffff23a8d5e264 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 27 Jul 2024 18:28:46 -0700 Subject: [PATCH 015/126] update per_v_transform_reduce_incoming_outgoing_e to support key list --- .../detail/extract_transform_v_frontier_e.cuh | 214 +++------ ...v_transform_reduce_incoming_outgoing_e.cuh | 411 ++++++++++++++---- cpp/src/prims/vertex_frontier.cuh | 168 +++++++ 3 files changed, 554 insertions(+), 239 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 590747ada18..79203af08c3 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -18,6 +18,7 @@ #include "prims/detail/optional_dataframe_buffer.hpp" #include "prims/detail/prim_functors.cuh" #include "prims/property_op_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -658,6 +659,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust::optional, thrust::optional>>>); + constexpr bool use_bitmap = GraphViewType::is_multi_gpu && std::is_same_v && + KeyBucketType::is_sorted_unique; + if (do_expensive_check) { auto frontier_vertex_first = thrust_tuple_get_or_identity(frontier.begin()); @@ -705,21 +709,11 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); if (segment_offsets) { - auto v_threshold = - graph_view.local_vertex_partition_range_first() + *((*segment_offsets).rbegin() + 1); - if constexpr (std::is_same_v) { - frontier_key_last = thrust::lower_bound( - handle.get_thrust_policy(), frontier_key_first, frontier_key_last, v_threshold); - } else { - key_t 
key_threshold{}; - thrust::get<0>(key_threshold) = v_threshold; - frontier_key_last = thrust::lower_bound( - handle.get_thrust_policy(), - frontier_key_first, - frontier_key_last, - key_threshold, - [] __device__(auto lhs, auto rhs) { return thrust::get<0>(lhs) < thrust::get<0>(rhs); }); - } + frontier_key_last = compute_key_lower_bound( + frontier_key_first, + frontier_key_last, + graph_view.local_vertex_partition_range_first() + *((*segment_offsets).rbegin() + 1), + handle.get_stream()); } } @@ -737,84 +731,25 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, // update frontier bitmap (used to reduce broadcast bandwidth size) - std::conditional_t, - std::optional>, - std::byte /* dummy */> - frontier_bitmap{}; - std::conditional_t, - std::vector, - std::byte /* dummy */> - use_bitmap_flags{}; - if constexpr (GraphViewType::is_multi_gpu && std::is_same_v) { + std:: + conditional_t>, std::byte /* dummy */> + frontier_bitmap{}; + std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; + if constexpr (use_bitmap) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - frontier_bitmap = std::nullopt; - if (minor_comm_size > 1) { - auto const minor_comm_rank = minor_comm.get_rank(); - - auto threshold_ratio = 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); - use_bitmap_flags = std::vector(minor_comm_size, false); - - size_t this_bool_size{0}; - if constexpr (KeyBucketType::is_sorted_unique) { - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) - : edge_partition.major_range_size(); - if (i == static_cast(minor_comm_rank)) { this_bool_size = bool_size; } - if (local_frontier_sizes[i] > static_cast(bool_size * threshold_ratio)) { - use_bitmap_flags[i] = true; - } - } - } else { - auto segment_offsets = - graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); - auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) - : graph_view.local_vertex_partition_range_size(); - this_bool_size = bool_size; - bool use_bitmap_flag{false}; - if (local_frontier_sizes[minor_comm_rank] > - static_cast(bool_size * threshold_ratio)) { - auto num_uniques = static_cast(thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(local_frontier_sizes[minor_comm_rank]), - cugraph::detail::is_first_in_run_t{frontier_key_first})); - if (num_uniques == local_frontier_sizes[minor_comm_rank]) { use_bitmap_flag = true; } - } - auto tmp_flags = host_scalar_allgather( - minor_comm, use_bitmap_flag ? 
uint8_t{1} : uint8_t{0}, handle.get_stream()); - std::transform(tmp_flags.begin(), - tmp_flags.end(), - use_bitmap_flags.begin(), - [] __device__(uint8_t flag) { return flag == 1; }); - } - - if (use_bitmap_flags[minor_comm_rank]) { - frontier_bitmap = - rmm::device_uvector(packed_bool_size(this_bool_size), handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), - (*frontier_bitmap).begin(), - (*frontier_bitmap).end(), - packed_bool_empty_mask()); - thrust::for_each( - handle.get_thrust_policy(), - frontier_key_first, - frontier_key_last, - [bitmap = - raft::device_span((*frontier_bitmap).data(), (*frontier_bitmap).size()), - v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { - auto v_offset = v - v_first; - cuda::atomic_ref word( - bitmap[packed_bool_offset(v_offset)]); - word.fetch_or(cugraph::packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); - }); - } - } + auto const minor_comm_rank = minor_comm.get_rank(); + auto segment_offsets = + graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); + size_t bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(); + + std::tie(frontier_bitmap, use_bitmap_flags) = + compute_vertex_list_bitmap_info(minor_comm, + frontier_key_first, + frontier_key_last, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_first() + bool_size, + handle.get_stream()); } // 2. fill the buffers @@ -845,13 +780,12 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, *edge_mask_view, i) : thrust::nullopt; - auto edge_partition_frontier_key_buffer = - allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - vertex_t edge_partition_frontier_size = static_cast(local_frontier_sizes[i]); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); auto edge_partition_frontier_key_first = frontier_key_first; auto edge_partition_frontier_key_last = frontier_key_last; + auto edge_partition_frontier_key_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); @@ -859,48 +793,32 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); resize_dataframe_buffer( - edge_partition_frontier_key_buffer, edge_partition_frontier_size, handle.get_stream()); + edge_partition_frontier_key_buffer, local_frontier_sizes[i], handle.get_stream()); - if constexpr (std::is_same_v) { + if constexpr (use_bitmap) { + std::variant, decltype(frontier_key_first)> v_list{}; if (use_bitmap_flags[i]) { - auto bool_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) - : edge_partition.major_range_size(); - rmm::device_uvector edge_partition_bitmap(packed_bool_size(bool_size), - handle.get_stream()); - device_bcast(minor_comm, - (*frontier_bitmap).data(), - edge_partition_bitmap.data(), - edge_partition_bitmap.size(), - static_cast(i), - handle.get_stream()); - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(edge_partition.major_range_first()), - thrust::make_counting_iterator(edge_partition.major_range_first()) + bool_size, - thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - cuda::proclaim_return_type( - [bitmap = raft::device_span( - edge_partition_bitmap.data(), - edge_partition_bitmap.size())] __device__(vertex_t v_offset) { - return ((bitmap[packed_bool_offset(v_offset)] & packed_bool_mask(v_offset)) != - packed_bool_empty_mask()); - })), - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - thrust::identity{}); + v_list = raft::device_span((*frontier_bitmap).data(), + (*frontier_bitmap).size()); } else { - device_bcast(minor_comm, - frontier_key_first, - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - edge_partition_frontier_size, - static_cast(i), - handle.get_stream()); + v_list = frontier_key_first; } + auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : edge_partition.major_range_size(); + device_bcast_vertex_list( + minor_comm, + v_list, + get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), + edge_partition.major_range_first(), + edge_partition.major_range_first() + bool_size, + static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + static_cast(i), + handle.get_stream()); } else { device_bcast(minor_comm, frontier_key_first, get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - edge_partition_frontier_size, + local_frontier_sizes[i], static_cast(i), handle.get_stream()); } @@ -919,7 +837,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust_tuple_get_or_identity( edge_partition_frontier_key_last); - auto max_pushes = max_one_e_per_frontier_key ? edge_partition_frontier_size + auto max_pushes = max_one_e_per_frontier_key ? local_frontier_sizes[i] : edge_partition.compute_number_of_edges( edge_partition_frontier_major_first, edge_partition_frontier_major_last, @@ -949,28 +867,14 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto subtime1 = std::chrono::steady_clock::now(); #endif if (segment_offsets) { - static_assert(num_sparse_segments_per_vertex_partition == 3); - std::vector h_thresholds(num_sparse_segments_per_vertex_partition + - (graph_view.use_dcs() ? 
1 : 0) - 1); - h_thresholds[0] = edge_partition.major_range_first() + (*segment_offsets)[1]; - h_thresholds[1] = edge_partition.major_range_first() + (*segment_offsets)[2]; - if (graph_view.use_dcs()) { - h_thresholds[2] = edge_partition.major_range_first() + (*segment_offsets)[3]; - } - rmm::device_uvector d_thresholds(h_thresholds.size(), handle.get_stream()); - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - rmm::device_uvector d_offsets(d_thresholds.size(), handle.get_stream()); - thrust::lower_bound(handle.get_thrust_policy(), - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - d_thresholds.begin(), - d_thresholds.end(), - d_offsets.begin()); - std::vector h_offsets(d_offsets.size()); - raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); - RAFT_CUDA_TRY(cudaStreamSynchronize(handle.get_stream())); - h_offsets.push_back(edge_partition_frontier_size); + auto h_offsets = compute_key_segment_offsets( + edge_partition_frontier_major_first, + edge_partition_frontier_major_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + edge_partition.major_range_first(), + graph_view.use_dcs(), + handle.get_stream()); + // FIXME: we may further improve performance by 1) concurrently running kernels on different // segments; 2) individually tuning block sizes for different segments; and 3) adding one // more segment for very high degree vertices and running segmented reduction @@ -1051,8 +955,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, e_op); } } else { - if (edge_partition_frontier_size > 0) { - raft::grid_1d_thread_t update_grid(edge_partition_frontier_size, + if (local_frontier_sizes[i] > 0) { + raft::grid_1d_thread_t update_grid(local_frontier_sizes[i], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index 2482f0cc4c5..42f42cdaaa3 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -16,11 +16,13 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "prims/detail/optional_dataframe_buffer.hpp" #include "prims/detail/prim_functors.cuh" #include "prims/fill_edge_src_dst_property.cuh" #include "prims/pred_op.cuh" #include "prims/property_op_utils.cuh" #include "prims/reduce_op.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -218,45 +220,45 @@ __global__ static void per_v_transform_reduce_e_hypersparse( T init /* relevant only if update_major == true */, ReduceOp reduce_op) { + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< ReduceOp>); // atomic_reduce is defined only when // has_compatible_raft_comms_op_t is true - static_assert(update_major || std::is_same_v); + static_assert(update_major || !use_input_key); using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; using key_t = typename iterator_value_type_or_default_t::value_type; - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - - edge_partition.major_range_first()); // SK - auto idx = static_cast(tid); + auto const tid = threadIdx.x + blockIdx.x * 
blockDim.x; + auto idx = static_cast(tid); size_t key_count{}; - if constexpr (std::is_same_v) { - key_count = *(edge_partition.dcs_nzd_vertex_count()); - } else { + if constexpr (use_input_key) { key_count = static_cast(thrust::distance(key_first, key_last)); + } else { + key_count = *(edge_partition.dcs_nzd_vertex_count()); } while (idx < key_count) { key_t key{}; vertex_t major{}; thrust::optional major_idx{}; - if constexpr (std::is_same_v) { - key = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); - major = key; - major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region - } else { + if constexpr (use_input_key) { key = *(key_first + idx); major = thrust_tuple_get_or_identity(key); major_idx = edge_partition.major_idx_from_major_nocheck(major); + } else { + key = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + major = key; + auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - + edge_partition.major_range_first()); + major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region } - size_t output_idx = std::is_same_v - ? (major - *(edge_partition).major_hypersparse_first()) - : idx; + size_t output_idx = use_input_key ? idx : (major - *(edge_partition).major_hypersparse_first()); if (major_idx) { auto major_offset = edge_partition.major_offset_from_major_nocheck(major); vertex_t const* indices{nullptr}; @@ -959,6 +961,7 @@ void gather_offset_value_pairs_and_update_vertex_value_output( template void per_v_transform_reduce_e(raft::handle_t const& handle, GraphViewType const& graph_view, + OptionalKeyIterator sorted_unique_key_first, + OptionalKeyIterator sorted_unique_key_last, EdgeSrcValueInputWrapper edge_src_value_input, EdgeDstValueInputWrapper edge_dst_value_input, EdgeValueInputWrapper edge_value_input, @@ -976,8 +981,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ReduceOp reduce_op, VertexValueOutputIterator vertex_value_output_first) { - constexpr auto update_major = (incoming == GraphViewType::is_storage_transposed); + constexpr bool update_major = (incoming == GraphViewType::is_storage_transposed); + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || !use_input_key); static_assert( ReduceOp::pure_function && ((reduce_op::has_compatible_raft_comms_op_v && @@ -987,10 +994,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // reduction, we may need to take a less // efficient code path - [[maybe_unused]] constexpr auto max_segments = - detail::num_sparse_segments_per_vertex_partition + size_t{1}; using vertex_t = typename GraphViewType::vertex_type; using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; using edge_partition_src_input_device_view_t = std::conditional_t< std::is_same_v, @@ -1016,6 +1023,71 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + constexpr bool use_bitmap = GraphViewType::is_multi_gpu && + !std::is_same_v && + std::is_same_v; + + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. 
prepare key list + + auto sorted_unique_nzd_key_last = sorted_unique_key_last; + if constexpr (use_input_key) { + size_t partition_idx = 0; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + sorted_unique_nzd_key_last = compute_key_lower_bound( + sorted_unique_key_first, + sorted_unique_key_last, + graph_view.local_vertex_partition_range_first() + *((*segment_offsets).rbegin() + 1), + handle.get_stream()); + } + } + + std::conditional_t, std::byte /* dummy */> + local_key_list_sizes{}; + if constexpr (use_input_key) { + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + local_key_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), + handle.get_stream()); + } else { + local_key_list_sizes = std::vector{ + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; + } + } + + std:: + conditional_t>, std::byte /* dummy */> + key_list_bitmap{}; + std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; + if constexpr (use_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto segment_offsets = + graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); + size_t bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(); + + std::tie(key_list_bitmap, use_bitmap_flags) = + compute_vertex_list_bitmap_info(minor_comm, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_first() + bool_size, + handle.get_stream()); + } + + // 2. 
compute subgroup_size, set-up temporary buffers & stream pool, and initialize + [[maybe_unused]] std::conditional_t>, int, std::byte /* dummy */> @@ -1046,19 +1118,28 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, decltype(minor_tmp_buffer->mutable_view().value_first())>, void /* dummy */>; - if constexpr (update_major) { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { // no vertices in the zero degree segment are visited + if constexpr (update_major) { // no vertices in the zero degree segment are visited + if constexpr (use_input_key) { thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + *((*segment_offsets).rbegin() + 1), - vertex_value_output_first + *((*segment_offsets).rbegin()), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_key_last), init); + } else { + size_t partition_idx = 0; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + *((*segment_offsets).rbegin() + 1), + vertex_value_output_first + *((*segment_offsets).rbegin()), + init); + } } } else { if constexpr (GraphViewType::is_multi_gpu) { @@ -1094,8 +1175,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // memory footprint vs parallelism trade-off // peak memory requirement per loop is - // update_major ? V / comm_size * sizeof(T) : 0 + // update_major ? (use_input_key ? aggregate key list size : V) / comm_size * sizeof(T) : 0 // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) + // FIXME: should we consider edge_partition_key_buffer as well? size_t num_streams = std::min(static_cast(minor_comm_size) * max_segments, @@ -1108,18 +1190,37 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } else { value_size = sizeof(T); } + size_t key_size{0}; + if constexpr (use_input_key) { + if constexpr (std::is_same_v) { + key_size = sizeof(vertex_t); + } else { + key_size = sizeof(thrust::tuple_element<0, key_t>::type) + + sizeof(thrust::tuple_element<1, key_t>::type); + } + } - auto avg_vertex_degree = - graph_view.number_of_vertices() > 0 - ? 
(static_cast(graph_view.compute_number_of_edges(handle)) / - static_cast(graph_view.number_of_vertices())) - : double{0.0}; - - num_streams = - std::min(static_cast(avg_vertex_degree * (static_cast(sizeof(vertex_t)) / - static_cast(value_size))) * - max_segments, - num_streams); + auto num_edges = graph_view.compute_number_of_edges(handle); + + size_t aggregate_major_range_size{}; + if constexpr (use_input_key) { + aggregate_major_range_size = + host_scalar_allreduce(handle.get_comms(), + static_cast(thrust::distance(sorted_unique_key_first, + sorted_unique_nzd_key_last)), + raft::comms::op_t::SUM, + handle.get_stream()); + } else { + aggregate_major_range_size = graph_view.number_of_vertices(); + } + num_streams = std::min( + static_cast( + (aggregate_major_range_size > 0 + ? (static_cast(num_edges) / static_cast(aggregate_major_range_size)) + : double{0}) * + (static_cast(sizeof(vertex_t)) / static_cast(value_size + key_size))) * + max_segments, + num_streams); } if (num_streams >= max_segments) { @@ -1136,15 +1237,19 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), size_t{0}); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - major_tmp_buffer_sizes[i] = - *((*segment_offsets).rbegin() + 1); // exclude the zero degree segment + if constexpr (use_input_key) { + major_tmp_buffer_sizes = local_key_list_sizes; } else { - if constexpr (GraphViewType::is_storage_transposed) { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_dst_range_size(i); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + if (segment_offsets) { + major_tmp_buffer_sizes[i] = + *((*segment_offsets).rbegin() + 1); // exclude the zero degree segment } else { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_src_range_size(i); + if constexpr (GraphViewType::is_storage_transposed) { + major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_dst_range_size(i); + } else { + major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_src_range_size(i); + } } } } @@ -1192,6 +1297,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream(); } + // 3. process local edge partitions + auto edge_mask_view = graph_view.edge_mask_view(); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { @@ -1220,6 +1327,76 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + auto loop_stream = + stream_pool_indices + ? 
handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) + : handle.get_stream(); + + auto edge_partition_key_first = sorted_unique_key_first; + auto edge_partition_key_last = sorted_unique_nzd_key_last; + auto edge_partition_key_buffer = allocate_optional_dataframe_buffer< + std::conditional_t>(0, + loop_stream); + std::conditional_t>, std::byte /* dummy */> + key_segment_offsets{}; + if constexpr (use_input_key) { + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + resize_optional_dataframe_buffer( + edge_partition_key_buffer, local_key_list_sizes[i], loop_stream); + + if constexpr (use_bitmap) { + std::variant, decltype(sorted_unique_key_first)> + v_list{}; + if (use_bitmap_flags[i]) { + v_list = raft::device_span((*key_list_bitmap).data(), + (*key_list_bitmap).size()); + } else { + v_list = sorted_unique_key_first; + } + auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : edge_partition.major_range_size(); + device_bcast_vertex_list(minor_comm, + v_list, + get_dataframe_buffer_begin(edge_partition_key_buffer), + edge_partition.major_range_first(), + edge_partition.major_range_first() + bool_size, + static_cast(thrust::distance( + sorted_unique_key_first, sorted_unique_nzd_key_last)), + static_cast(i), + loop_stream); + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffer), + local_key_list_sizes[i], + static_cast(i), + loop_stream); + } + + edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffer); + edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffer); + } + } + if (segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + edge_partition_key_first, + edge_partition_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + edge_partition.major_range_first(), + graph_view.use_dcs(), + loop_stream); + } else { + key_segment_offsets = std::nullopt; + } + } + RAFT_CUDA_TRY(cudaStreamSynchronize(loop_stream)); + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -1252,38 +1429,65 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, output_buffer = vertex_value_output_first; } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + using segment_key_iterator_t = + std::conditional_t; + if (segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + std::vector h_offsets{}; + if constexpr (use_input_key) { + h_offsets = (*key_segment_offsets); + } else { + h_offsets.resize((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + h_offsets.begin(), + [](vertex_t offset) { return static_cast(offset); }); + } + // FIXME: we may further improve performance by 1) individually tuning block sizes for // different segments; and 2) adding one more segment for very high degree vertices and // running segmented reduction - if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { + if (edge_partition.dcs_nzd_vertex_count()) { auto exec_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) : handle.get_stream(); - if constexpr (update_major) { // this is necessary as we don't visit every vertex in the - // hypersparse segment + if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit + // every vertex in the hypersparse segment thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + (*segment_offsets)[3], - output_buffer + (*segment_offsets)[4], + output_buffer + h_offsets[3], + output_buffer + h_offsets[4], major_init); } - if (*(edge_partition.dcs_nzd_vertex_count()) > 0) { - raft::grid_1d_thread_t update_grid(*(edge_partition.dcs_nzd_vertex_count()), + auto segment_size = use_input_key + ? (h_offsets[4] - h_offsets[3]) + : static_cast(*(edge_partition.dcs_nzd_vertex_count())); + if (segment_size > 0) { + raft::grid_1d_thread_t update_grid(segment_size, detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[3]; } + if constexpr (update_major) { segment_output_buffer += h_offsets[3]; } + auto segment_key_first = edge_partition_key_first; + auto segment_key_last = edge_partition_key_last; + if constexpr (use_input_key) { + segment_key_first += h_offsets[3]; + segment_key_last += h_offsets[4]; + } else { + assert(segment_key_first == nullptr); + assert(segment_key_last == nullptr); + } detail::per_v_transform_reduce_e_hypersparse <<>>( edge_partition, - static_cast(nullptr), - static_cast(nullptr), + segment_key_first, + segment_key_last, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1294,23 +1498,29 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, reduce_op); } } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + if (h_offsets[3] - h_offsets[2]) { auto exec_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) : handle.get_stream(); - raft::grid_1d_thread_t update_grid((*segment_offsets)[3] - (*segment_offsets)[2], + raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[2]; } + if constexpr (update_major) { segment_output_buffer += h_offsets[2]; } + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + segment_key_first += h_offsets[2]; + auto num_keys = h_offsets[3] - h_offsets[2]; detail::per_v_transform_reduce_e_low_degree <<>>( edge_partition, - thrust::make_counting_iterator(edge_partition.major_range_first() + - (*segment_offsets)[2]), - thrust::make_counting_iterator(edge_partition.major_range_first() + - (*segment_offsets)[3]), + segment_key_first, + segment_key_first + (h_offsets[3] - h_offsets[2]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1320,23 +1530,28 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, major_init, reduce_op); } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + if (h_offsets[2] - h_offsets[1] > 0) { auto exec_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) : handle.get_stream(); - raft::grid_1d_warp_t update_grid((*segment_offsets)[2] - (*segment_offsets)[1], + raft::grid_1d_warp_t update_grid(h_offsets[2] - h_offsets[1], detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*segment_offsets)[1]; } + if constexpr (update_major) { segment_output_buffer += h_offsets[1]; } + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + segment_key_first += h_offsets[1]; detail::per_v_transform_reduce_e_mid_degree <<>>( edge_partition, - thrust::make_counting_iterator(edge_partition.major_range_first() + - (*segment_offsets)[1]), - thrust::make_counting_iterator(edge_partition.major_range_first() + - (*segment_offsets)[2]), + segment_key_first, + segment_key_first + (h_offsets[2] - h_offsets[1]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1347,20 +1562,25 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ReduceOp::identity_element, reduce_op); } - if ((*segment_offsets)[1] > 0) { + if (h_offsets[1] > 0) { auto exec_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) : handle.get_stream(); - raft::grid_1d_block_t update_grid((*segment_offsets)[1], + raft::grid_1d_block_t update_grid(h_offsets[1], detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } detail::per_v_transform_reduce_e_high_degree <<>>( edge_partition, - thrust::make_counting_iterator(edge_partition.major_range_first()), - thrust::make_counting_iterator(edge_partition.major_range_first() + - (*segment_offsets)[1]), + segment_key_first, + segment_key_first + h_offsets[1], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1372,17 +1592,29 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, reduce_op); } } else { + size_t num_keys{}; + if constexpr (use_input_key) { + num_keys = + static_cast(thrust::distance(edge_partition_key_first, edge_partition_key_last)); + } else { + num_keys = static_cast(edge_partition.major_range_size()); + } + if (edge_partition.major_range_size() > 0) { - raft::grid_1d_thread_t update_grid(edge_partition.major_range_size(), + raft::grid_1d_thread_t update_grid(num_keys, detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } detail::per_v_transform_reduce_e_low_degree <<>>( edge_partition, - thrust::make_counting_iterator(edge_partition.major_range_first() + - (*segment_offsets)[2]), - thrust::make_counting_iterator(edge_partition.major_range_first() + - (*segment_offsets)[3]), + segment_key_first, + segment_key_first + 
num_keys, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1540,6 +1772,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + // 4. communication + if constexpr (update_major && std::is_same_v>) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); @@ -1770,6 +2004,8 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, detail::per_v_transform_reduce_e(handle, graph_view, + static_cast(nullptr), + static_cast(nullptr), edge_src_value_input, edge_dst_value_input, edge_value_input, @@ -1861,6 +2097,8 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, detail::per_v_transform_reduce_e(handle, graph_view, + key_list.begin(), + key_list.end(), edge_src_value_input, edge_dst_value_input, edge_value_input, @@ -1944,6 +2182,8 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, detail::per_v_transform_reduce_e(handle, graph_view, + static_cast(nullptr), + static_cast(nullptr), edge_src_value_input, edge_dst_value_input, edge_value_input, @@ -2026,6 +2266,7 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, bool do_expensive_check = false) { static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); if (do_expensive_check) { // currently, nothing to do @@ -2035,6 +2276,8 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, detail::per_v_transform_reduce_e(handle, graph_view, + key_list.begin(), + key_list.end(), edge_src_value_input, edge_dst_value_input, edge_value_input, diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index b13e6bfd458..b48ed775de3 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -15,10 +15,12 @@ */ #pragma once +#include #include #include #include +#include #include #include @@ -48,6 +50,172 @@ namespace cugraph { +template +KeyIterator compute_key_lower_bound(KeyIterator sorted_unique_key_first, + KeyIterator sorted_unique_key_last, + vertex_t v_threshold, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + if constexpr (std::is_same_v) { + return thrust::lower_bound( + rmm::exec_policy(stream_view), sorted_unique_key_first, sorted_unique_key_last, v_threshold); + } else { + key_t k_threshold{}; + if constexpr (std::is_same_v) { + k_threshold = v_threshold; + } else { + thrust::get<0>(k_threshold) = v_threshold; + } + return thrust::lower_bound( + rmm::exec_policy(stream_view), + sorted_unique_key_first, + sorted_unique_key_last, + k_threshold, + [] __device__(auto lhs, auto rhs) { return thrust::get<0>(lhs) < thrust::get<0>(rhs); }); + } +} + +template +std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, + KeyIterator sorted_key_last, + raft::host_span segment_offsets, + vertex_t vertex_range_first, + bool use_dcs, + rmm::cuda_stream_view stream_view) +{ + using key_t = typename thrust::iterator_traits::value_type; + + assert(segment_offsets.size() == 6 /* high, mid, low, hypersparse, zero + 1 */); + std::vector h_thresholds(use_dcs ? 
3 : 2); + h_thresholds[0] = vertex_range_first + segment_offsets[1]; // high, mid boundary + h_thresholds[1] = vertex_range_first + segment_offsets[2]; // mid, low boundary + if (use_dcs) { h_thresholds[2] = vertex_range_first + segment_offsets[3]; } // low, hypersparse boundary + + rmm::device_uvector d_thresholds(h_thresholds.size(), stream_view); + raft::update_device(d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), stream_view); + + rmm::device_uvector d_offsets(d_thresholds.size(), stream_view); + if constexpr (std::is_same_v) { + thrust::lower_bound(rmm::exec_policy(stream_view), + sorted_key_first, + sorted_key_last, + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } else { + auto sorted_vertex_first = + thrust::make_transform_iterator(sorted_key_first, thrust_tuple_get{}); + thrust::lower_bound(rmm::exec_policy(stream_view), + sorted_vertex_first, + sorted_vertex_first + thrust::distance(sorted_key_first, sorted_key_last), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin()); + } + + std::vector h_offsets(d_offsets.size()); + raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + h_offsets.push_back(static_cast(thrust::distance(sorted_key_first, sorted_key_last))); + + return h_offsets; +} + +template +std::tuple>, std::vector> +compute_vertex_list_bitmap_info( + raft::comms::comms_t const& comm, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + constexpr double threshold_ratio = + 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + + std::optional> bitmap{std::nullopt}; + std::vector use_bitmap_flags{}; + + if (comm.get_size() > 1) { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + auto bool_size = vertex_range_last - vertex_range_first; + + if (v_list_size > static_cast(bool_size * threshold_ratio)) { + bitmap = rmm::device_uvector(packed_bool_size(bool_size), stream_view); + thrust::fill(rmm::exec_policy(stream_view), + (*bitmap).begin(), + (*bitmap).end(), + packed_bool_empty_mask()); + thrust::for_each(rmm::exec_policy(stream_view), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + [bitmap = raft::device_span((*bitmap).data(), (*bitmap).size()), + v_first = vertex_range_first] __device__(vertex_t v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(cugraph::packed_bool_mask(v_offset), + cuda::std::memory_order_relaxed); + }); + } + + auto tmp_flags = host_scalar_allgather(comm, bitmap ? 
uint8_t{1} : uint8_t{0}, stream_view); + use_bitmap_flags.resize(tmp_flags.size()); + std::transform( + tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](uint8_t tmp_flag) { + return (tmp_flag == uint8_t{1}); + }); + } else { + use_bitmap_flags = {false}; + } + + return std::make_tuple(std::move(bitmap), std::move(use_bitmap_flags)); +} + +template +void device_bcast_vertex_list( + raft::comms::comms_t const& comm, + std::variant, InputVertexIterator> v_list, + OutputVertexIterator output_v_first, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + size_t v_list_size, + int root, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + static_assert( + std::is_same_v::value_type, vertex_t>); + + if (v_list.index() == 0) { // bitmap + rmm::device_uvector tmp_bitmap(std::get<0>(v_list).size(), stream_view); + device_bcast( + comm, std::get<0>(v_list).data(), tmp_bitmap.data(), tmp_bitmap.size(), root, stream_view); + thrust::copy_if(rmm::exec_policy(stream_view), + thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + tmp_bitmap.data(), tmp_bitmap.size())] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & + packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + output_v_first, + thrust::identity{}); + } else { + device_bcast(comm, std::get<1>(v_list), output_v_first, v_list_size, root, stream_view); + } +} + // key type is either vertex_t (tag_t == void) or thrust::tuple (tag_t != void) // if sorted_unique is true, stores unique key objects in the sorted (non-descending) order. // if false, there can be duplicates and the elements may not be sorted. From d537290942ceab85fab4dd76a758cfd9fa1b7a6a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 28 Jul 2024 17:43:58 -0700 Subject: [PATCH 016/126] remove pred_op.cuh --- cpp/src/prims/pred_op.cuh | 28 ---------------------------- 1 file changed, 28 deletions(-) delete mode 100644 cpp/src/prims/pred_op.cuh diff --git a/cpp/src/prims/pred_op.cuh b/cpp/src/prims/pred_op.cuh deleted file mode 100644 index a93755346f8..00000000000 --- a/cpp/src/prims/pred_op.cuh +++ /dev/null @@ -1,28 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -#pragma once - -namespace cugraph { -namespace pred_op { - -template -struct const_true { - __host__ __device__ constexpr bool operator()(index_t i) const { return true; } -}; - -} // namespace pred_op -} // namespace cugraph From cf928855460bedbebafd3e092ef1208613e9cd17 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 29 Jul 2024 15:34:05 -0700 Subject: [PATCH 017/126] update per_v_transform_reduce_incoming|outgoing_e to take a predicate --- cpp/src/prims/detail/prim_functors.cuh | 22 ++ ...v_transform_reduce_incoming_outgoing_e.cuh | 352 +++++++++++++----- 2 files changed, 286 insertions(+), 88 deletions(-) diff --git a/cpp/src/prims/detail/prim_functors.cuh b/cpp/src/prims/detail/prim_functors.cuh index f426cd993ea..a166f37906a 100644 --- a/cpp/src/prims/detail/prim_functors.cuh +++ b/cpp/src/prims/detail/prim_functors.cuh @@ -21,6 +21,23 @@ namespace cugraph { namespace detail { +template +struct const_true_e_op_t { + __device__ auto operator()(std::conditional_t key_or_src, + std::conditional_t key_or_dst, + src_value_t, + dst_value_t, + e_value_t) const + { + return true; + } +}; + template +struct call_const_true_e_op_t { + __device__ auto operator()(edge_t i) const { return true; } +}; + template >) { + if constexpr (std::is_same_v>) { if constexpr (std::is_same_v>) { // init is selected only when no // edges return a valid value @@ -204,6 +203,7 @@ template __global__ static void per_v_transform_reduce_e_hypersparse( edge_partition_device_view_t; - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< ReduceOp>); // atomic_reduce is defined only when // has_compatible_raft_comms_op_t is true @@ -232,6 +232,23 @@ __global__ static void per_v_transform_reduce_e_hypersparse( using key_t = typename iterator_value_type_or_default_t::value_type; + constexpr bool const_true_pred_op = + std::is_same_v>; + using call_pred_op_t = std::conditional_t, + call_e_op_t>; + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; auto idx = static_cast(tid); @@ -282,6 +299,19 @@ __global__ static void per_v_transform_reduce_e_hypersparse( indices, edge_offset}; + call_pred_op_t call_pred_op{}; + if constexpr (!const_true_pred_op) { + call_pred_op = call_pred_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } + if (edge_partition_e_mask) { update_result_value_output( edge_partition, @@ -290,8 +320,12 @@ __global__ static void per_v_transform_reduce_e_hypersparse( call_e_op, init, reduce_op, - [&edge_partition_e_mask, edge_offset] __device__(edge_t i) { - return (*edge_partition_e_mask).get(edge_offset + i); + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(edge_offset + i); + } else { + return false; + } }, output_idx, result_value_output); @@ -302,7 +336,7 @@ __global__ static void per_v_transform_reduce_e_hypersparse( call_e_op, init, reduce_op, - pred_op::const_true{}, + call_pred_op, output_idx, result_value_output); } @@ -326,6 +360,7 @@ template __global__ static void per_v_transform_reduce_e_low_degree( edge_partition_device_view_t); // atomic_reduce is defined only when @@ -350,8 +386,24 @@ __global__ static void per_v_transform_reduce_e_low_degree( using edge_t = typename GraphViewType::edge_type; using key_t = typename thrust::iterator_traits::value_type; - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto 
idx = static_cast(tid); + constexpr bool const_true_pred_op = + std::is_same_v>; + using call_pred_op_t = std::conditional_t, + call_e_op_t>; + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); @@ -379,6 +431,19 @@ __global__ static void per_v_transform_reduce_e_low_degree( indices, edge_offset}; + call_pred_op_t call_pred_op{}; + if constexpr (!const_true_pred_op) { + call_pred_op = call_pred_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } + if (edge_partition_e_mask) { update_result_value_output( edge_partition, @@ -387,8 +452,12 @@ __global__ static void per_v_transform_reduce_e_low_degree( call_e_op, init, reduce_op, - [&edge_partition_e_mask, edge_offset] __device__(edge_t i) { - return (*edge_partition_e_mask).get(edge_offset + i); + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(edge_offset + i); + } else { + return false; + } }, idx, result_value_output); @@ -399,7 +468,7 @@ __global__ static void per_v_transform_reduce_e_low_degree( call_e_op, init, reduce_op, - pred_op::const_true{}, + call_pred_op, idx, result_value_output); } @@ -420,6 +489,7 @@ template __global__ static void per_v_transform_reduce_e_mid_degree( edge_partition_device_view_t> */ , - ReduceOp reduce_op) + ReduceOp reduce_op, + PredOp pred_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< ReduceOp>); // atomic_reduce is defined only when @@ -448,6 +519,23 @@ __global__ static void per_v_transform_reduce_e_mid_degree( using e_op_result_t = T; using key_t = typename thrust::iterator_traits::value_type; + constexpr bool const_true_pred_op = + std::is_same_v>; + using call_pred_op_t = std::conditional_t, + call_e_op_t>; + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); auto const lane_id = tid % raft::warp_size(); @@ -485,6 +573,19 @@ __global__ static void per_v_transform_reduce_e_mid_degree( indices, edge_offset}; + call_pred_op_t call_pred_op{}; + if constexpr (!const_true_pred_op) { + call_pred_op = call_pred_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } + [[maybe_unused]] std::conditional_t reduced_e_op_result{}; [[maybe_unused]] std::conditional_t>, @@ -503,8 +604,8 @@ __global__ static void per_v_transform_reduce_e_mid_degree( raft::warp_size(); for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { thrust::optional e_op_result{thrust::nullopt}; - if (i < static_cast(local_degree) && - (*edge_partition_e_mask).get(edge_offset + i)) { + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(edge_offset + i)) { e_op_result = call_e_op(i); } first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) @@ -515,7 +616,7 @@ __global__ static void per_v_transform_reduce_e_mid_degree( } } else { for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { + if ((*edge_partition_e_mask).get(edge_offset + i) & call_pred_op(edge_offset + i)) 
{ auto e_op_result = call_e_op(i); if constexpr (update_major) { reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); @@ -537,7 +638,9 @@ __global__ static void per_v_transform_reduce_e_mid_degree( raft::warp_size(); for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { thrust::optional e_op_result{thrust::nullopt}; - if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } + if (i < static_cast(local_degree) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); @@ -546,15 +649,17 @@ __global__ static void per_v_transform_reduce_e_mid_degree( } } else { for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } } } } @@ -590,6 +695,7 @@ template __global__ static void per_v_transform_reduce_e_high_degree( edge_partition_device_view_t> */ , - ReduceOp reduce_op) + ReduceOp reduce_op, + PredOp pred_op) { static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< ReduceOp>); // atomic_reduce is defined only when @@ -618,6 +725,23 @@ __global__ static void per_v_transform_reduce_e_high_degree( using e_op_result_t = T; using key_t = typename thrust::iterator_traits::value_type; + constexpr bool const_true_pred_op = + std::is_same_v>; + using call_pred_op_t = std::conditional_t, + call_e_op_t>; + auto idx = static_cast(blockIdx.x); using BlockReduce = cub::BlockReduce< @@ -657,6 +781,19 @@ __global__ static void per_v_transform_reduce_e_high_degree( indices, edge_offset}; + call_pred_op_t call_pred_op{}; + if constexpr (!const_true_pred_op) { + call_pred_op = call_pred_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } + [[maybe_unused]] std::conditional_t reduced_e_op_result{}; [[maybe_unused]] std::conditional_t>, @@ -678,8 +815,8 @@ __global__ static void per_v_transform_reduce_e_high_degree( per_v_transform_reduce_e_kernel_block_size; for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { thrust::optional e_op_result{thrust::nullopt}; - if (i < static_cast(local_degree) && - ((*edge_partition_e_mask).get_(edge_offset + i))) { + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get_(edge_offset + i) && call_pred_op(edge_offset + i)) { e_op_result = call_e_op(i); } first_valid_thread_id = @@ -694,7 +831,7 @@ __global__ static void 
per_v_transform_reduce_e_high_degree( } } else { for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { + if ((*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(edge_offset + i)) { auto e_op_result = call_e_op(i); if constexpr (update_major) { reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); @@ -717,7 +854,9 @@ __global__ static void per_v_transform_reduce_e_high_degree( per_v_transform_reduce_e_kernel_block_size; for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { thrust::optional e_op_result{thrust::nullopt}; - if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } + if ((i < static_cast(local_degree)) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } first_valid_thread_id = BlockReduce(temp_storage) .Reduce(e_op_result ? threadIdx.x : per_v_transform_reduce_e_kernel_block_size, @@ -729,15 +868,17 @@ __global__ static void per_v_transform_reduce_e_high_degree( } } else { for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } } } } @@ -967,6 +1108,7 @@ template void per_v_transform_reduce_e(raft::handle_t const& handle, @@ -979,6 +1121,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, EdgeOp e_op, T init, ReduceOp reduce_op, + PredOp pred_op, VertexValueOutputIterator vertex_value_output_first) { constexpr bool update_major = (incoming == GraphViewType::is_storage_transposed); @@ -1495,7 +1638,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, segment_output_buffer, e_op, major_init, - reduce_op); + reduce_op, + pred_op); } } if (h_offsets[3] - h_offsets[2]) { @@ -1528,7 +1672,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, segment_output_buffer, e_op, major_init, - reduce_op); + reduce_op, + pred_op); } if (h_offsets[2] - h_offsets[1] > 0) { auto exec_stream = stream_pool_indices @@ -1560,7 +1705,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, e_op, major_init, ReduceOp::identity_element, - reduce_op); + reduce_op, + pred_op); } if (h_offsets[1] > 0) { auto exec_stream = stream_pool_indices @@ -1589,7 +1735,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, e_op, major_init, ReduceOp::identity_element, - reduce_op); + reduce_op, + pred_op); } } else { size_t num_keys{}; @@ -1622,7 +1769,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, output_buffer, e_op, major_init, - reduce_op); + reduce_op, + pred_op); } } @@ -2002,17 +2150,24 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, constexpr bool incoming = true; - 
detail::per_v_transform_reduce_e(handle, - graph_view, - static_cast(nullptr), - static_cast(nullptr), - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + detail::per_v_transform_reduce_e( + handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } /** @@ -2095,17 +2250,24 @@ void per_v_transform_reduce_incoming_e(raft::handle_t const& handle, constexpr bool incoming = true; - detail::per_v_transform_reduce_e(handle, - graph_view, - key_list.begin(), - key_list.end(), - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } /** @@ -2180,17 +2342,24 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, constexpr bool incoming = false; - detail::per_v_transform_reduce_e(handle, - graph_view, - static_cast(nullptr), - static_cast(nullptr), - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + detail::per_v_transform_reduce_e( + handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } /** @@ -2274,17 +2443,24 @@ void per_v_transform_reduce_outgoing_e(raft::handle_t const& handle, constexpr bool incoming = false; - detail::per_v_transform_reduce_e(handle, - graph_view, - key_list.begin(), - key_list.end(), - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - init, - reduce_op, - vertex_value_output_first); + detail::per_v_transform_reduce_e( + handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + detail::const_true_e_op_t{}, + vertex_value_output_first); } } // namespace cugraph From a6476b9a892ed13a25efa0074a0841dde0716b69 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 9 Aug 2024 10:48:44 -0700 Subject: [PATCH 018/126] split per_v_transform_reduce_incoming_outgoing_e implementation to two files --- .../prims/detail/per_v_transform_reduce_e.cuh | 2081 +++++++++++++++++ ...v_transform_reduce_incoming_outgoing_e.cuh | 2051 +--------------- 2 files changed, 2082 insertions(+), 2050 deletions(-) create mode 100644 cpp/src/prims/detail/per_v_transform_reduce_e.cuh diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh new file mode 100644 index 00000000000..6ba3dd5d070 --- /dev/null +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -0,0 +1,2081 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "detail/graph_partition_utils.cuh" +#include "prims/detail/optional_dataframe_buffer.hpp" +#include "prims/detail/prim_functors.cuh" +#include "prims/fill_edge_src_dst_property.cuh" +#include "prims/property_op_utils.cuh" +#include "prims/reduce_op.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cugraph { + +namespace detail { + +int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; + +template +struct iterator_value_type_or_default_t; + +template +struct iterator_value_type_or_default_t>> { + using value_type = default_t; // if Iterator is invalid (void*), value_type = default_t +}; + +template +struct iterator_value_type_or_default_t>> { + using value_type = typename thrust::iterator_traits< + Iterator>::value_type; // if iterator is valid, value_type = typename + // thrust::iterator_traits::value_type +}; + +template +struct transform_and_atomic_reduce_t { + edge_partition_device_view_t const& edge_partition{}; + vertex_t const* indices{nullptr}; + TransformOp const& transform_op{}; + PredOp const& pred_op{}; + ResultValueOutputIteratorOrWrapper& result_value_output{}; + + __device__ void operator()(edge_t i) const + { + if (pred_op(i)) { + auto e_op_result = transform_op(i); + auto minor = indices[i]; + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); + if constexpr (multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } +}; + +template +__device__ void update_result_value_output( + edge_partition_device_view_t const& edge_partition, + vertex_t const* indices, + edge_t local_degree, + TransformOp const& transform_op, + result_t init, + ReduceOp const& reduce_op, + PredOp const& pred_op, + size_t output_idx /* relevent only when update_major === true */, + ResultValueOutputIteratorOrWrapper& result_value_output) +{ + if constexpr (update_major) { + result_t val{}; + if constexpr (std::is_same_v>) { + if constexpr (std::is_same_v>) { // init is selected only when no + // edges return a valid value + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + auto tmp = transform_op(i); + val = tmp; + break; + } + } else { + val = thrust::transform_reduce(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_op, + init, + reduce_op); + } + } else { + val = init; + for (edge_t i = 0; i < local_degree; ++i) { + if (pred_op(i)) { + auto tmp = transform_op(i); + if constexpr (std::is_same_v>) { // init is selected only when + // no edges return a valid + // value + val = tmp; + break; + } else { + val = reduce_op(val, tmp); + } + } + } + } + *(result_value_output + 
output_idx) = val; + } else { + thrust::for_each(thrust::seq, + thrust::make_counting_iterator(edge_t{0}), + thrust::make_counting_iterator(local_degree), + transform_and_atomic_reduce_t{ + edge_partition, indices, transform_op, pred_op, result_value_output}); + } +} + +template +__global__ static void per_v_transform_reduce_e_hypersparse( + edge_partition_device_view_t edge_partition, + OptionalKeyIterator key_first, + OptionalKeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + constexpr bool use_input_key = !std::is_same_v; + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + static_assert(update_major || !use_input_key); + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + constexpr bool const_true_pred_op = + std::is_same_v>; + using call_pred_op_t = std::conditional_t, + call_e_op_t>; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + size_t key_count{}; + if constexpr (use_input_key) { + key_count = static_cast(thrust::distance(key_first, key_last)); + } else { + key_count = *(edge_partition.dcs_nzd_vertex_count()); + } + + while (idx < key_count) { + key_t key{}; + vertex_t major{}; + thrust::optional major_idx{}; + if constexpr (use_input_key) { + key = *(key_first + idx); + major = thrust_tuple_get_or_identity(key); + major_idx = edge_partition.major_idx_from_major_nocheck(major); + } else { + key = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); + major = key; + auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - + edge_partition.major_range_first()); + major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region + } + + size_t output_idx = use_input_key ? 
idx : (major - *(edge_partition).major_hypersparse_first()); + if (major_idx) { + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(*major_idx)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + call_pred_op_t call_pred_op{}; + if constexpr (!const_true_pred_op) { + call_pred_op = call_pred_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(edge_offset + i); + } else { + return false; + } + }, + output_idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + output_idx, + result_value_output); + } + } else { + if constexpr (update_major) { *(result_value_output + output_idx) = init; } + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_low_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = typename thrust::iterator_traits::value_type; + + constexpr bool const_true_pred_op = + std::is_same_v>; + using call_pred_op_t = std::conditional_t, + call_e_op_t>; + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = + edge_partition.local_edges(static_cast(major_offset)); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + call_pred_op_t call_pred_op{}; + if constexpr (!const_true_pred_op) { + call_pred_op = call_pred_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + 
major_offset, + indices, + edge_offset}; + } + + if (edge_partition_e_mask) { + update_result_value_output( + edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { + if ((*edge_partition_e_mask).get(edge_offset + i)) { + return call_pred_op(edge_offset + i); + } else { + return false; + } + }, + idx, + result_value_output); + } else { + update_result_value_output(edge_partition, + indices, + local_degree, + call_e_op, + init, + reduce_op, + call_pred_op, + idx, + result_value_output); + } + idx += gridDim.x * blockDim.x; + } +} + +template +__global__ static void per_v_transform_reduce_e_mid_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true && !std::is_same_v> */ + , + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + constexpr bool const_true_pred_op = + std::is_same_v>; + using call_pred_op_t = std::conditional_t, + call_e_op_t>; + + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); + auto const lane_id = tid % raft::warp_size(); + auto idx = static_cast(tid / raft::warp_size()); + + using WarpReduce = cub::WarpReduce< + std::conditional_t>, int32_t, e_op_result_t>>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage[update_major ? (per_v_transform_reduce_e_kernel_block_size / raft::warp_size()) + : int32_t{1} /* dummy */]; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + call_pred_op_t call_pred_op{}; + if constexpr (!const_true_pred_op) { + call_pred_op = call_pred_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_lane_id{}; + if constexpr (update_major) { reduced_e_op_result = (lane_id == 0) ? 
init : identity_element; } + if constexpr (update_major && std::is_same_v>) { + first_valid_lane_id = raft::warp_size(); + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(edge_offset + i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if ((*edge_partition_e_mask).get(edge_offset + i) & call_pred_op(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { + thrust::optional e_op_result{thrust::nullopt}; + if (i < static_cast(local_degree) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); + first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); + if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_lane_id != raft::warp_size()) { break; } + } + } else { + for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (lane_id == ((first_valid_lane_id == raft::warp_size()) ? 
0 : first_valid_lane_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) + .Reduce(reduced_e_op_result, reduce_op); + if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x * (blockDim.x / raft::warp_size()); + } +} + +template +__global__ static void per_v_transform_reduce_e_high_degree( + edge_partition_device_view_t edge_partition, + KeyIterator key_first, + KeyIterator key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper result_value_output, + EdgeOp e_op, + T init /* relevant only if update_major == true */, + T identity_element /* relevant only if update_major == true && !std::is_same_v> */ + , + ReduceOp reduce_op, + PredOp pred_op) +{ + static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< + ReduceOp>); // atomic_reduce is defined only when + // has_compatible_raft_comms_op_t is true + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using e_op_result_t = T; + using key_t = typename thrust::iterator_traits::value_type; + + constexpr bool const_true_pred_op = + std::is_same_v>; + using call_pred_op_t = std::conditional_t, + call_e_op_t>; + + auto idx = static_cast(blockIdx.x); + + using BlockReduce = cub::BlockReduce< + std::conditional_t>, int32_t, e_op_result_t>, + per_v_transform_reduce_e_kernel_block_size>; + [[maybe_unused]] __shared__ + std::conditional_t + temp_storage; + [[maybe_unused]] __shared__ + std::conditional_t>, + int32_t, + std::byte /* dummy */> + output_thread_id; + + while (idx < static_cast(thrust::distance(key_first, key_last))) { + auto key = *(key_first + idx); + auto major = thrust_tuple_get_or_identity(key); + + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + edge_offset}; + + call_pred_op_t call_pred_op{}; + if constexpr (!const_true_pred_op) { + call_pred_op = call_pred_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } + + [[maybe_unused]] std::conditional_t + reduced_e_op_result{}; + [[maybe_unused]] std::conditional_t>, + int32_t, + std::byte /* dummy */> + first_valid_thread_id{}; + if constexpr (update_major) { + reduced_e_op_result = threadIdx.x == 0 ? 
init : identity_element; + } + if constexpr (update_major && std::is_same_v>) { + first_valid_thread_id = per_v_transform_reduce_e_kernel_block_size; + } + + if (edge_partition_e_mask) { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (per_v_transform_reduce_e_kernel_block_size - 1)) / + per_v_transform_reduce_e_kernel_block_size) * + per_v_transform_reduce_e_kernel_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && + (*edge_partition_e_mask).get_(edge_offset + i) && call_pred_op(edge_offset + i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result ? threadIdx.x : per_v_transform_reduce_e_kernel_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if ((*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(edge_offset + i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } else { + if constexpr (update_major && std::is_same_v>) { + auto rounded_up_local_degree = + ((static_cast(local_degree) + (per_v_transform_reduce_e_kernel_block_size - 1)) / + per_v_transform_reduce_e_kernel_block_size) * + per_v_transform_reduce_e_kernel_block_size; + for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { + thrust::optional e_op_result{thrust::nullopt}; + if ((i < static_cast(local_degree)) && call_pred_op(i)) { + e_op_result = call_e_op(i); + } + first_valid_thread_id = + BlockReduce(temp_storage) + .Reduce(e_op_result ? threadIdx.x : per_v_transform_reduce_e_kernel_block_size, + cub::Min()); + if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } + __syncthreads(); + if (threadIdx.x == output_thread_id) { reduced_e_op_result = *e_op_result; } + if (output_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } + } + } else { + for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { + if (call_pred_op(i)) { + auto e_op_result = call_e_op(i); + if constexpr (update_major) { + reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); + } else { + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); + if constexpr (GraphViewType::is_multi_gpu) { + reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); + } else { + reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); + } + } + } + } + } + } + + if constexpr (update_major) { + if constexpr (std::is_same_v>) { + if (threadIdx.x == ((first_valid_thread_id == per_v_transform_reduce_e_kernel_block_size) + ? 
0 + : first_valid_thread_id)) { + *(result_value_output + idx) = reduced_e_op_result; + } + } else { + reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); + if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } + } + } + + idx += gridDim.x; + } +} + +template +__host__ __device__ int rank_to_priority( + int rank, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffic */) +{ + if (rank == root) { // no need for communication (priority 0) + return int{0}; + } else if (rank / subgroup_size == + root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in + // [1, subgroup_size) + int modulo = subgroup_size - 1; + return int{1} + static_cast((static_cast(rank) + offset) % modulo); + } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) + int modulo = comm_size - subgroup_size; + return subgroup_size + static_cast((static_cast(rank) + offset) % modulo); + } +} + +template +__host__ __device__ int priority_to_rank( + int priority, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffict */) +{ + if (priority == int{0}) { + return root; + } else if (priority < subgroup_size) { + int modulo = subgroup_size - int{1}; + return static_cast( + (static_cast(priority - int{1}) + (modulo - static_cast(offset % modulo))) % + modulo); + } else { + int modulo = comm_size - subgroup_size; + return static_cast((static_cast(priority - subgroup_size) + + (modulo - static_cast(offset % modulo))) % + modulo); + } +} + +template +rmm::device_uvector compute_keep_flags( + raft::comms::comms_t const& comm, + ValueIterator value_first, + ValueIterator value_last, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are + // more than one, the GPU with (comm_rank == root) has the highest priority, the GPUs in the same + // DGX node should be the next) + + rmm::device_uvector priorities(thrust::distance(value_first, value_last), + stream_view); + thrust::tabulate( + rmm::exec_policy(stream_view), + priorities.begin(), + priorities.end(), + [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { + auto val = *(value_first + offset); + return (val != init) + ? 
rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)) + : std::numeric_limits::max(); // lowest priority + }); + device_allreduce(comm, + priorities.data(), + priorities.data(), + priorities.size(), + raft::comms::op_t::MIN, + root, + stream_view); + + rmm::device_uvector keep_flags(priorities.size()); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::transform(rmm::exec_policy(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + priorities.size(), + keep_flags.begin(), + [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = + priority_to_rank(priority, root, subgroup_size, comm_size, offset); + return (rank == comm_rank); + }); + + return keep_flags; +} + +template +std::tuple, + dataframe_buffer_type_t::value_type>> +compute_offset_value_pairs(raft::comms::comms_t const& comm, + ValueIterator value_first, + ValueIterator value_last, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + rmm::device_uvector keep_flags(0, stream_view); + if (comm_size <= std::numeric_limits::max()) { // priority == uint8_t + keep_flags = compute_keep_flags( + comm, value_first, value_last, root, subgroup_size, init, stream_view); + } else if (comm_size <= std::numeric_limits::max()) { // priority == uint16_t + keep_flags = compute_keep_flags( + comm, value_first, value_last, root, subgroup_size, init, stream_view); + } else { // priority_t == uint32_t + keep_flags = compute_keep_flags( + comm, value_first, value_last, root, subgroup_size, init, stream_view); + } + + auto copy_size = thrust::count_if( + rmm::exec_policy(stream_view), keep_flags.begin(), keep_flags.end(), thrust::identity{}); + + rmm::device_uvector offsets(copy_size, stream_view); + auto values = allocate_dataframe_buffer(copy_size, stream_view); + auto offset_value_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), value_first); + thrust::copy_if(rmm::exec_policy(stream_view), + offset_value_pair_first, + offset_value_pair_first + keep_flags.size(), + keep_flags.begin(), + thrust::make_zip_iterator(offsets.begin(), dataframe_buffer_begin(values)), + thrust::identity{}); + + return std::make_tuple(std::move(offsets), std::move(values)); +} + +template +void gather_offset_value_pairs_and_update_vertex_value_output( + raft::comms::comms_t const& comm, + rmm::device_uvector&& offsets, + dataframe_buffer_type_t&& values, + VertexValueOutputIterator vertex_value_output_first, + int root, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + + auto rx_sizes = host_scalar_gather(comm, offsets.size(), root, stream_view); + std::vector rx_displs{}; + rmm::device_uvector rx_offsets(0, stream_view); + if (comm_rank == root) { + rx_displs.resize(rx_sizes.size()); + std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); + rx_offsets.resize(rx_displs.back() + rx_sizes.back(), stream_view); + } + + device_gatherv(comm, + offsets.begin(), + rx_offsets.begin(), + offsets.size(), + rx_sizes, + rx_displs, + root, + stream_view); + offsets.resize(0, 
stream_view); + offsets.shrink_to_fit(stream_view); + + auto rx_values = allocate_dataframe_buffer(rx_offsets.size(), stream_view); + device_gatherv(comm, + get_dataframe_buffer_begin(values), + get_dataframe_buffer_begin(rx_values), + values.size(), + rx_sizes, + rx_displs, + root, + stream_view); + resize_dataframe_buffer(values, 0, stream_view); + shrink_to_fit_dataframe_buffer(values, stream_view); + + if (comm_rank == root) { + thrust::scatter(rmm::exec_policy(stream_view), + get_dataframe_buffer_begin(rx_values), + get_dataframe_buffer_end(rx_values), + rx_offsets.begin(), + vertex_value_output_first); + } +} + +template +void per_v_transform_reduce_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + OptionalKeyIterator sorted_unique_key_first, + OptionalKeyIterator sorted_unique_key_last, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first) +{ + constexpr bool update_major = (incoming == GraphViewType::is_storage_transposed); + constexpr bool use_input_key = !std::is_same_v; + + static_assert(update_major || !use_input_key); + static_assert( + ReduceOp::pure_function && + ((reduce_op::has_compatible_raft_comms_op_v && + reduce_op::has_identity_element_v) || + (update_major && + std::is_same_v>))); // current restriction, to support general + // reduction, we may need to take a less + // efficient code path + + using vertex_t = typename GraphViewType::vertex_type; + using edge_t = typename GraphViewType::edge_type; + using key_t = + typename iterator_value_type_or_default_t::value_type; + + using edge_partition_src_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeSrcValueInputWrapper::value_iterator, + typename EdgeSrcValueInputWrapper::value_type>>; + using edge_partition_dst_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_endpoint_dummy_property_device_view_t, + detail::edge_partition_endpoint_property_device_view_t< + vertex_t, + typename EdgeDstValueInputWrapper::value_iterator, + typename EdgeDstValueInputWrapper::value_type>>; + using edge_partition_e_input_device_view_t = std::conditional_t< + std::is_same_v, + detail::edge_partition_edge_dummy_property_device_view_t, + detail::edge_partition_edge_property_device_view_t< + edge_t, + typename EdgeValueInputWrapper::value_iterator, + typename EdgeValueInputWrapper::value_type>>; + + static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); + + constexpr bool use_bitmap = GraphViewType::is_multi_gpu && + !std::is_same_v && + std::is_same_v; + + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + + // 1. 
prepare key list + + auto sorted_unique_nzd_key_last = sorted_unique_key_last; + if constexpr (use_input_key) { + size_t partition_idx = 0; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + auto sorted_uniue_nzd_key_last = compute_key_lower_bound( + sorted_unique_key_first, + sorted_unique_key_last, + graph_view.local_vertex_partition_range_first() + ((*segment_offsets).rbegin() + 1), + handle.get_stream()); + } + } + + std::conditional_t, std::byte /* dummy */> + local_key_list_sizes{}; + if constexpr (use_input_key) { + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + local_key_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), + handle.get_stream()); + } else { + local_key_list_sizes = std::vector{ + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; + } + } + + std:: + conditional_t>, std::byte /* dummy */> + key_list_bitmap{}; + std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; + if constexpr (use_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto segment_offsets = + graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); + size_t bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(); + + std::tie(key_list_bitmap, use_bitmap_flags) = + compute_vertex_list_bitmap_info(minor_comm, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_first() + bool_size, + handle.get_stream()); + } + + // 2. compute subgroup_size, set-up temporary buffers & stream pool, and initialize + + [[maybe_unused]] std::conditional_t>, + int, + std::byte /* dummy */> + subgroup_size{}; + if constexpr (update_major && std::is_same_v>) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? 
std::max(num_gpus_per_node / minor_comm_size, int{1}) + : std::min(minor_comm_size, num_gpus_per_node); + } + + using minor_tmp_buffer_type = std::conditional_t, + edge_dst_property_t>; + [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + minor_tmp_buffer = std::make_unique(handle, graph_view); + } + + using edge_partition_minor_output_device_view_t = + std::conditional_tmutable_view().value_first())>, + void /* dummy */>; + + if constexpr (update_major) { // no vertices in the zero degree segment are visited + if constexpr (use_input_key) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_key_last), + init); + } else { + size_t partition_idx = 0; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + *((*segment_offsets).rbegin() + 1), + vertex_value_output_first + *((*segment_offsets).rbegin()), + init); + } + } + } else { + if constexpr (GraphViewType::is_multi_gpu) { + auto minor_init = init; + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may + // not store values for the entire minor range + minor_init = ReduceOp::identity_element; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; + } + fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); + } else { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + graph_view.local_vertex_partition_range_size(), + init); + } + } + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + if ((graph_view.local_edge_partition_segment_offsets(0)) && + (handle.get_stream_pool_size() >= max_segments)) { + for (size_t i = 1; i < graph_view.number_of_local_edge_partitions(); ++i) { + assert(graph_view.local_edge_partition_segment_offsets(i)); + } + + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + // memory footprint vs parallelism trade-off + // peak memory requirement per loop is + // update_major ? (use_input_key ? aggregate key list size : V) / comm_size * sizeof(T) : 0 + // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) + // FIXME: should we consider edge_partition_key_buffer as well? 
+ + size_t num_streams = + std::min(static_cast(minor_comm_size) * max_segments, + raft::round_down_safe(handle.get_stream_pool_size(), max_segments)); + if constexpr (update_major) { + size_t value_size{0}; + if constexpr (is_thrust_tuple_of_arithmetic::value) { + auto elem_sizes = compute_thrust_tuple_element_sizes{}(); + value_size = std::reduce(elem_sizes.begin(), elem_sizes.end()); + } else { + value_size = sizeof(T); + } + size_t key_size{0}; + if constexpr (use_input_key) { + if constexpr (std::is_same_v) { + key_size = sizeof(vertex_t); + } else { + key_size = sizeof(thrust::tuple_element<0, key_t>::type) + + sizeof(thrust::tuple_element<1, key_t>::type); + } + } + + auto num_edges = graph_view.compute_number_of_edges(handle); + + size_t aggregate_major_range_size{}; + if constexpr (use_input_key) { + aggregate_major_range_size = + host_scalar_allreduce(handle.get_comms(), + static_cast(thrust::distance(sorted_unique_key_first, + sorted_unique_nzd_key_last)), + raft::comms::op_t::SUM, + handle.get_stream()); + } else { + aggregate_major_range_size = graph_view.number_of_vertices(); + } + num_streams = std::min( + static_cast( + (aggregate_major_range_size > 0 + ? (static_cast(num_edges) / static_cast(aggregate_major_range_size)) + : double{0}) * + (static_cast(sizeof(vertex_t)) / static_cast(value_size + key_size))) * + max_segments, + num_streams); + } + + if (num_streams >= max_segments) { + assert((num_streams % max_segments) == 0); + stream_pool_indices = std::vector(num_streams); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + handle.sync_stream(); + } + } + } + + std::vector> major_tmp_buffers{}; + if constexpr (GraphViewType::is_multi_gpu && update_major) { + std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), + size_t{0}); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + if constexpr (use_input_key) { + major_tmp_buffer_sizes = local_key_list_sizes; + } else { + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + if (segment_offsets) { + major_tmp_buffer_sizes[i] = + *((*segment_offsets).rbegin() + 1); // exclude the zero degree segment + } else { + if constexpr (GraphViewType::is_storage_transposed) { + major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_dst_range_size(i); + } else { + major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_src_range_size(i); + } + } + } + } + if (stream_pool_indices) { + auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + major_tmp_buffers.reserve(num_concurrent_loops); + for (size_t i = 0; i < num_concurrent_loops; ++i) { + size_t max_size{0}; + for (size_t j = i; j < graph_view.number_of_local_edge_partitions(); + j += num_concurrent_loops) { + max_size = std::max(major_tmp_buffer_sizes[j], max_size); + } + major_tmp_buffers.push_back(allocate_dataframe_buffer(max_size, handle.get_stream())); + } + } else { + major_tmp_buffers.reserve(1); + major_tmp_buffers.push_back(allocate_dataframe_buffer( + *std::max_element(major_tmp_buffer_sizes.begin(), major_tmp_buffer_sizes.end()), + handle.get_stream())); + } + } else { // dummy + major_tmp_buffers.reserve(1); + major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); + } + + std::conditional_t>, + std::vector>, + std::byte /* dummy */> + offset_vectors{}; + std::conditional_t>, + std::vector>, + std::byte /* dummy */> + value_vectors{}; + if constexpr (update_major && std::is_same_v>) { + auto capacity 
= graph_view.number_of_local_edge_partitions() * + (graph_view.local_edge_partition_segment_offsets(0) ? max_segments : 1); + offset_vectors.reserve(capacity); + value_vectors.reserve(capacity); + + for (size_t i = 0; i < capacity; ++i) { + offset_vectors.emplace_back(0, handle.get_stream()); + value_vectors.emplace_back(0, handle.get_stream()); + } + } + + if (stream_pool_indices) { handle.sync_stream(); } + + // 3. proces local edge partitions + + auto edge_mask_view = graph_view.edge_mask_view(); + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, i) + : thrust::nullopt; + + auto major_init = ReduceOp::identity_element; + if constexpr (update_major) { + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + if constexpr (std::is_same_v>) { + major_init = init; // init is selected only when no edges return a valid value + } else { + major_init = (static_cast(i) == minor_comm_rank) ? init : ReduceOp::identity_element; + } + } else { + major_init = init; + } + } + + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + auto loop_stream = + stream_pool_indices + ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) + : handle.get_stream(); + + auto edge_partition_key_first = sorted_unique_key_first; + auto edge_partition_key_last = sorted_unique_nzd_key_last; + auto edge_partition_key_buffer = allocate_optional_dataframe_buffer< + std::conditional_t>(0, + loop_stream); + std::conditional_t>, std::byte /* dummy */> + key_segment_offsets{}; + if constexpr (use_input_key) { + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + + resize_optional_dataframe_buffer( + edge_partition_key_buffer, local_key_list_sizes[i], loop_stream); + + if constexpr (use_bitmap) { + std::variant, decltype(sorted_unique_key_first)> + v_list{}; + if (use_bitmap_flags[i]) { + v_list = raft::device_span((*key_list_bitmap).data(), + (*key_list_bitmap).size()); + } else { + v_list = sorted_unique_key_first; + } + auto bool_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) + : edge_partition.major_range_size(); + device_bcast_vertex_list(minor_comm, + v_list, + get_dataframe_buffer_begin(edge_partition_key_buffer), + edge_partition.major_range_first(), + edge_partition.major_range_first() + bool_size, + static_cast(thrust::distance( + sorted_unique_key_first, sorted_unique_nzd_key_last)), + static_cast(i), + loop_stream); + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffer), + local_key_list_sizes[i], + static_cast(i), + loop_stream); + } + + edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffer); + edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffer); + } + } + if (segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + edge_partition_key_first, + edge_partition_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + edge_partition.major_range_first(), + graph_view.use_dcs(), + loop_stream); + } else { + key_segment_offsets = std::nullopt; + } + } + RAFT_CUDA_TRY(cudaStreamSynchronize(loop_stream)); + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, i); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, i); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); + + auto major_buffer_first = + get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); + + std::conditional_t, + VertexValueOutputIterator> + output_buffer{}; + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (update_major) { + output_buffer = major_buffer_first; + } else { + output_buffer = edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); + } + } else { + output_buffer = vertex_value_output_first; + } + + using segment_key_iterator_t = + std::conditional_t; + + if (segment_offsets) { + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + + std::vector h_offsets{}; + if constexpr (use_input_key) { + h_offsets = (*key_segment_offsets); + } else { + h_offsets.resize((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + h_offsets.begin(), + [](vertex_t offset) { return static_cast(offset); }); + } + + // FIXME: we may further improve performance by 1) individually tuning block sizes for + // different segments; and 2) adding one more segment for very high degree vertices and + // running segmented reduction + if (edge_partition.dcs_nzd_vertex_count()) { + auto exec_stream = + stream_pool_indices + ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) + : handle.get_stream(); + + if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit + // every vertex in the hypersparse segment + thrust::fill(rmm::exec_policy(exec_stream), + output_buffer + h_offsets[3], + output_buffer + h_offsets[4], + major_init); + } + + auto segment_size = use_input_key + ? 
(h_offsets[4] - h_offsets[3]) + : static_cast(*(edge_partition.dcs_nzd_vertex_count())); + if (segment_size > 0) { + raft::grid_1d_thread_t update_grid(segment_size, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += h_offsets[3]; } + auto segment_key_first = edge_partition_key_first; + auto segment_key_last = edge_partition_key_last; + if constexpr (use_input_key) { + segment_key_first += h_offsets[3]; + segment_key_last += h_offsets[4]; + } else { + assert(segment_key_first == nullptr); + assert(segment_key_last == nullptr); + } + detail::per_v_transform_reduce_e_hypersparse + <<>>( + edge_partition, + segment_key_first, + segment_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } + if (h_offsets[3] - h_offsets[2]) { + auto exec_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((i * max_segments + 1) % + (*stream_pool_indices).size()) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += h_offsets[2]; } + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + segment_key_first += h_offsets[2]; + auto num_keys = h_offsets[3] - h_offsets[2]; + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + segment_key_first, + segment_key_first + (h_offsets[3] - h_offsets[2]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + if (h_offsets[2] - h_offsets[1] > 0) { + auto exec_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((i * max_segments + 2) % + (*stream_pool_indices).size()) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid(h_offsets[2] - h_offsets[1], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += h_offsets[1]; } + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + segment_key_first += h_offsets[1]; + detail::per_v_transform_reduce_e_mid_degree + <<>>( + edge_partition, + segment_key_first, + segment_key_first + (h_offsets[2] - h_offsets[1]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + ReduceOp::identity_element, + reduce_op, + pred_op); + } + if (h_offsets[1] > 0) { + auto exec_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((i * max_segments + 3) % + (*stream_pool_indices).size()) + : handle.get_stream(); + raft::grid_1d_block_t update_grid(h_offsets[1], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_high_degree + <<>>( + edge_partition, + segment_key_first, + segment_key_first + h_offsets[1], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + ReduceOp::identity_element, + reduce_op, + pred_op); + } + } else { + size_t num_keys{}; + if constexpr (use_input_key) { + num_keys = + static_cast(thrust::distance(edge_partition_key_first, edge_partition_key_last)); + } else { + num_keys = static_cast(edge_partition.major_range_size()); + } + + if (edge_partition.major_range_size() > 0) { + raft::grid_1d_thread_t update_grid(num_keys, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + segment_key_first, + segment_key_first + num_keys, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } + + if constexpr (GraphViewType::is_multi_gpu && update_major) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + if (segment_offsets && stream_pool_indices) { + if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[3]; + auto segment_size = (*segment_offsets)[4] - (*segment_offsets)[3]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = compute_offset_value_pairs( + minor_comm, + major_buffer_first + segment_offset, + major_buffer_first + (segment_offset + segment_size), + static_cast(i), + subgroup_size, + init, + segment_stream); + offset_vectors[i * max_segments + 3] = std::move(offsets); + value_vectors[i * max_segments + 3] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first + segment_offset, + vertex_value_output_first + segment_offset, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + segment_stream); + } + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % + (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[2]; + auto segment_size = (*segment_offsets)[3] - (*segment_offsets)[2]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = compute_offset_value_pairs( + minor_comm, + major_buffer_first + segment_offset, + major_buffer_first + (segment_offset + segment_size), + 
static_cast(i), + subgroup_size, + init, + segment_stream); + offset_vectors[i * max_segments + 2] = std::move(offsets); + value_vectors[i * max_segments + 2] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first + segment_offset, + vertex_value_output_first + segment_offset, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + segment_stream); + } + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % + (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[1]; + auto segment_size = (*segment_offsets)[2] - (*segment_offsets)[1]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = compute_offset_value_pairs( + minor_comm, + major_buffer_first + segment_offset, + major_buffer_first + (segment_offset + segment_size), + static_cast(i), + subgroup_size, + init, + segment_stream); + offset_vectors[i * max_segments + 1] = std::move(offsets); + value_vectors[i * max_segments + 1] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first + segment_offset, + vertex_value_output_first + segment_offset, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + segment_stream); + } + } + if ((*segment_offsets)[1] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % + (*stream_pool_indices).size()); + auto segment_size = (*segment_offsets)[1]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = + compute_offset_value_pairs(minor_comm, + major_buffer_first, + major_buffer_first + segment_size, + static_cast(i), + subgroup_size, + init, + segment_stream); + offset_vectors[i * max_segments] = std::move(offsets); + value_vectors[i * max_segments] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first, + vertex_value_output_first, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + segment_stream); + } + } + } else { + size_t reduction_size = static_cast( + segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size()); + if constexpr (std::is_same_v>) { + auto [offsets, values] = + compute_offset_value_pairs(minor_comm, + major_buffer_first, + major_buffer_first + reduction_size, + static_cast(i), + subgroup_size, + init, + handle.get_stream()); + offset_vectors[i] = std::move(offsets); + value_vectors[i] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first, + vertex_value_output_first, + reduction_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + handle.get_stream()); + } + } + } + + if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { + handle.sync_stream_pool( + *stream_pool_indices); // to prevent buffer over-write (this can happen as + // *segment_offsets do not necessarily coincide in different edge + // partitions). + } + } + + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + + // 4. 
communication + + if constexpr (update_major && std::is_same_v>) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + + if (segment_offsets && stream_pool_indices) { + if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[3]; + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i * max_segments + 3]), + std::move(value_vectors[i * max_segments + 3]), + vertex_value_output_first + segment_offset, + static_cast(i), + segment_stream); + } + if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % + (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[2]; + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i * max_segments + 2]), + std::move(value_vectors[i * max_segments + 2]), + vertex_value_output_first + segment_offset, + static_cast(i), + segment_stream); + } + if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % + (*stream_pool_indices).size()); + auto segment_offset = (*segment_offsets)[1]; + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i * max_segments + 1]), + std::move(value_vectors[i * max_segments + 1]), + vertex_value_output_first + segment_offset, + static_cast(i), + segment_stream); + } + if ((*segment_offsets)[1] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % + (*stream_pool_indices).size()); + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i * max_segments]), + std::move(value_vectors[i * max_segments]), + vertex_value_output_first, + static_cast(i), + segment_stream); + } + } else { + gather_offset_value_pairs_and_update_vertex_value_output(minor_comm, + std::move(offset_vectors[i]), + std::move(value_vectors[i]), + vertex_value_output_first, + static_cast(i), + handle.get_stream()); + } + } + } + + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + auto& comm = handle.get_comms(); + auto const comm_rank = comm.get_rank(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // applying the initial value is deferred to here + vertex_t max_vertex_partition_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + max_vertex_partition_size = + std::max(max_vertex_partition_size, + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); + } 
+ auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); + auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); + std::optional> minor_key_offsets{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); + } else { + minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); + } + for (int i = 0; i < major_comm_size; ++i) { + auto minor_init = (major_comm_rank == i) ? init : ReduceOp::identity_element; + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + thrust::fill(handle.get_thrust_policy(), + tx_buffer_first, + tx_buffer_first + + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), + minor_init); + auto value_first = thrust::make_transform_iterator( + view.value_first(), + cuda::proclaim_return_type( + [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); + thrust::scatter(handle.get_thrust_policy(), + value_first + (*minor_key_offsets)[i], + value_first + (*minor_key_offsets)[i + 1], + thrust::make_transform_iterator( + (*(view.keys())).begin() + (*minor_key_offsets)[i], + cuda::proclaim_return_type( + [key_first = graph_view.vertex_partition_range_first( + this_segment_vertex_partition_id)] __device__(auto key) { + return key - key_first; + })), + tx_buffer_first); + device_reduce(major_comm, + tx_buffer_first, + vertex_value_output_first, + static_cast( + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), + ReduceOp::compatible_raft_comms_op, + i, + handle.get_stream()); + } + } else { + auto first_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); + vertex_t minor_range_first = + graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); + for (int i = 0; i < major_comm_size; ++i) { + auto this_segment_vertex_partition_id = + compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + auto offset = graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - + minor_range_first; + device_reduce(major_comm, + view.value_first() + offset, + vertex_value_output_first, + static_cast( + graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), + ReduceOp::compatible_raft_comms_op, + i, + handle.get_stream()); + } + } + } +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh index e2e17ea8c90..5ba7edec894 100644 --- a/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_incoming_outgoing_e.cuh @@ -15,2069 +15,20 @@ */ #pragma once -#include "detail/graph_partition_utils.cuh" -#include "prims/detail/optional_dataframe_buffer.hpp" -#include "prims/detail/prim_functors.cuh" -#include "prims/fill_edge_src_dst_property.cuh" -#include "prims/property_op_utils.cuh" -#include "prims/reduce_op.cuh" +#include "prims/detail/per_v_transform_reduce_e.cuh" #include "prims/vertex_frontier.cuh" -#include -#include -#include #include #include -#include -#include -#include #include -#include -#include 
#include -#include -#include - -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include #include -#include #include namespace cugraph { -namespace detail { - -int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; - -template -struct iterator_value_type_or_default_t; - -template -struct iterator_value_type_or_default_t>> { - using value_type = default_t; // if Iterator is invalid (void*), value_type = default_t -}; - -template -struct iterator_value_type_or_default_t>> { - using value_type = typename thrust::iterator_traits< - Iterator>::value_type; // if iterator is valid, value_type = typename - // thrust::iterator_traits::value_type -}; - -template -struct transform_and_atomic_reduce_t { - edge_partition_device_view_t const& edge_partition{}; - vertex_t const* indices{nullptr}; - TransformOp const& transform_op{}; - PredOp const& pred_op{}; - ResultValueOutputIteratorOrWrapper& result_value_output{}; - - __device__ void operator()(edge_t i) const - { - if (pred_op(i)) { - auto e_op_result = transform_op(i); - auto minor = indices[i]; - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(minor); - if constexpr (multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } -}; - -template -__device__ void update_result_value_output( - edge_partition_device_view_t const& edge_partition, - vertex_t const* indices, - edge_t local_degree, - TransformOp const& transform_op, - result_t init, - ReduceOp const& reduce_op, - PredOp const& pred_op, - size_t output_idx /* relevent only when update_major === true */, - ResultValueOutputIteratorOrWrapper& result_value_output) -{ - if constexpr (update_major) { - result_t val{}; - if constexpr (std::is_same_v>) { - if constexpr (std::is_same_v>) { // init is selected only when no - // edges return a valid value - val = init; - for (edge_t i = 0; i < local_degree; ++i) { - auto tmp = transform_op(i); - val = tmp; - break; - } - } else { - val = thrust::transform_reduce(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_op, - init, - reduce_op); - } - } else { - val = init; - for (edge_t i = 0; i < local_degree; ++i) { - if (pred_op(i)) { - auto tmp = transform_op(i); - if constexpr (std::is_same_v>) { // init is selected only when - // no edges return a valid - // value - val = tmp; - break; - } else { - val = reduce_op(val, tmp); - } - } - } - } - *(result_value_output + output_idx) = val; - } else { - thrust::for_each(thrust::seq, - thrust::make_counting_iterator(edge_t{0}), - thrust::make_counting_iterator(local_degree), - transform_and_atomic_reduce_t{ - edge_partition, indices, transform_op, pred_op, result_value_output}); - } -} - -template -__global__ static void per_v_transform_reduce_e_hypersparse( - edge_partition_device_view_t edge_partition, - OptionalKeyIterator key_first, - OptionalKeyIterator key_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - ReduceOp reduce_op, - PredOp pred_op) -{ - 
constexpr bool use_input_key = !std::is_same_v; - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - static_assert(update_major || !use_input_key); - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = - typename iterator_value_type_or_default_t::value_type; - - constexpr bool const_true_pred_op = - std::is_same_v>; - using call_pred_op_t = std::conditional_t, - call_e_op_t>; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto idx = static_cast(tid); - - size_t key_count{}; - if constexpr (use_input_key) { - key_count = static_cast(thrust::distance(key_first, key_last)); - } else { - key_count = *(edge_partition.dcs_nzd_vertex_count()); - } - - while (idx < key_count) { - key_t key{}; - vertex_t major{}; - thrust::optional major_idx{}; - if constexpr (use_input_key) { - key = *(key_first + idx); - major = thrust_tuple_get_or_identity(key); - major_idx = edge_partition.major_idx_from_major_nocheck(major); - } else { - key = *(edge_partition.major_from_major_hypersparse_idx_nocheck(static_cast(idx))); - major = key; - auto major_start_offset = static_cast(*(edge_partition.major_hypersparse_first()) - - edge_partition.major_range_first()); - major_idx = major_start_offset + idx; // major_offset != major_idx in the hypersparse region - } - - size_t output_idx = use_input_key ? idx : (major - *(edge_partition).major_hypersparse_first()); - if (major_idx) { - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(*major_idx)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - key, - major_offset, - indices, - edge_offset}; - - call_pred_op_t call_pred_op{}; - if constexpr (!const_true_pred_op) { - call_pred_op = call_pred_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - pred_op, - key, - major_offset, - indices, - edge_offset}; - } - - if (edge_partition_e_mask) { - update_result_value_output( - edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_pred_op(edge_offset + i); - } else { - return false; - } - }, - output_idx, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - call_pred_op, - output_idx, - result_value_output); - } - } else { - if constexpr (update_major) { *(result_value_output + output_idx) = init; } - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_low_degree( - edge_partition_device_view_t edge_partition, - KeyIterator key_first, - KeyIterator key_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only 
if update_major == true */, - ReduceOp reduce_op, - PredOp pred_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = typename thrust::iterator_traits::value_type; - - constexpr bool const_true_pred_op = - std::is_same_v>; - using call_pred_op_t = std::conditional_t, - call_e_op_t>; - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto idx = static_cast(tid); - - while (idx < static_cast(thrust::distance(key_first, key_last))) { - auto key = *(key_first + idx); - auto major = thrust_tuple_get_or_identity(key); - - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = - edge_partition.local_edges(static_cast(major_offset)); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - key, - major_offset, - indices, - edge_offset}; - - call_pred_op_t call_pred_op{}; - if constexpr (!const_true_pred_op) { - call_pred_op = call_pred_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - pred_op, - key, - major_offset, - indices, - edge_offset}; - } - - if (edge_partition_e_mask) { - update_result_value_output( - edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_pred_op(edge_offset + i); - } else { - return false; - } - }, - idx, - result_value_output); - } else { - update_result_value_output(edge_partition, - indices, - local_degree, - call_e_op, - init, - reduce_op, - call_pred_op, - idx, - result_value_output); - } - idx += gridDim.x * blockDim.x; - } -} - -template -__global__ static void per_v_transform_reduce_e_mid_degree( - edge_partition_device_view_t edge_partition, - KeyIterator key_first, - KeyIterator key_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true && !std::is_same_v> */ - , - ReduceOp reduce_op, - PredOp pred_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - using key_t = typename thrust::iterator_traits::value_type; - - constexpr bool const_true_pred_op = - std::is_same_v>; - using call_pred_op_t = std::conditional_t, - call_e_op_t>; - - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); - auto const lane_id = tid % raft::warp_size(); - auto idx = static_cast(tid / raft::warp_size()); - - using 
WarpReduce = cub::WarpReduce< - std::conditional_t>, int32_t, e_op_result_t>>; - [[maybe_unused]] __shared__ - std::conditional_t - temp_storage[update_major ? (per_v_transform_reduce_e_kernel_block_size / raft::warp_size()) - : int32_t{1} /* dummy */]; - - while (idx < static_cast(thrust::distance(key_first, key_last))) { - auto key = *(key_first + idx); - auto major = thrust_tuple_get_or_identity(key); - - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - key, - major_offset, - indices, - edge_offset}; - - call_pred_op_t call_pred_op{}; - if constexpr (!const_true_pred_op) { - call_pred_op = call_pred_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - pred_op, - key, - major_offset, - indices, - edge_offset}; - } - - [[maybe_unused]] std::conditional_t - reduced_e_op_result{}; - [[maybe_unused]] std::conditional_t>, - int32_t, - std::byte /* dummy */> - first_valid_lane_id{}; - if constexpr (update_major) { reduced_e_op_result = (lane_id == 0) ? init : identity_element; } - if constexpr (update_major && std::is_same_v>) { - first_valid_lane_id = raft::warp_size(); - } - - if (edge_partition_e_mask) { - if constexpr (update_major && std::is_same_v>) { - auto rounded_up_local_degree = - ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); - for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { - thrust::optional e_op_result{thrust::nullopt}; - if ((i < static_cast(local_degree)) && - (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(edge_offset + i)) { - e_op_result = call_e_op(i); - } - first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } - if (first_valid_lane_id != raft::warp_size()) { break; } - } - } else { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if ((*edge_partition_e_mask).get(edge_offset + i) & call_pred_op(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } - } else { - if constexpr (update_major && std::is_same_v>) { - auto rounded_up_local_degree = - ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); - for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { - thrust::optional e_op_result{thrust::nullopt}; - if (i < static_cast(local_degree) && call_pred_op(i)) { - e_op_result = call_e_op(i); - } - first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { reduced_e_op_result = *e_op_result; } - if (first_valid_lane_id != raft::warp_size()) { break; } - } - } else { - for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if (call_pred_op(i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } - } - - if constexpr (update_major) { - if constexpr (std::is_same_v>) { - if (lane_id == ((first_valid_lane_id == raft::warp_size()) ? 
0 : first_valid_lane_id)) { - *(result_value_output + idx) = reduced_e_op_result; - } - } else { - reduced_e_op_result = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(reduced_e_op_result, reduce_op); - if (lane_id == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - } - - idx += gridDim.x * (blockDim.x / raft::warp_size()); - } -} - -template -__global__ static void per_v_transform_reduce_e_high_degree( - edge_partition_device_view_t edge_partition, - KeyIterator key_first, - KeyIterator key_last, - EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, - EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, - EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, - thrust::optional edge_partition_e_mask, - ResultValueOutputIteratorOrWrapper result_value_output, - EdgeOp e_op, - T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true && !std::is_same_v> */ - , - ReduceOp reduce_op, - PredOp pred_op) -{ - static_assert(update_major || reduce_op::has_compatible_raft_comms_op_v< - ReduceOp>); // atomic_reduce is defined only when - // has_compatible_raft_comms_op_t is true - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using e_op_result_t = T; - using key_t = typename thrust::iterator_traits::value_type; - - constexpr bool const_true_pred_op = - std::is_same_v>; - using call_pred_op_t = std::conditional_t, - call_e_op_t>; - - auto idx = static_cast(blockIdx.x); - - using BlockReduce = cub::BlockReduce< - std::conditional_t>, int32_t, e_op_result_t>, - per_v_transform_reduce_e_kernel_block_size>; - [[maybe_unused]] __shared__ - std::conditional_t - temp_storage; - [[maybe_unused]] __shared__ - std::conditional_t>, - int32_t, - std::byte /* dummy */> - output_thread_id; - - while (idx < static_cast(thrust::distance(key_first, key_last))) { - auto key = *(key_first + idx); - auto major = thrust_tuple_get_or_identity(key); - - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t const* indices{nullptr}; - edge_t edge_offset{}; - edge_t local_degree{}; - thrust::tie(indices, edge_offset, local_degree) = edge_partition.local_edges(major_offset); - - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - key, - major_offset, - indices, - edge_offset}; - - call_pred_op_t call_pred_op{}; - if constexpr (!const_true_pred_op) { - call_pred_op = call_pred_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - pred_op, - key, - major_offset, - indices, - edge_offset}; - } - - [[maybe_unused]] std::conditional_t - reduced_e_op_result{}; - [[maybe_unused]] std::conditional_t>, - int32_t, - std::byte /* dummy */> - first_valid_thread_id{}; - if constexpr (update_major) { - reduced_e_op_result = threadIdx.x == 0 ? 
init : identity_element; - } - if constexpr (update_major && std::is_same_v>) { - first_valid_thread_id = per_v_transform_reduce_e_kernel_block_size; - } - - if (edge_partition_e_mask) { - if constexpr (update_major && std::is_same_v>) { - auto rounded_up_local_degree = - ((static_cast(local_degree) + (per_v_transform_reduce_e_kernel_block_size - 1)) / - per_v_transform_reduce_e_kernel_block_size) * - per_v_transform_reduce_e_kernel_block_size; - for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { - thrust::optional e_op_result{thrust::nullopt}; - if ((i < static_cast(local_degree)) && - (*edge_partition_e_mask).get_(edge_offset + i) && call_pred_op(edge_offset + i)) { - e_op_result = call_e_op(i); - } - first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : per_v_transform_reduce_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - first_valid_thread_id = output_thread_id; - if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } - if (first_valid_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } - } - } else { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(edge_offset + i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } - } else { - if constexpr (update_major && std::is_same_v>) { - auto rounded_up_local_degree = - ((static_cast(local_degree) + (per_v_transform_reduce_e_kernel_block_size - 1)) / - per_v_transform_reduce_e_kernel_block_size) * - per_v_transform_reduce_e_kernel_block_size; - for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { - thrust::optional e_op_result{thrust::nullopt}; - if ((i < static_cast(local_degree)) && call_pred_op(i)) { - e_op_result = call_e_op(i); - } - first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : per_v_transform_reduce_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { reduced_e_op_result = *e_op_result; } - if (output_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } - } - } else { - for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if (call_pred_op(i)) { - auto e_op_result = call_e_op(i); - if constexpr (update_major) { - reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); - } else { - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(indices[i]); - if constexpr (GraphViewType::is_multi_gpu) { - reduce_op::atomic_reduce(result_value_output, minor_offset, e_op_result); - } else { - reduce_op::atomic_reduce(result_value_output + minor_offset, e_op_result); - } - } - } - } - } - } - - if constexpr (update_major) { - if constexpr (std::is_same_v>) { - if (threadIdx.x == ((first_valid_thread_id == per_v_transform_reduce_e_kernel_block_size) - ? 
0 - : first_valid_thread_id)) { - *(result_value_output + idx) = reduced_e_op_result; - } - } else { - reduced_e_op_result = BlockReduce(temp_storage).Reduce(reduced_e_op_result, reduce_op); - if (threadIdx.x == 0) { *(result_value_output + idx) = reduced_e_op_result; } - } - } - - idx += gridDim.x; - } -} - -template -__host__ __device__ int rank_to_priority( - int rank, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - int comm_size, - vertex_t offset /* to evenly distribute traffic */) -{ - if (rank == root) { // no need for communication (priority 0) - return int{0}; - } else if (rank / subgroup_size == - root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in - // [1, subgroup_size) - int modulo = subgroup_size - 1; - return int{1} + static_cast((static_cast(rank) + offset) % modulo); - } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) - int modulo = comm_size - subgroup_size; - return subgroup_size + static_cast((static_cast(rank) + offset) % modulo); - } -} - -template -__host__ __device__ int priority_to_rank( - int priority, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - int comm_size, - vertex_t offset /* to evenly distribute traffict */) -{ - if (priority == int{0}) { - return root; - } else if (priority < subgroup_size) { - int modulo = subgroup_size - int{1}; - return static_cast( - (static_cast(priority - int{1}) + (modulo - static_cast(offset % modulo))) % - modulo); - } else { - int modulo = comm_size - subgroup_size; - return static_cast((static_cast(priority - subgroup_size) + - (modulo - static_cast(offset % modulo))) % - modulo); - } -} - -template -rmm::device_uvector compute_keep_flags( - raft::comms::comms_t const& comm, - ValueIterator value_first, - ValueIterator value_last, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - typename thrust::iterator_traits::value_type init, - rmm::cuda_stream_view stream_view) -{ - auto const comm_rank = comm.get_rank(); - auto const comm_size = comm.get_size(); - - // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are - // more than one, the GPU with (comm_rank == root) has the highest priority, the GPUs in the same - // DGX node should be the next) - - rmm::device_uvector priorities(thrust::distance(value_first, value_last), - stream_view); - thrust::tabulate( - rmm::exec_policy(stream_view), - priorities.begin(), - priorities.end(), - [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { - auto val = *(value_first + offset); - return (val != init) - ? 
rank_to_priority( - comm_rank, root, subgroup_size, comm_size, static_cast(offset)) - : std::numeric_limits::max(); // lowest priority - }); - device_allreduce(comm, - priorities.data(), - priorities.data(), - priorities.size(), - raft::comms::op_t::MIN, - root, - stream_view); - - rmm::device_uvector keep_flags(priorities.size()); - auto offset_priority_pair_first = - thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); - thrust::transform(rmm::exec_policy(stream_view), - offset_priority_pair_first, - offset_priority_pair_first + priorities.size(), - keep_flags.begin(), - [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { - auto offset = thrust::get<0>(pair); - auto priority = thrust::get<1>(pair); - auto rank = - priority_to_rank(priority, root, subgroup_size, comm_size, offset); - return (rank == comm_rank); - }); - - return keep_flags; -} - -template -std::tuple, - dataframe_buffer_type_t::value_type>> -compute_offset_value_pairs(raft::comms::comms_t const& comm, - ValueIterator value_first, - ValueIterator value_last, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - typename thrust::iterator_traits::value_type init, - rmm::cuda_stream_view stream_view) -{ - using value_t = typename thrust::iterator_traits::value_type; - - auto const comm_rank = comm.get_rank(); - auto const comm_size = comm.get_size(); - - rmm::device_uvector keep_flags(0, stream_view); - if (comm_size <= std::numeric_limits::max()) { // priority == uint8_t - keep_flags = compute_keep_flags( - comm, value_first, value_last, root, subgroup_size, init, stream_view); - } else if (comm_size <= std::numeric_limits::max()) { // priority == uint16_t - keep_flags = compute_keep_flags( - comm, value_first, value_last, root, subgroup_size, init, stream_view); - } else { // priority_t == uint32_t - keep_flags = compute_keep_flags( - comm, value_first, value_last, root, subgroup_size, init, stream_view); - } - - auto copy_size = thrust::count_if( - rmm::exec_policy(stream_view), keep_flags.begin(), keep_flags.end(), thrust::identity{}); - - rmm::device_uvector offsets(copy_size, stream_view); - auto values = allocate_dataframe_buffer(copy_size, stream_view); - auto offset_value_pair_first = - thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), value_first); - thrust::copy_if(rmm::exec_policy(stream_view), - offset_value_pair_first, - offset_value_pair_first + keep_flags.size(), - keep_flags.begin(), - thrust::make_zip_iterator(offsets.begin(), dataframe_buffer_begin(values)), - thrust::identity{}); - - return std::make_tuple(std::move(offsets), std::move(values)); -} - -template -void gather_offset_value_pairs_and_update_vertex_value_output( - raft::comms::comms_t const& comm, - rmm::device_uvector&& offsets, - dataframe_buffer_type_t&& values, - VertexValueOutputIterator vertex_value_output_first, - int root, - rmm::cuda_stream_view stream_view) -{ - auto const comm_rank = comm.get_rank(); - - auto rx_sizes = host_scalar_gather(comm, offsets.size(), root, stream_view); - std::vector rx_displs{}; - rmm::device_uvector rx_offsets(0, stream_view); - if (comm_rank == root) { - rx_displs.resize(rx_sizes.size()); - std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); - rx_offsets.resize(rx_displs.back() + rx_sizes.back(), stream_view); - } - - device_gatherv(comm, - offsets.begin(), - rx_offsets.begin(), - offsets.size(), - rx_sizes, - rx_displs, - root, - stream_view); - offsets.resize(0, 
stream_view); - offsets.shrink_to_fit(stream_view); - - auto rx_values = allocate_dataframe_buffer(rx_offsets.size(), stream_view); - device_gatherv(comm, - get_dataframe_buffer_begin(values), - get_dataframe_buffer_begin(rx_values), - values.size(), - rx_sizes, - rx_displs, - root, - stream_view); - resize_dataframe_buffer(values, 0, stream_view); - shrink_to_fit_dataframe_buffer(values, stream_view); - - if (comm_rank == root) { - thrust::scatter(rmm::exec_policy(stream_view), - get_dataframe_buffer_begin(rx_values), - get_dataframe_buffer_end(rx_values), - rx_offsets.begin(), - vertex_value_output_first); - } -} - -template -void per_v_transform_reduce_e(raft::handle_t const& handle, - GraphViewType const& graph_view, - OptionalKeyIterator sorted_unique_key_first, - OptionalKeyIterator sorted_unique_key_last, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - T init, - ReduceOp reduce_op, - PredOp pred_op, - VertexValueOutputIterator vertex_value_output_first) -{ - constexpr bool update_major = (incoming == GraphViewType::is_storage_transposed); - constexpr bool use_input_key = !std::is_same_v; - - static_assert(update_major || !use_input_key); - static_assert( - ReduceOp::pure_function && - ((reduce_op::has_compatible_raft_comms_op_v && - reduce_op::has_identity_element_v) || - (update_major && - std::is_same_v>))); // current restriction, to support general - // reduction, we may need to take a less - // efficient code path - - using vertex_t = typename GraphViewType::vertex_type; - using edge_t = typename GraphViewType::edge_type; - using key_t = - typename iterator_value_type_or_default_t::value_type; - - using edge_partition_src_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeSrcValueInputWrapper::value_iterator, - typename EdgeSrcValueInputWrapper::value_type>>; - using edge_partition_dst_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_endpoint_dummy_property_device_view_t, - detail::edge_partition_endpoint_property_device_view_t< - vertex_t, - typename EdgeDstValueInputWrapper::value_iterator, - typename EdgeDstValueInputWrapper::value_type>>; - using edge_partition_e_input_device_view_t = std::conditional_t< - std::is_same_v, - detail::edge_partition_edge_dummy_property_device_view_t, - detail::edge_partition_edge_property_device_view_t< - edge_t, - typename EdgeValueInputWrapper::value_iterator, - typename EdgeValueInputWrapper::value_type>>; - - static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - - constexpr bool use_bitmap = GraphViewType::is_multi_gpu && - !std::is_same_v && - std::is_same_v; - - [[maybe_unused]] constexpr auto max_segments = - detail::num_sparse_segments_per_vertex_partition + size_t{1}; - - // 1. 
prepare key list - - auto sorted_unique_nzd_key_last = sorted_unique_key_last; - if constexpr (use_input_key) { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { - auto sorted_uniue_nzd_key_last = compute_key_lower_bound( - sorted_unique_key_first, - sorted_unique_key_last, - graph_view.local_vertex_partition_range_first() + ((*segment_offsets).rbegin() + 1), - handle.get_stream()); - } - } - - std::conditional_t, std::byte /* dummy */> - local_key_list_sizes{}; - if constexpr (use_input_key) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_key_list_sizes = host_scalar_allgather( - minor_comm, - static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), - handle.get_stream()); - } else { - local_key_list_sizes = std::vector{ - static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; - } - } - - std:: - conditional_t>, std::byte /* dummy */> - key_list_bitmap{}; - std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; - if constexpr (use_bitmap) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto segment_offsets = - graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); - size_t bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) - : graph_view.local_vertex_partition_range_size(); - - std::tie(key_list_bitmap, use_bitmap_flags) = - compute_vertex_list_bitmap_info(minor_comm, - sorted_unique_key_first, - sorted_unique_nzd_key_last, - graph_view.local_vertex_partition_range_first(), - graph_view.local_vertex_partition_range_first() + bool_size, - handle.get_stream()); - } - - // 2. compute subgroup_size, set-up temporary buffers & stream pool, and initialize - - [[maybe_unused]] std::conditional_t>, - int, - std::byte /* dummy */> - subgroup_size{}; - if constexpr (update_major && std::is_same_v>) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - int num_gpus_per_node{}; - RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); - subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm - ? 
std::max(num_gpus_per_node / minor_comm_size, int{1}) - : std::min(minor_comm_size, num_gpus_per_node); - } - - using minor_tmp_buffer_type = std::conditional_t, - edge_dst_property_t>; - [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - minor_tmp_buffer = std::make_unique(handle, graph_view); - } - - using edge_partition_minor_output_device_view_t = - std::conditional_tmutable_view().value_first())>, - void /* dummy */>; - - if constexpr (update_major) { // no vertices in the zero degree segment are visited - if constexpr (use_input_key) { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + - thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), - vertex_value_output_first + - thrust::distance(sorted_unique_key_first, sorted_unique_key_last), - init); - } else { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + *((*segment_offsets).rbegin() + 1), - vertex_value_output_first + *((*segment_offsets).rbegin()), - init); - } - } - } else { - if constexpr (GraphViewType::is_multi_gpu) { - auto minor_init = init; - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may - // not store values for the entire minor range - minor_init = ReduceOp::identity_element; - } else { - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; - } - fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); - } else { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first, - vertex_value_output_first + graph_view.local_vertex_partition_range_size(), - init); - } - } - - std::optional> stream_pool_indices{std::nullopt}; - if constexpr (GraphViewType::is_multi_gpu) { - if ((graph_view.local_edge_partition_segment_offsets(0)) && - (handle.get_stream_pool_size() >= max_segments)) { - for (size_t i = 1; i < graph_view.number_of_local_edge_partitions(); ++i) { - assert(graph_view.local_edge_partition_segment_offsets(i)); - } - - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is - // update_major ? (use_input_key ? aggregate key list size : V) / comm_size * sizeof(T) : 0 - // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - // FIXME: should we consider edge_partition_key_buffer as well? 
- - size_t num_streams = - std::min(static_cast(minor_comm_size) * max_segments, - raft::round_down_safe(handle.get_stream_pool_size(), max_segments)); - if constexpr (update_major) { - size_t value_size{0}; - if constexpr (is_thrust_tuple_of_arithmetic::value) { - auto elem_sizes = compute_thrust_tuple_element_sizes{}(); - value_size = std::reduce(elem_sizes.begin(), elem_sizes.end()); - } else { - value_size = sizeof(T); - } - size_t key_size{0}; - if constexpr (use_input_key) { - if constexpr (std::is_same_v) { - key_size = sizeof(vertex_t); - } else { - key_size = sizeof(thrust::tuple_element<0, key_t>::type) + - sizeof(thrust::tuple_element<1, key_t>::type); - } - } - - auto num_edges = graph_view.compute_number_of_edges(handle); - - size_t aggregate_major_range_size{}; - if constexpr (use_input_key) { - aggregate_major_range_size = - host_scalar_allreduce(handle.get_comms(), - static_cast(thrust::distance(sorted_unique_key_first, - sorted_unique_nzd_key_last)), - raft::comms::op_t::SUM, - handle.get_stream()); - } else { - aggregate_major_range_size = graph_view.number_of_vertices(); - } - num_streams = std::min( - static_cast( - (aggregate_major_range_size > 0 - ? (static_cast(num_edges) / static_cast(aggregate_major_range_size)) - : double{0}) * - (static_cast(sizeof(vertex_t)) / static_cast(value_size + key_size))) * - max_segments, - num_streams); - } - - if (num_streams >= max_segments) { - assert((num_streams % max_segments) == 0); - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - } - - std::vector> major_tmp_buffers{}; - if constexpr (GraphViewType::is_multi_gpu && update_major) { - std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), - size_t{0}); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - if constexpr (use_input_key) { - major_tmp_buffer_sizes = local_key_list_sizes; - } else { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - major_tmp_buffer_sizes[i] = - *((*segment_offsets).rbegin() + 1); // exclude the zero degree segment - } else { - if constexpr (GraphViewType::is_storage_transposed) { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_dst_range_size(i); - } else { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_src_range_size(i); - } - } - } - } - if (stream_pool_indices) { - auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; - major_tmp_buffers.reserve(num_concurrent_loops); - for (size_t i = 0; i < num_concurrent_loops; ++i) { - size_t max_size{0}; - for (size_t j = i; j < graph_view.number_of_local_edge_partitions(); - j += num_concurrent_loops) { - max_size = std::max(major_tmp_buffer_sizes[j], max_size); - } - major_tmp_buffers.push_back(allocate_dataframe_buffer(max_size, handle.get_stream())); - } - } else { - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer( - *std::max_element(major_tmp_buffer_sizes.begin(), major_tmp_buffer_sizes.end()), - handle.get_stream())); - } - } else { // dummy - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); - } - - std::conditional_t>, - std::vector>, - std::byte /* dummy */> - offset_vectors{}; - std::conditional_t>, - std::vector>, - std::byte /* dummy */> - value_vectors{}; - if constexpr (update_major && std::is_same_v>) { - auto capacity 
= graph_view.number_of_local_edge_partitions() * - (graph_view.local_edge_partition_segment_offsets(0) ? max_segments : 1); - offset_vectors.reserve(capacity); - value_vectors.reserve(capacity); - - for (size_t i = 0; i < capacity; ++i) { - offset_vectors.emplace_back(0, handle.get_stream()); - value_vectors.emplace_back(0, handle.get_stream()); - } - } - - if (stream_pool_indices) { handle.sync_stream(); } - - // 3. proces local edge partitions - - auto edge_mask_view = graph_view.edge_mask_view(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto major_init = ReduceOp::identity_element; - if constexpr (update_major) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - if constexpr (std::is_same_v>) { - major_init = init; // init is selected only when no edges return a valid value - } else { - major_init = (static_cast(i) == minor_comm_rank) ? init : ReduceOp::identity_element; - } - } else { - major_init = init; - } - } - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto loop_stream = - stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) - : handle.get_stream(); - - auto edge_partition_key_first = sorted_unique_key_first; - auto edge_partition_key_last = sorted_unique_nzd_key_last; - auto edge_partition_key_buffer = allocate_optional_dataframe_buffer< - std::conditional_t>(0, - loop_stream); - std::conditional_t>, std::byte /* dummy */> - key_segment_offsets{}; - if constexpr (use_input_key) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - auto const minor_comm_rank = minor_comm.get_rank(); - - resize_optional_dataframe_buffer( - edge_partition_key_buffer, local_key_list_sizes[i], loop_stream); - - if constexpr (use_bitmap) { - std::variant, decltype(sorted_unique_key_first)> - v_list{}; - if (use_bitmap_flags[i]) { - v_list = raft::device_span((*key_list_bitmap).data(), - (*key_list_bitmap).size()); - } else { - v_list = sorted_unique_key_first; - } - auto bool_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) - : edge_partition.major_range_size(); - device_bcast_vertex_list(minor_comm, - v_list, - get_dataframe_buffer_begin(edge_partition_key_buffer), - edge_partition.major_range_first(), - edge_partition.major_range_first() + bool_size, - static_cast(thrust::distance( - sorted_unique_key_first, sorted_unique_nzd_key_last)), - static_cast(i), - loop_stream); - } else { - device_bcast(minor_comm, - sorted_unique_key_first, - get_dataframe_buffer_begin(edge_partition_key_buffer), - local_key_list_sizes[i], - static_cast(i), - loop_stream); - } - - edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffer); - edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffer); - } - } - if (segment_offsets) { - key_segment_offsets = compute_key_segment_offsets( - edge_partition_key_first, - edge_partition_key_last, - raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), - edge_partition.major_range_first(), - graph_view.use_dcs(), - loop_stream); - } else { - key_segment_offsets = std::nullopt; - } - } - RAFT_CUDA_TRY(cudaStreamSynchronize(loop_stream)); - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); - } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); - - auto major_buffer_first = - get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); - - std::conditional_t, - VertexValueOutputIterator> - output_buffer{}; - if constexpr (GraphViewType::is_multi_gpu) { - if constexpr (update_major) { - output_buffer = major_buffer_first; - } else { - output_buffer = edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); - } - } else { - output_buffer = vertex_value_output_first; - } - - using segment_key_iterator_t = - std::conditional_t; - - if (segment_offsets) { - static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - - std::vector h_offsets{}; - if constexpr (use_input_key) { - h_offsets = (*key_segment_offsets); - } else { - h_offsets.resize((*segment_offsets).size()); - std::transform((*segment_offsets).begin(), - (*segment_offsets).end(), - h_offsets.begin(), - [](vertex_t offset) { return static_cast(offset); }); - } - - // FIXME: we may further improve performance by 1) individually tuning block sizes for - // different segments; and 2) adding one more segment for very high degree vertices and - // running segmented reduction - if (edge_partition.dcs_nzd_vertex_count()) { - auto exec_stream = - stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) - : handle.get_stream(); - - if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit - // every vertex in the hypersparse segment - thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + h_offsets[3], - output_buffer + h_offsets[4], - major_init); - } - - auto segment_size = use_input_key - ? 
(h_offsets[4] - h_offsets[3]) - : static_cast(*(edge_partition.dcs_nzd_vertex_count())); - if (segment_size > 0) { - raft::grid_1d_thread_t update_grid(segment_size, - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += h_offsets[3]; } - auto segment_key_first = edge_partition_key_first; - auto segment_key_last = edge_partition_key_last; - if constexpr (use_input_key) { - segment_key_first += h_offsets[3]; - segment_key_last += h_offsets[4]; - } else { - assert(segment_key_first == nullptr); - assert(segment_key_last == nullptr); - } - detail::per_v_transform_reduce_e_hypersparse - <<>>( - edge_partition, - segment_key_first, - segment_key_last, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - reduce_op, - pred_op); - } - } - if (h_offsets[3] - h_offsets[2]) { - auto exec_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += h_offsets[2]; } - segment_key_iterator_t segment_key_first{}; - if constexpr (use_input_key) { - segment_key_first = edge_partition_key_first; - } else { - segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); - } - segment_key_first += h_offsets[2]; - auto num_keys = h_offsets[3] - h_offsets[2]; - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - segment_key_first, - segment_key_first + (h_offsets[3] - h_offsets[2]), - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - reduce_op, - pred_op); - } - if (h_offsets[2] - h_offsets[1] > 0) { - auto exec_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_warp_t update_grid(h_offsets[2] - h_offsets[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += h_offsets[1]; } - segment_key_iterator_t segment_key_first{}; - if constexpr (use_input_key) { - segment_key_first = edge_partition_key_first; - } else { - segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); - } - segment_key_first += h_offsets[1]; - detail::per_v_transform_reduce_e_mid_degree - <<>>( - edge_partition, - segment_key_first, - segment_key_first + (h_offsets[2] - h_offsets[1]), - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op, - pred_op); - } - if (h_offsets[1] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_block_t update_grid(h_offsets[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - segment_key_iterator_t segment_key_first{}; - if constexpr (use_input_key) { - segment_key_first = edge_partition_key_first; - } else { - segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); - } - detail::per_v_transform_reduce_e_high_degree - <<>>( - edge_partition, - segment_key_first, - segment_key_first + h_offsets[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - ReduceOp::identity_element, - reduce_op, - pred_op); - } - } else { - size_t num_keys{}; - if constexpr (use_input_key) { - num_keys = - static_cast(thrust::distance(edge_partition_key_first, edge_partition_key_last)); - } else { - num_keys = static_cast(edge_partition.major_range_size()); - } - - if (edge_partition.major_range_size() > 0) { - raft::grid_1d_thread_t update_grid(num_keys, - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - segment_key_iterator_t segment_key_first{}; - if constexpr (use_input_key) { - segment_key_first = edge_partition_key_first; - } else { - segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); - } - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - segment_key_first, - segment_key_first + num_keys, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - reduce_op, - pred_op); - } - } - - if constexpr (GraphViewType::is_multi_gpu && update_major) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - if (segment_offsets && stream_pool_indices) { - if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[3]; - auto segment_size = (*segment_offsets)[4] - (*segment_offsets)[3]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = compute_offset_value_pairs( - minor_comm, - major_buffer_first + segment_offset, - major_buffer_first + (segment_offset + segment_size), - static_cast(i), - subgroup_size, - init, - segment_stream); - offset_vectors[i * max_segments + 3] = std::move(offsets); - value_vectors[i * max_segments + 3] = std::move(values); - } else { - device_reduce(minor_comm, - major_buffer_first + segment_offset, - vertex_value_output_first + segment_offset, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - segment_stream); - } - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[2]; - auto segment_size = (*segment_offsets)[3] - (*segment_offsets)[2]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = compute_offset_value_pairs( - minor_comm, - major_buffer_first + segment_offset, - major_buffer_first + (segment_offset + segment_size), - 
static_cast(i), - subgroup_size, - init, - segment_stream); - offset_vectors[i * max_segments + 2] = std::move(offsets); - value_vectors[i * max_segments + 2] = std::move(values); - } else { - device_reduce(minor_comm, - major_buffer_first + segment_offset, - vertex_value_output_first + segment_offset, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - segment_stream); - } - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[1]; - auto segment_size = (*segment_offsets)[2] - (*segment_offsets)[1]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = compute_offset_value_pairs( - minor_comm, - major_buffer_first + segment_offset, - major_buffer_first + (segment_offset + segment_size), - static_cast(i), - subgroup_size, - init, - segment_stream); - offset_vectors[i * max_segments + 1] = std::move(offsets); - value_vectors[i * max_segments + 1] = std::move(values); - } else { - device_reduce(minor_comm, - major_buffer_first + segment_offset, - vertex_value_output_first + segment_offset, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - segment_stream); - } - } - if ((*segment_offsets)[1] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()); - auto segment_size = (*segment_offsets)[1]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = - compute_offset_value_pairs(minor_comm, - major_buffer_first, - major_buffer_first + segment_size, - static_cast(i), - subgroup_size, - init, - segment_stream); - offset_vectors[i * max_segments] = std::move(offsets); - value_vectors[i * max_segments] = std::move(values); - } else { - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - segment_stream); - } - } - } else { - size_t reduction_size = static_cast( - segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size()); - if constexpr (std::is_same_v>) { - auto [offsets, values] = - compute_offset_value_pairs(minor_comm, - major_buffer_first, - major_buffer_first + reduction_size, - static_cast(i), - subgroup_size, - init, - handle.get_stream()); - offset_vectors[i] = std::move(offsets); - value_vectors[i] = std::move(values); - } else { - device_reduce(minor_comm, - major_buffer_first, - vertex_value_output_first, - reduction_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream()); - } - } - } - - if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { - handle.sync_stream_pool( - *stream_pool_indices); // to prevent buffer over-write (this can happen as - // *segment_offsets do not necessarily coincide in different edge - // partitions). - } - } - - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - - // 4. 
communication - - if constexpr (update_major && std::is_same_v>) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - - if (segment_offsets && stream_pool_indices) { - if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[3]; - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i * max_segments + 3]), - std::move(value_vectors[i * max_segments + 3]), - vertex_value_output_first + segment_offset, - static_cast(i), - segment_stream); - } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[2]; - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i * max_segments + 2]), - std::move(value_vectors[i * max_segments + 2]), - vertex_value_output_first + segment_offset, - static_cast(i), - segment_stream); - } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[1]; - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i * max_segments + 1]), - std::move(value_vectors[i * max_segments + 1]), - vertex_value_output_first + segment_offset, - static_cast(i), - segment_stream); - } - if ((*segment_offsets)[1] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()); - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i * max_segments]), - std::move(value_vectors[i * max_segments]), - vertex_value_output_first, - static_cast(i), - segment_stream); - } - } else { - gather_offset_value_pairs_and_update_vertex_value_output(minor_comm, - std::move(offset_vectors[i]), - std::move(value_vectors[i]), - vertex_value_output_first, - static_cast(i), - handle.get_stream()); - } - } - } - - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // applying the initial value is deferred to here - vertex_t max_vertex_partition_size{0}; - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - max_vertex_partition_size = - std::max(max_vertex_partition_size, - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)); - } 
- auto tx_buffer = allocate_dataframe_buffer(max_vertex_partition_size, handle.get_stream()); - auto tx_buffer_first = get_dataframe_buffer_begin(tx_buffer); - std::optional> minor_key_offsets{}; - if constexpr (GraphViewType::is_storage_transposed) { - minor_key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); - } else { - minor_key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); - } - for (int i = 0; i < major_comm_size; ++i) { - auto minor_init = (major_comm_rank == i) ? init : ReduceOp::identity_element; - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - thrust::fill(handle.get_thrust_policy(), - tx_buffer_first, - tx_buffer_first + - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id), - minor_init); - auto value_first = thrust::make_transform_iterator( - view.value_first(), - cuda::proclaim_return_type( - [reduce_op, minor_init] __device__(auto val) { return reduce_op(val, minor_init); })); - thrust::scatter(handle.get_thrust_policy(), - value_first + (*minor_key_offsets)[i], - value_first + (*minor_key_offsets)[i + 1], - thrust::make_transform_iterator( - (*(view.keys())).begin() + (*minor_key_offsets)[i], - cuda::proclaim_return_type( - [key_first = graph_view.vertex_partition_range_first( - this_segment_vertex_partition_id)] __device__(auto key) { - return key - key_first; - })), - tx_buffer_first); - device_reduce(major_comm, - tx_buffer_first, - vertex_value_output_first, - static_cast( - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } else { - auto first_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(0); - vertex_t minor_range_first = - graph_view.vertex_partition_range_first(first_segment_vertex_partition_id); - for (int i = 0; i < major_comm_size; ++i) { - auto this_segment_vertex_partition_id = - compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - auto offset = graph_view.vertex_partition_range_first(this_segment_vertex_partition_id) - - minor_range_first; - device_reduce(major_comm, - view.value_first() + offset, - vertex_value_output_first, - static_cast( - graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), - ReduceOp::compatible_raft_comms_op, - i, - handle.get_stream()); - } - } - } -} - -} // namespace detail - /** * @brief Iterate over every vertex's incoming edges to update vertex properties. * From ec247580928ea285e5e5ca8aa0ded48794383470 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 9 Aug 2024 17:18:05 -0700 Subject: [PATCH 019/126] implement per_v_transform_reduce_if_incoming|outgoing_e --- ...ransform_reduce_if_incoming_outgoing_e.cuh | 421 ++++++++++++++++++ 1 file changed, 421 insertions(+) create mode 100644 cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh diff --git a/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh new file mode 100644 index 00000000000..1e0d366429e --- /dev/null +++ b/cpp/src/prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+ * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "prims/detail/per_v_transform_reduce_e.cuh" +#include "prims/vertex_frontier.cuh" + +#include +#include +#include + +#include + +#include +#include + +namespace cugraph { + +/** + * @brief Iterate over every vertex's incoming edges to update vertex properties. + * + * This function is inspired by thrust::transform_reduce. In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to + * fill the wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. 
+ * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the vertex property variables for the first + * (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.local_vertex_partition_range_size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the incoming + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). 
Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 incoming edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_incoming_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(GraphViewType::is_storage_transposed); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = true; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief Iterate over every vertex's outgoing edges to update vertex properties. 
+ * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. + * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be added to the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. 
+ * @param vertex_value_output_first Iterator pointing to the vertex property variables for the + * first (inclusive) vertex (assigned to this process in multi-GPU). `vertex_value_output_last` + * (exclusive) is deduced as @p vertex_value_output_first + @p + * graph_view.local_vertex_partition_range_size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + static_cast(nullptr), + static_cast(nullptr), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +/** + * @brief For each (tagged-)vertex in the input (tagged-)vertex list, iterate over the outgoing + * edges to update (tagged-)vertex properties. + * + * This function is inspired by thrust::transform_reduce(). In addition, this function excludes the + * edges that return false when the predicate @p pred_op is applied. + * + * @tparam GraphViewType Type of the passed non-owning graph object. + * @tparam KeyBucketType Type of the key bucket class which abstracts the current (tagged-)vertex + * list. + * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. + * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. + * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. + * @tparam EdgeOp Type of the quinary edge operator. + * @tparam ReduceOp Type of the binary reduction operator. + * @tparam PredOp Type of the quinary predicate operator. + * @tparam T Type of the initial value for per-vertex reduction. + * @tparam VertexValueOutputIterator Type of the iterator for vertex output property variables. + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param graph_view Non-owning graph object. + * @param key_list KeyBucketType class object to store the (tagged-)vertex list to update + * (tagged-)vertex properties. + * @param edge_src_value_input Wrapper used to access source input property values (for the edge + * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() + * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() + * (if @p e_op does not access source property values). Use update_edge_src_property to fill the + * wrapper. + * @param edge_dst_value_input Wrapper used to access destination input property values (for the + * edge destinations assigned to this process in multi-GPU). Use either + * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or + * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property + * values). Use update_edge_dst_property to fill the wrapper. 
+ * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned + * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to + * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not + * access edge property values). + * @param e_op Quinary operator takes edge source, edge destination, property values for the source, + * destination, and edge and returns a value to be reduced. + * @param init Initial value to be reduced with the reduced @p e_op return values for each vertex. + * If @p reduce_op is cugraph::reduce_op::any, init value is never selected except for the + * (tagged-)vertices with 0 outgoing edges. + * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. + * There are pre-defined reduction operators in src/prims/reduce_op.cuh. It is + * recommended to use the pre-defined reduction operators whenever possible as the current (and + * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has + * known member variables) to take a more optimized code path. See the documentation in the + * reduce_op.cuh file for instructions on writing custom reduction operators. + * @param pred_op Quinary operator takes edge source, edge destination, property values for the + * source, destination, and edge and returns whether this edge should be included (if true is + * returned) or excluded. + * @param vertex_value_output_first Iterator pointing to the (tagged-)vertex property variables for + * the first (inclusive) (tagged-)vertex in @p key_list. `vertex_value_output_last` (exclusive) is + * deduced as @p vertex_value_output_first + @p key_list.size(). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). 
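+ *
+ * A minimal calling sketch (illustrative only; the bucket, visited-flag property, and output
+ * iterator names below are hypothetical and not part of this API):
+ * @code
+ * // for every key (source) vertex in the bucket, record any adjacent destination that is
+ * // already visited, or keep invalid_vertex if no outgoing edge passes the predicate
+ * per_v_transform_reduce_if_outgoing_e(
+ *   handle,
+ *   graph_view,
+ *   vertex_frontier.bucket(bucket_idx),
+ *   edge_src_dummy_property_t{}.view(),
+ *   edge_dst_visited_flags.view(),
+ *   edge_dummy_property_t{}.view(),
+ *   [] __device__(auto, auto dst, auto, auto, auto) { return dst; },
+ *   invalid_vertex,
+ *   reduce_op::any<vertex_t>{},
+ *   [] __device__(auto, auto, auto, bool dst_visited, auto) { return dst_visited; },
+ *   predecessor_first);
+ * @endcode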
+ */ +template +void per_v_transform_reduce_if_outgoing_e(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& key_list, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + T init, + ReduceOp reduce_op, + PredOp pred_op, + VertexValueOutputIterator vertex_value_output_first, + bool do_expensive_check = false) +{ + static_assert(!GraphViewType::is_storage_transposed); + static_assert(KeyBucketType::is_sorted_unique); + + if (do_expensive_check) { + // currently, nothing to do + } + + constexpr bool incoming = false; + + detail::per_v_transform_reduce_e(handle, + graph_view, + key_list.begin(), + key_list.end(), + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + init, + reduce_op, + pred_op, + vertex_value_output_first); +} + +} // namespace cugraph From df751e7d84835fca15d96acd46a743d458dd5bb0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 13 Aug 2024 00:30:02 -0700 Subject: [PATCH 020/126] update BFS to use per_v_transform_reduce_if_outoging_e --- .../detail/extract_transform_v_frontier_e.cuh | 49 +- .../prims/detail/per_v_transform_reduce_e.cuh | 655 +++++++++--------- cpp/src/prims/vertex_frontier.cuh | 81 ++- cpp/src/traversal/bfs_impl.cuh | 78 ++- 4 files changed, 466 insertions(+), 397 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 79203af08c3..183e85765c5 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -798,22 +798,24 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, if constexpr (use_bitmap) { std::variant, decltype(frontier_key_first)> v_list{}; if (use_bitmap_flags[i]) { - v_list = raft::device_span((*frontier_bitmap).data(), - (*frontier_bitmap).size()); + v_list = (static_cast(i) == minor_comm_rank) + ? raft::device_span((*frontier_bitmap).data(), + (*frontier_bitmap).size()) + : raft::device_span(static_cast(nullptr), + size_t{0}); } else { v_list = frontier_key_first; } auto bool_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) : edge_partition.major_range_size(); - device_bcast_vertex_list( - minor_comm, - v_list, - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - edge_partition.major_range_first(), - edge_partition.major_range_first() + bool_size, - static_cast(thrust::distance(frontier_key_first, frontier_key_last)), - static_cast(i), - handle.get_stream()); + device_bcast_vertex_list(minor_comm, + v_list, + get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), + edge_partition.major_range_first(), + edge_partition.major_range_first() + bool_size, + local_frontier_sizes[i], + static_cast(i), + handle.get_stream()); } else { device_bcast(minor_comm, frontier_key_first, @@ -872,21 +874,20 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_frontier_major_last, raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), edge_partition.major_range_first(), - graph_view.use_dcs(), handle.get_stream()); // FIXME: we may further improve performance by 1) concurrently running kernels on different // segments; 2) individually tuning block sizes for different segments; and 3) adding one // more segment for very high degree vertices and running segmented reduction - if (h_offsets[0] > 0) { - raft::grid_1d_block_t update_grid(h_offsets[0], + if (h_offsets[1] > 0) { + raft::grid_1d_block_t update_grid(h_offsets[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_high_degree <<>>( edge_partition, edge_partition_frontier_key_first, - edge_partition_frontier_key_first + h_offsets[0], + edge_partition_frontier_key_first + h_offsets[1], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -896,15 +897,15 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, tmp_buffer_idx.data(), e_op); } - if (h_offsets[1] - h_offsets[0] > 0) { - raft::grid_1d_warp_t update_grid(h_offsets[1] - h_offsets[0], + if (h_offsets[2] - h_offsets[1] > 0) { + raft::grid_1d_warp_t update_grid(h_offsets[2] - h_offsets[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_mid_degree <<>>( edge_partition, - edge_partition_frontier_key_first + h_offsets[0], edge_partition_frontier_key_first + h_offsets[1], + edge_partition_frontier_key_first + h_offsets[2], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -914,8 +915,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, tmp_buffer_idx.data(), e_op); } - if (h_offsets[2] - h_offsets[1] > 0) { - raft::grid_1d_thread_t update_grid(h_offsets[2] - h_offsets[1], + if (h_offsets[3] - h_offsets[2] > 0) { + raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, - edge_partition_frontier_key_first + h_offsets[1], edge_partition_frontier_key_first + h_offsets[2], + edge_partition_frontier_key_first + h_offsets[3], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -934,8 +935,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, tmp_buffer_idx.data(), e_op); } - if (edge_partition.dcs_nzd_vertex_count() && (h_offsets[3] - h_offsets[2] > 0)) { - raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], + if 
(edge_partition.dcs_nzd_vertex_count() && (h_offsets[4] - h_offsets[3] > 0)) { + raft::grid_1d_thread_t update_grid(h_offsets[4] - h_offsets[3], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, - edge_partition_frontier_key_first + h_offsets[2], edge_partition_frontier_key_first + h_offsets[3], + edge_partition_frontier_key_first + h_offsets[4], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 6ba3dd5d070..bbbf8e54a44 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -86,6 +86,52 @@ struct iterator_value_type_or_default_t::value_type }; +template +__device__ auto init_pred_op( + edge_partition_device_view_t const& edge_partition, + EdgePartitionSrcValueInputWrapper const& edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper const& edge_partition_dst_value_input, + EdgePartitionEdgeValueInputWrapper const& edge_partition_e_value_input, + PredOp const& pred_op, + key_t key, + typename GraphViewType::vertex_type major_offset, + typename GraphViewType::vertex_type const* indices, + typename GraphViewType::edge_type edge_offset) +{ + if constexpr (std::is_same_v< + PredOp, + const_true_e_op_t>) { + return call_const_true_e_op_t{}; + } else { + return call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset}; + } +} + template ::value_type; - constexpr bool const_true_pred_op = - std::is_same_v>; - using call_pred_op_t = std::conditional_t, - call_e_op_t>; - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; auto idx = static_cast(tid); @@ -299,18 +328,15 @@ __global__ static void per_v_transform_reduce_e_hypersparse( indices, edge_offset}; - call_pred_op_t call_pred_op{}; - if constexpr (!const_true_pred_op) { - call_pred_op = call_pred_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - pred_op, - key, - major_offset, - indices, - edge_offset}; - } + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); if (edge_partition_e_mask) { update_result_value_output( @@ -322,7 +348,7 @@ __global__ static void per_v_transform_reduce_e_hypersparse( reduce_op, [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_pred_op(edge_offset + i); + return call_pred_op(i); } else { return false; } @@ -386,24 +412,8 @@ __global__ static void per_v_transform_reduce_e_low_degree( using edge_t = typename GraphViewType::edge_type; using key_t = typename thrust::iterator_traits::value_type; - constexpr bool const_true_pred_op = - std::is_same_v>; - using call_pred_op_t = std::conditional_t, - call_e_op_t>; - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; - auto idx = static_cast(tid); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; + auto idx = static_cast(tid); while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); @@ -431,18 +441,15 @@ 
__global__ static void per_v_transform_reduce_e_low_degree( indices, edge_offset}; - call_pred_op_t call_pred_op{}; - if constexpr (!const_true_pred_op) { - call_pred_op = call_pred_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - pred_op, - key, - major_offset, - indices, - edge_offset}; - } + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); if (edge_partition_e_mask) { update_result_value_output( @@ -454,7 +461,7 @@ __global__ static void per_v_transform_reduce_e_low_degree( reduce_op, [&edge_partition_e_mask, &call_pred_op, edge_offset] __device__(edge_t i) { if ((*edge_partition_e_mask).get(edge_offset + i)) { - return call_pred_op(edge_offset + i); + return call_pred_op(i); } else { return false; } @@ -504,9 +511,7 @@ __global__ static void per_v_transform_reduce_e_mid_degree( ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true && !std::is_same_v> */ - , + T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op, PredOp pred_op) { @@ -519,23 +524,6 @@ __global__ static void per_v_transform_reduce_e_mid_degree( using e_op_result_t = T; using key_t = typename thrust::iterator_traits::value_type; - constexpr bool const_true_pred_op = - std::is_same_v>; - using call_pred_op_t = std::conditional_t, - call_e_op_t>; - auto const tid = threadIdx.x + blockIdx.x * blockDim.x; static_assert(per_v_transform_reduce_e_kernel_block_size % raft::warp_size() == 0); auto const lane_id = tid % raft::warp_size(); @@ -573,18 +561,15 @@ __global__ static void per_v_transform_reduce_e_mid_degree( indices, edge_offset}; - call_pred_op_t call_pred_op{}; - if constexpr (!const_true_pred_op) { - call_pred_op = call_pred_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - pred_op, - key, - major_offset, - indices, - edge_offset}; - } + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); [[maybe_unused]] std::conditional_t reduced_e_op_result{}; @@ -592,9 +577,12 @@ __global__ static void per_v_transform_reduce_e_mid_degree( int32_t, std::byte /* dummy */> first_valid_lane_id{}; - if constexpr (update_major) { reduced_e_op_result = (lane_id == 0) ? init : identity_element; } - if constexpr (update_major && std::is_same_v>) { - first_valid_lane_id = raft::warp_size(); + if constexpr (update_major) { + reduced_e_op_result = + (lane_id == 0) ? 
init : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_lane_id = raft::warp_size(); + } } if (edge_partition_e_mask) { @@ -605,7 +593,7 @@ __global__ static void per_v_transform_reduce_e_mid_degree( for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { thrust::optional e_op_result{thrust::nullopt}; if ((i < static_cast(local_degree)) && - (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(edge_offset + i)) { + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { e_op_result = call_e_op(i); } first_valid_lane_id = WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) @@ -616,7 +604,7 @@ __global__ static void per_v_transform_reduce_e_mid_degree( } } else { for (edge_t i = lane_id; i < local_degree; i += raft::warp_size()) { - if ((*edge_partition_e_mask).get(edge_offset + i) & call_pred_op(edge_offset + i)) { + if ((*edge_partition_e_mask).get(edge_offset + i) & call_pred_op(i)) { auto e_op_result = call_e_op(i); if constexpr (update_major) { reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); @@ -710,9 +698,7 @@ __global__ static void per_v_transform_reduce_e_high_degree( ResultValueOutputIteratorOrWrapper result_value_output, EdgeOp e_op, T init /* relevant only if update_major == true */, - T identity_element /* relevant only if update_major == true && !std::is_same_v> */ - , + T identity_element /* relevant only if update_major == true */, ReduceOp reduce_op, PredOp pred_op) { @@ -725,23 +711,6 @@ __global__ static void per_v_transform_reduce_e_high_degree( using e_op_result_t = T; using key_t = typename thrust::iterator_traits::value_type; - constexpr bool const_true_pred_op = - std::is_same_v>; - using call_pred_op_t = std::conditional_t, - call_e_op_t>; - auto idx = static_cast(blockIdx.x); using BlockReduce = cub::BlockReduce< @@ -781,18 +750,15 @@ __global__ static void per_v_transform_reduce_e_high_degree( indices, edge_offset}; - call_pred_op_t call_pred_op{}; - if constexpr (!const_true_pred_op) { - call_pred_op = call_pred_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - pred_op, - key, - major_offset, - indices, - edge_offset}; - } + auto call_pred_op = init_pred_op(edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + pred_op, + key, + major_offset, + indices, + edge_offset); [[maybe_unused]] std::conditional_t reduced_e_op_result{}; @@ -801,10 +767,12 @@ __global__ static void per_v_transform_reduce_e_high_degree( std::byte /* dummy */> first_valid_thread_id{}; if constexpr (update_major) { - reduced_e_op_result = threadIdx.x == 0 ? init : identity_element; - } - if constexpr (update_major && std::is_same_v>) { - first_valid_thread_id = per_v_transform_reduce_e_kernel_block_size; + reduced_e_op_result = threadIdx.x == 0 + ? 
init + : identity_element; // init == identity_element for reduce_op::any + if constexpr (std::is_same_v>) { + first_valid_thread_id = per_v_transform_reduce_e_kernel_block_size; + } } if (edge_partition_e_mask) { @@ -816,7 +784,7 @@ __global__ static void per_v_transform_reduce_e_high_degree( for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { thrust::optional e_op_result{thrust::nullopt}; if ((i < static_cast(local_degree)) && - (*edge_partition_e_mask).get_(edge_offset + i) && call_pred_op(edge_offset + i)) { + (*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { e_op_result = call_e_op(i); } first_valid_thread_id = @@ -831,7 +799,7 @@ __global__ static void per_v_transform_reduce_e_high_degree( } } else { for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { - if ((*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(edge_offset + i)) { + if ((*edge_partition_e_mask).get(edge_offset + i) && call_pred_op(i)) { auto e_op_result = call_e_op(i); if constexpr (update_major) { reduced_e_op_result = reduce_op(reduced_e_op_result, e_op_result); @@ -863,8 +831,9 @@ __global__ static void per_v_transform_reduce_e_high_degree( cub::Min()); if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } __syncthreads(); - if (threadIdx.x == output_thread_id) { reduced_e_op_result = *e_op_result; } - if (output_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } + first_valid_thread_id = output_thread_id; + if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } + if (first_valid_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } } } else { for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { @@ -915,11 +884,20 @@ __host__ __device__ int rank_to_priority( } else if (rank / subgroup_size == root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in // [1, subgroup_size) - int modulo = subgroup_size - 1; - return int{1} + static_cast((static_cast(rank) + offset) % modulo); + int modulo = subgroup_size - 1; + auto rank_dist = (rank + subgroup_size - root) % subgroup_size; + return 1 + ((rank_dist - 1) + (offset % modulo)) % modulo; } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) int modulo = comm_size - subgroup_size; - return subgroup_size + static_cast((static_cast(rank) + offset) % modulo); + auto subgroup_dist = + ((rank / subgroup_size) + (comm_size / subgroup_size) - (root / subgroup_size)) % + (comm_size / subgroup_size); + auto intra_subgroup_rank_dist = + ((rank % subgroup_size) + subgroup_size - (root % subgroup_size)) % subgroup_size; + return subgroup_size + + ((subgroup_dist * subgroup_size + intra_subgroup_rank_dist - subgroup_size) + + (offset % modulo)) % + modulo; } } @@ -934,15 +912,17 @@ __host__ __device__ int priority_to_rank( if (priority == int{0}) { return root; } else if (priority < subgroup_size) { - int modulo = subgroup_size - int{1}; - return static_cast( - (static_cast(priority - int{1}) + (modulo - static_cast(offset % modulo))) % - modulo); + int modulo = subgroup_size - 1; + auto rank_dist = 1 + (priority - 1 + modulo - (offset % modulo)) % modulo; + return (root + rank_dist) % subgroup_size; } else { int modulo = comm_size - subgroup_size; - return static_cast((static_cast(priority - subgroup_size) + - (modulo - static_cast(offset % modulo))) % - modulo); + auto rank_dist = + subgroup_size + (priority - subgroup_size + modulo - (offset % modulo)) % modulo; + 
auto subgroup_dist = rank_dist / subgroup_size; + auto intra_subgroup_rank_dist = rank_dist % subgroup_size; + return ((root / subgroup_size + subgroup_dist) % (comm_size / subgroup_size)) * subgroup_size + + (root % subgroup_size + intra_subgroup_rank_dist) % subgroup_size; } } @@ -981,10 +961,9 @@ rmm::device_uvector compute_keep_flags( priorities.data(), priorities.size(), raft::comms::op_t::MIN, - root, stream_view); - rmm::device_uvector keep_flags(priorities.size()); + rmm::device_uvector keep_flags(priorities.size(), stream_view); auto offset_priority_pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); thrust::transform(rmm::exec_policy(stream_view), @@ -1022,10 +1001,14 @@ compute_offset_value_pairs(raft::comms::comms_t const& comm, if (comm_size <= std::numeric_limits::max()) { // priority == uint8_t keep_flags = compute_keep_flags( comm, value_first, value_last, root, subgroup_size, init, stream_view); - } else if (comm_size <= std::numeric_limits::max()) { // priority == uint16_t + } +#if 0 // FIXME: this should be enabled (currently, raft does not support allreduce on uint16_t). + else if (comm_size <= std::numeric_limits::max()) { // priority == uint16_t keep_flags = compute_keep_flags( comm, value_first, value_last, root, subgroup_size, init, stream_view); - } else { // priority_t == uint32_t + } +#endif + else { // priority_t == uint32_t keep_flags = compute_keep_flags( comm, value_first, value_last, root, subgroup_size, init, stream_view); } @@ -1041,7 +1024,7 @@ compute_offset_value_pairs(raft::comms::comms_t const& comm, offset_value_pair_first, offset_value_pair_first + keep_flags.size(), keep_flags.begin(), - thrust::make_zip_iterator(offsets.begin(), dataframe_buffer_begin(values)), + thrust::make_zip_iterator(offsets.begin(), get_dataframe_buffer_begin(values)), thrust::identity{}); return std::make_tuple(std::move(offsets), std::move(values)); @@ -1188,7 +1171,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto sorted_uniue_nzd_key_last = compute_key_lower_bound( sorted_unique_key_first, sorted_unique_key_last, - graph_view.local_vertex_partition_range_first() + ((*segment_offsets).rbegin() + 1), + graph_view.local_vertex_partition_range_first() + *((*segment_offsets).rbegin() + 1), handle.get_stream()); } } @@ -1231,11 +1214,13 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // 2. compute subgroup_size, set-up temporary buffers & stream pool, and initialize - [[maybe_unused]] std::conditional_t>, + [[maybe_unused]] std::conditional_t>, int, std::byte /* dummy */> subgroup_size{}; - if constexpr (update_major && std::is_same_v>) { + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); @@ -1320,7 +1305,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // peak memory requirement per loop is // update_major ? (use_input_key ? aggregate key list size : V) / comm_size * sizeof(T) : 0 // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - // FIXME: should we consider edge_partition_key_buffer as well? 
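// ---------------------------------------------------------------------------
// Editor's note (illustrative sketch, not part of this patch): with
// ReduceOp == reduce_op::any, the rank_to_priority / priority_to_rank /
// compute_keep_flags changes above implement a tie-break so that a single
// rank contributes each output element. Every rank maps itself to a priority
// (0 for the root, small values for ranks in the root's subgroup, larger
// values otherwise; the offset argument spreads the non-root priorities),
// the priorities are combined with an allreduce-MIN, and a rank keeps its
// value only where its own priority equals the minimum. The host-side
// stand-in below uses hypothetical names (my_priorities, min_priorities) and
// uint8_t priorities, i.e. the small-communicator case where an 8-bit
// priority suffices.
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<bool> compute_keep_flags_sketch(
  std::vector<uint8_t> const& my_priorities,   // this rank's priority per output element
  std::vector<uint8_t> const& min_priorities)  // allreduce-MIN of priorities over all ranks
{
  std::vector<bool> keep(my_priorities.size());
  for (std::size_t i = 0; i < keep.size(); ++i) {
    keep[i] = (my_priorities[i] == min_priorities[i]);  // this rank won the tie-break for i
  }
  return keep;
}
// ---------------------------------------------------------------------------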
size_t num_streams = std::min(static_cast(minor_comm_size) * max_segments, @@ -1444,6 +1428,22 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto edge_mask_view = graph_view.edge_mask_view(); + std::conditional_t>, + std::optional>>, + std::byte /* dummy */> + key_segment_offset_vectors{}; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + if (graph_view.local_edge_partition_segment_offsets(0)) { + key_segment_offset_vectors = + std::vector>(graph_view.number_of_local_edge_partitions()); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + assert(graph_view.local_edge_partition_segment_offsets(i)); + } + } + } + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { auto edge_partition = edge_partition_device_view_t( @@ -1455,18 +1455,25 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, *edge_mask_view, i) : thrust::nullopt; - auto major_init = ReduceOp::identity_element; + T major_init{}; + T major_identity_element{}; if constexpr (update_major) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - if constexpr (std::is_same_v>) { - major_init = init; // init is selected only when no edges return a valid value - } else { + if constexpr (std::is_same_v>) { // if any edge has a non-init value, one + // of the non-init values will be + // selected. + major_init = init; + major_identity_element = init; + } else { + major_init = ReduceOp::identity_element; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); major_init = (static_cast(i) == minor_comm_rank) ? init : ReduceOp::identity_element; + } else { + major_init = init; } - } else { - major_init = init; + major_identity_element = ReduceOp::identity_element; } } @@ -1481,62 +1488,69 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto edge_partition_key_buffer = allocate_optional_dataframe_buffer< std::conditional_t>(0, loop_stream); - std::conditional_t>, std::byte /* dummy */> - key_segment_offsets{}; - if constexpr (use_input_key) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - auto const minor_comm_rank = minor_comm.get_rank(); - - resize_optional_dataframe_buffer( - edge_partition_key_buffer, local_key_list_sizes[i], loop_stream); + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); - if constexpr (use_bitmap) { - std::variant, decltype(sorted_unique_key_first)> - v_list{}; - if (use_bitmap_flags[i]) { - v_list = raft::device_span((*key_list_bitmap).data(), - (*key_list_bitmap).size()); - } else { - v_list = sorted_unique_key_first; - } - auto bool_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) - : edge_partition.major_range_size(); - device_bcast_vertex_list(minor_comm, - v_list, - get_dataframe_buffer_begin(edge_partition_key_buffer), - edge_partition.major_range_first(), - edge_partition.major_range_first() + bool_size, - static_cast(thrust::distance( - sorted_unique_key_first, sorted_unique_nzd_key_last)), - static_cast(i), - loop_stream); + resize_optional_dataframe_buffer( + edge_partition_key_buffer, local_key_list_sizes[i], loop_stream); + + if constexpr (use_bitmap) { + std::variant, decltype(sorted_unique_key_first)> + v_list{}; + if (use_bitmap_flags[i]) { + v_list = (static_cast(i) == minor_comm_rank) + ? raft::device_span((*key_list_bitmap).data(), + (*key_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), + size_t{0}); } else { - device_bcast(minor_comm, - sorted_unique_key_first, - get_dataframe_buffer_begin(edge_partition_key_buffer), - local_key_list_sizes[i], - static_cast(i), - loop_stream); + v_list = sorted_unique_key_first; } - - edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffer); - edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffer); + auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : edge_partition.major_range_size(); + device_bcast_vertex_list(minor_comm, + v_list, + get_dataframe_buffer_begin(edge_partition_key_buffer), + edge_partition.major_range_first(), + edge_partition.major_range_first() + bool_size, + local_key_list_sizes[i], + static_cast(i), + loop_stream); + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffer), + local_key_list_sizes[i], + static_cast(i), + loop_stream); } + + edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffer); + edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffer); } - if (segment_offsets) { + } + + std::optional> key_segment_offsets{std::nullopt}; + if (segment_offsets) { + if constexpr (use_input_key) { key_segment_offsets = compute_key_segment_offsets( edge_partition_key_first, edge_partition_key_last, raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), edge_partition.major_range_first(), - graph_view.use_dcs(), loop_stream); } else { - key_segment_offsets = std::nullopt; + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); } + } else { + key_segment_offsets = std::nullopt; } RAFT_CUDA_TRY(cudaStreamSynchronize(loop_stream)); @@ -1577,20 +1591,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, decltype(edge_partition_key_first), decltype(thrust::make_counting_iterator(vertex_t{0}))>; - if (segment_offsets) { + if (key_segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - std::vector h_offsets{}; - if constexpr (use_input_key) { - h_offsets = (*key_segment_offsets); - } else { - h_offsets.resize((*segment_offsets).size()); - std::transform((*segment_offsets).begin(), - (*segment_offsets).end(), - h_offsets.begin(), - [](vertex_t offset) { return static_cast(offset); }); - } - // FIXME: we may further improve performance by 1) individually tuning block sizes for // different segments; and 2) adding one more segment for very high degree vertices and // running segmented reduction @@ -1603,25 +1606,25 @@ void 
per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit // every vertex in the hypersparse segment thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + h_offsets[3], - output_buffer + h_offsets[4], + output_buffer + (*key_segment_offsets)[3], + output_buffer + (*key_segment_offsets)[4], major_init); } auto segment_size = use_input_key - ? (h_offsets[4] - h_offsets[3]) + ? ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]) : static_cast(*(edge_partition.dcs_nzd_vertex_count())); if (segment_size > 0) { raft::grid_1d_thread_t update_grid(segment_size, detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += h_offsets[3]; } + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[3]; } auto segment_key_first = edge_partition_key_first; auto segment_key_last = edge_partition_key_last; if constexpr (use_input_key) { - segment_key_first += h_offsets[3]; - segment_key_last += h_offsets[4]; + segment_key_first += (*key_segment_offsets)[3]; + segment_key_last += (*key_segment_offsets)[4]; } else { assert(segment_key_first == nullptr); assert(segment_key_last == nullptr); @@ -1642,29 +1645,29 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, pred_op); } } - if (h_offsets[3] - h_offsets[2]) { + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]) { auto exec_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()) : handle.get_stream(); - raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += h_offsets[2]; } + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; } segment_key_iterator_t segment_key_first{}; if constexpr (use_input_key) { segment_key_first = edge_partition_key_first; } else { segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); } - segment_key_first += h_offsets[2]; - auto num_keys = h_offsets[3] - h_offsets[2]; + segment_key_first += (*key_segment_offsets)[2]; + auto num_keys = (*key_segment_offsets)[3] - (*key_segment_offsets)[2]; detail::per_v_transform_reduce_e_low_degree <<>>( edge_partition, segment_key_first, - segment_key_first + (h_offsets[3] - h_offsets[2]), + segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1675,28 +1678,28 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, reduce_op, pred_op); } - if (h_offsets[2] - h_offsets[1] > 0) { + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { auto exec_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()) : handle.get_stream(); - raft::grid_1d_warp_t update_grid(h_offsets[2] - h_offsets[1], + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += h_offsets[1]; } + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; } segment_key_iterator_t segment_key_first{}; if constexpr (use_input_key) { segment_key_first = edge_partition_key_first; } else { segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); } - segment_key_first += h_offsets[1]; + segment_key_first += (*key_segment_offsets)[1]; detail::per_v_transform_reduce_e_mid_degree <<>>( edge_partition, segment_key_first, - segment_key_first + (h_offsets[2] - h_offsets[1]), + segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1704,16 +1707,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, segment_output_buffer, e_op, major_init, - ReduceOp::identity_element, + major_identity_element, reduce_op, pred_op); } - if (h_offsets[1] > 0) { + if ((*key_segment_offsets)[1] > 0) { auto exec_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()) : handle.get_stream(); - raft::grid_1d_block_t update_grid(h_offsets[1], + raft::grid_1d_block_t update_grid((*key_segment_offsets)[1], detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); segment_key_iterator_t segment_key_first{}; @@ -1726,7 +1729,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, <<>>( edge_partition, segment_key_first, - segment_key_first + h_offsets[1], + segment_key_first + (*key_segment_offsets)[1], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1734,7 +1737,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, output_buffer, e_op, major_init, - ReduceOp::identity_element, + major_identity_element, reduce_op, pred_op); } @@ -1747,7 +1750,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, num_keys = static_cast(edge_partition.major_range_size()); } - if (edge_partition.major_range_size() > 0) { + if (num_keys > size_t{0}) { raft::grid_1d_thread_t update_grid(num_keys, detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); @@ -1779,38 +1782,40 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - if (segment_offsets && stream_pool_indices) { - if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[3]; - auto segment_size = (*segment_offsets)[4] - (*segment_offsets)[3]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = compute_offset_value_pairs( - minor_comm, - major_buffer_first + segment_offset, - major_buffer_first + (segment_offset + segment_size), - static_cast(i), - subgroup_size, - init, - segment_stream); - offset_vectors[i * max_segments + 
3] = std::move(offsets); - value_vectors[i * max_segments + 3] = std::move(values); - } else { - device_reduce(minor_comm, - major_buffer_first + segment_offset, - vertex_value_output_first + segment_offset, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - segment_stream); + if (key_segment_offsets && stream_pool_indices) { + if (edge_partition.dcs_nzd_vertex_count()) { + if ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments) % + (*stream_pool_indices).size()); + auto segment_offset = (*key_segment_offsets)[3]; + auto segment_size = (*key_segment_offsets)[4] - (*key_segment_offsets)[3]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = compute_offset_value_pairs( + minor_comm, + major_buffer_first + segment_offset, + major_buffer_first + (segment_offset + segment_size), + static_cast(i), + subgroup_size, + init, + segment_stream); + offset_vectors[i * max_segments + 3] = std::move(offsets); + value_vectors[i * max_segments + 3] = std::move(values); + } else { + device_reduce(minor_comm, + major_buffer_first + segment_offset, + vertex_value_output_first + segment_offset, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(i), + segment_stream); + } } } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[2]; - auto segment_size = (*segment_offsets)[3] - (*segment_offsets)[2]; + auto segment_offset = (*key_segment_offsets)[2]; + auto segment_size = (*key_segment_offsets)[3] - (*key_segment_offsets)[2]; if constexpr (std::is_same_v>) { auto [offsets, values] = compute_offset_value_pairs( minor_comm, @@ -1832,11 +1837,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, segment_stream); } } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[1]; - auto segment_size = (*segment_offsets)[2] - (*segment_offsets)[1]; + auto segment_offset = (*key_segment_offsets)[1]; + auto segment_size = (*key_segment_offsets)[2] - (*key_segment_offsets)[1]; if constexpr (std::is_same_v>) { auto [offsets, values] = compute_offset_value_pairs( minor_comm, @@ -1858,10 +1863,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, segment_stream); } } - if ((*segment_offsets)[1] > 0) { + if ((*key_segment_offsets)[1] > 0) { auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()); - auto segment_size = (*segment_offsets)[1]; + auto segment_size = (*key_segment_offsets)[1]; if constexpr (std::is_same_v>) { auto [offsets, values] = compute_offset_value_pairs(minor_comm, @@ -1884,9 +1889,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } } else { - size_t reduction_size = static_cast( - segment_offsets ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size()); + size_t reduction_size{}; + if constexpr (use_input_key) { + reduction_size = static_cast( + thrust::distance(edge_partition_key_first, edge_partition_key_last)); + } else { + reduction_size = static_cast( + segment_offsets + ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size()); + } if constexpr (std::is_same_v>) { auto [offsets, values] = compute_offset_value_pairs(minor_comm, @@ -1910,43 +1922,55 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + if (key_segment_offsets) { + (*key_segment_offset_vectors)[i] = std::move(*key_segment_offsets); + } + } + if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { handle.sync_stream_pool( - *stream_pool_indices); // to prevent buffer over-write (this can happen as - // *segment_offsets do not necessarily coincide in different edge - // partitions). + *stream_pool_indices); // to prevent buffer over-write (this can happen + // as *segment_offsets do not necessarily coincide + // in different edge partitions). } } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - // 4. communication - if constexpr (update_major && std::is_same_v>) { + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - - if (segment_offsets && stream_pool_indices) { - if ((*segment_offsets)[4] - (*segment_offsets)[3] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[3]; - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i * max_segments + 3]), - std::move(value_vectors[i * max_segments + 3]), - vertex_value_output_first + segment_offset, - static_cast(i), - segment_stream); + if (key_segment_offset_vectors && stream_pool_indices) { + auto key_segment_offsets = (*key_segment_offset_vectors)[i]; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + + if (edge_partition.dcs_nzd_vertex_count()) { + if (key_segment_offsets[4] - key_segment_offsets[3] > 0) { + auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments) % + (*stream_pool_indices).size()); + auto segment_offset = key_segment_offsets[3]; + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i * max_segments + 3]), + std::move(value_vectors[i * max_segments + 3]), + vertex_value_output_first + segment_offset, + static_cast(i), + segment_stream); + } } - if ((*segment_offsets)[3] - (*segment_offsets)[2] > 0) { + if (key_segment_offsets[3] - key_segment_offsets[2] > 0) { auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[2]; - gather_offset_value_pairs_and_update_vertex_value_output( + auto segment_offset = key_segment_offsets[2]; + gather_offset_value_pairs_and_update_vertex_value_output( minor_comm, 
std::move(offset_vectors[i * max_segments + 2]), std::move(value_vectors[i * max_segments + 2]), @@ -1954,11 +1978,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, static_cast(i), segment_stream); } - if ((*segment_offsets)[2] - (*segment_offsets)[1] > 0) { + if (key_segment_offsets[2] - key_segment_offsets[1] > 0) { auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % (*stream_pool_indices).size()); - auto segment_offset = (*segment_offsets)[1]; - gather_offset_value_pairs_and_update_vertex_value_output( + auto segment_offset = key_segment_offsets[1]; + gather_offset_value_pairs_and_update_vertex_value_output( minor_comm, std::move(offset_vectors[i * max_segments + 1]), std::move(value_vectors[i * max_segments + 1]), @@ -1966,10 +1990,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, static_cast(i), segment_stream); } - if ((*segment_offsets)[1] > 0) { + if (key_segment_offsets[1] > 0) { auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % (*stream_pool_indices).size()); - gather_offset_value_pairs_and_update_vertex_value_output( + gather_offset_value_pairs_and_update_vertex_value_output( minor_comm, std::move(offset_vectors[i * max_segments]), std::move(value_vectors[i * max_segments]), @@ -1978,16 +2002,19 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, segment_stream); } } else { - gather_offset_value_pairs_and_update_vertex_value_output(minor_comm, - std::move(offset_vectors[i]), - std::move(value_vectors[i]), - vertex_value_output_first, - static_cast(i), - handle.get_stream()); + gather_offset_value_pairs_and_update_vertex_value_output( + minor_comm, + std::move(offset_vectors[i]), + std::move(value_vectors[i]), + vertex_value_output_first, + static_cast(i), + handle.get_stream()); } } } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if constexpr (GraphViewType::is_multi_gpu && !update_major) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index b48ed775de3..2915328a15f 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -63,11 +63,7 @@ KeyIterator compute_key_lower_bound(KeyIterator sorted_unique_key_first, rmm::exec_policy(stream_view), sorted_unique_key_first, sorted_unique_key_last, v_threshold); } else { key_t k_threshold{}; - if constexpr (std::is_same_v) { - k_threshold = v_threshold; - } else { - thrust::get<0>(k_threshold) = v_threshold; - } + thrust::get<0>(k_threshold) = v_threshold; return thrust::lower_bound( rmm::exec_policy(stream_view), sorted_unique_key_first, @@ -82,16 +78,14 @@ std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, KeyIterator sorted_key_last, raft::host_span segment_offsets, vertex_t vertex_range_first, - bool use_dcs, rmm::cuda_stream_view stream_view) { using key_t = typename thrust::iterator_traits::value_type; - assert(segment_offsets.size() == 6 /* high, mid, low, hypersparse, zero + 1 */); - std::vector h_thresholds(use_dcs ? 
3 : 2); - h_thresholds[0] = vertex_range_first + segment_offsets[1]; // high, mid boundary - h_thresholds[1] = vertex_range_first + segment_offsets[2]; // mid, low boundary - if (use_dcs) { h_thresholds[2] = vertex_range_first + segment_offsets[3]; } // low, hypersparse boundary + std::vector h_thresholds(segment_offsets.size() - 2); + for (size_t i = 0; i < h_thresholds.size(); ++i) { + h_thresholds[i] = vertex_range_first + segment_offsets[i + 1]; + } rmm::device_uvector d_thresholds(h_thresholds.size(), stream_view); raft::update_device(d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), stream_view); @@ -115,10 +109,11 @@ std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, d_offsets.begin()); } - std::vector h_offsets(d_offsets.size()); - raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), stream_view); + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); - h_offsets.push_back(static_cast(thrust::distance(sorted_key_first, sorted_key_last))); + h_offsets[0] = size_t{0}; + h_offsets.back() = static_cast(thrust::distance(sorted_key_first, sorted_key_last)); return h_offsets; } @@ -195,7 +190,9 @@ void device_bcast_vertex_list( std::is_same_v::value_type, vertex_t>); if (v_list.index() == 0) { // bitmap - rmm::device_uvector tmp_bitmap(std::get<0>(v_list).size(), stream_view); + rmm::device_uvector tmp_bitmap( + packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + assert((comm.get_rank() != root) || (std::get<0>(v_list).size() == tmp_bitmap.size())); device_bcast( comm, std::get<0>(v_list).data(), tmp_bitmap.data(), tmp_bitmap.size(), root, stream_view); thrust::copy_if(rmm::exec_policy(stream_view), @@ -496,20 +493,6 @@ class key_bucket_t { } } - auto const begin() const - { - if constexpr (std::is_same_v) { - return vertices_.index() == 0 ? std::get<0>(vertices_).begin() - : std::get<1>(vertices_).begin(); - } else { - return vertices_.index() == 0 - ? thrust::make_zip_iterator( - thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) - : thrust::make_zip_iterator( - thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); - } - } - auto begin() { CUGRAPH_EXPECTS( @@ -523,12 +506,22 @@ class key_bucket_t { } } - auto const end() const + auto const cbegin() const { - return begin() + - (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); + if constexpr (std::is_same_v) { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() + : std::get<1>(vertices_).begin(); + } else { + return vertices_.index() == 0 + ? thrust::make_zip_iterator( + thrust::make_tuple(std::get<0>(vertices_).begin(), std::get<0>(tags_).begin())) + : thrust::make_zip_iterator( + thrust::make_tuple(std::get<1>(vertices_).begin(), std::get<1>(tags_).begin())); + } } + auto const begin() const { return cbegin(); } + auto end() { CUGRAPH_EXPECTS( @@ -537,15 +530,13 @@ class key_bucket_t { return begin() + std::get<0>(vertices_).size(); } - auto const vertex_begin() const + auto const cend() const { - return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + return begin() + + (vertices_.index() == 0 ? std::get<0>(vertices_).size() : std::get<1>(vertices_).size()); } - auto const vertex_end() const - { - return vertices_.index() == 0 ? 
std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); - } + auto const end() const { return cend(); } auto vertex_begin() { @@ -555,6 +546,13 @@ class key_bucket_t { return std::get<0>(vertices_).begin(); } + auto const vertex_cbegin() const + { + return vertices_.index() == 0 ? std::get<0>(vertices_).begin() : std::get<1>(vertices_).begin(); + } + + auto const vertex_begin() const { return vertex_cbegin(); } + auto vertex_end() { CUGRAPH_EXPECTS( @@ -563,6 +561,13 @@ class key_bucket_t { return std::get<0>(vertices_).end(); } + auto const vertex_cend() const + { + return vertices_.index() == 0 ? std::get<0>(vertices_).end() : std::get<1>(vertices_).end(); + } + + auto const vertex_end() const { return vertex_cend(); } + bool is_owning() { return (vertices_.index() == 0); } private: diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 67b890e5f8b..998335f4d03 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -18,6 +18,7 @@ #include "prims/fill_edge_src_dst_property.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -69,18 +70,25 @@ struct topdown_e_op_t { } }; -template +template struct bottomup_e_op_t { - detail::edge_partition_endpoint_property_device_view_t + __device__ vertex_t operator()( + vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const + { + return dst; + } +}; + +template +struct bottomup_pred_op_t { + detail::edge_partition_endpoint_property_device_view_t prev_visited_flags{}; // visited in the previous iterations vertex_t dst_first{}; - __device__ thrust::optional operator()( + __device__ bool operator()( vertex_t src, vertex_t dst, thrust::nullopt_t, thrust::nullopt_t, thrust::nullopt_t) const { - auto dst_offset = dst - dst_first; - auto old = prev_visited_flags.get(dst_offset); - return old ? thrust::optional{dst} : thrust::nullopt; + return prev_visited_flags.get(dst - dst_first); } }; @@ -270,6 +278,7 @@ void bfs(raft::handle_t const& handle, #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown0 = std::chrono::steady_clock::now(); + std::cout << "topdown0 " << std::endl; #endif topdown_e_op_t e_op{}; e_op.prev_visited_flags = @@ -431,25 +440,52 @@ void bfs(raft::handle_t const& handle, << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << "," << dur5.count() << ") s." 
<< std::endl; #endif - } else { // bottom up + } else { // bottom up #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto bottomup0 = std::chrono::steady_clock::now(); #endif - bottomup_e_op_t e_op{}; - e_op.prev_visited_flags = - detail::edge_partition_endpoint_property_device_view_t( - prev_dst_visited_flags.mutable_view()); - e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); - auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_src(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); + bottomup_e_op_t e_op{}; + bottomup_pred_op_t pred_op{}; + pred_op.prev_visited_flags = + detail::edge_partition_endpoint_property_device_view_t( + prev_dst_visited_flags.view()); + pred_op.dst_first = graph_view.local_edge_partition_dst_range_first(); + + rmm::device_uvector predecessor_buffer( + vertex_frontier.bucket(bucket_idx_cur).size(), handle.get_stream()); + per_v_transform_reduce_if_outgoing_e(handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + invalid_vertex, + reduce_op::any(), + pred_op, + predecessor_buffer.begin(), true); + + rmm::device_uvector new_frontier_vertex_buffer( + thrust::count_if(handle.get_thrust_policy(), + predecessor_buffer.begin(), + predecessor_buffer.end(), + detail::is_not_equal_t{invalid_vertex}), + handle.get_stream()); + { + rmm::device_uvector tmp_predecessor_buffer(new_frontier_vertex_buffer.size(), + handle.get_stream()); + auto pair_first = thrust::make_zip_iterator(vertex_frontier.bucket(bucket_idx_cur).cbegin(), + predecessor_buffer.begin()); + thrust::copy_if( + handle.get_thrust_policy(), + pair_first, + pair_first + vertex_frontier.bucket(bucket_idx_cur).size(), + thrust::make_zip_iterator(new_frontier_vertex_buffer.begin(), + tmp_predecessor_buffer.begin()), + cuda::proclaim_return_type([] __device__(auto pair) { return thrust::get<1>(pair) != invalid_vertex; })); + predecessor_buffer = std::move(tmp_predecessor_buffer); + } #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto bottomup1 = std::chrono::steady_clock::now(); From 4661b9b2d9d68183af14cf160be90f65c15ac392 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 13 Aug 2024 00:31:08 -0700 Subject: [PATCH 021/126] file rename --- ..._dst.cuh => transform_reduce_v_frontier_outgoing_e_by_dst.cuh} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename cpp/src/prims/{transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh => transform_reduce_v_frontier_outgoing_e_by_dst.cuh} (100%) diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh similarity index 100% rename from cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh rename to cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh From 0951741cacdb818a7a6d5fe5b27a765df067c6fa Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 13 Aug 2024 11:29:08 -0700 Subject: [PATCH 022/126] remove transform_reduce_v_frontier_outgoing_e_by_src (this can be better supported by per_v_transform_reduce_if_outgoing_e) --- .../betweenness_centrality_impl.cuh | 20 +- .../weakly_connected_components_impl.cuh | 39 +-- 
cpp/src/cores/core_number_impl.cuh | 19 +- .../detail/extract_transform_v_frontier_e.cuh | 277 +++++------------- .../prims/detail/per_v_transform_reduce_e.cuh | 1 + cpp/src/prims/extract_transform_e.cuh | 4 +- ...xtract_transform_v_frontier_outgoing_e.cuh | 16 +- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 178 +++-------- cpp/src/traversal/bfs_impl.cuh | 39 +-- cpp/src/traversal/k_hop_nbrs_impl.cuh | 20 +- .../traversal/od_shortest_distances_impl.cuh | 7 +- cpp/src/traversal/sssp_impl.cuh | 4 +- cpp/tests/CMakeLists.txt | 2 +- ...rm_reduce_v_frontier_outgoing_e_by_dst.cu} | 181 +----------- .../sampling/detail/nbr_sampling_validate.cu | 2 + 15 files changed, 210 insertions(+), 599 deletions(-) rename cpp/tests/prims/{mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu => mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu} (74%) diff --git a/cpp/src/centrality/betweenness_centrality_impl.cuh b/cpp/src/centrality/betweenness_centrality_impl.cuh index 8ae49ed207c..88ef3987a03 100644 --- a/cpp/src/centrality/betweenness_centrality_impl.cuh +++ b/cpp/src/centrality/betweenness_centrality_impl.cuh @@ -23,7 +23,7 @@ #include "prims/per_v_transform_reduce_incoming_outgoing_e.cuh" #include "prims/transform_e.cuh" #include "prims/transform_reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -133,15 +133,15 @@ std::tuple, rmm::device_uvector> brandes_b update_edge_src_property(handle, graph_view, sigmas.begin(), src_sigmas.mutable_view()); update_edge_dst_property(handle, graph_view, distances.begin(), dst_distances.mutable_view()); - auto [new_frontier, new_sigma] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - src_sigmas.view(), - dst_distances.view(), - cugraph::edge_dummy_property_t{}.view(), - brandes_e_op_t{}, - reduce_op::plus()); + auto [new_frontier, new_sigma] = cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + src_sigmas.view(), + dst_distances.view(), + cugraph::edge_dummy_property_t{}.view(), + brandes_e_op_t{}, + reduce_op::plus()); update_v_frontier(handle, graph_view, diff --git a/cpp/src/components/weakly_connected_components_impl.cuh b/cpp/src/components/weakly_connected_components_impl.cuh index 468f4f7280f..219bc3c4d1d 100644 --- a/cpp/src/components/weakly_connected_components_impl.cuh +++ b/cpp/src/components/weakly_connected_components_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -550,24 +550,25 @@ void weakly_connected_components_impl(raft::handle_t const& handle, auto old_num_edge_inserts = num_edge_inserts.value(handle.get_stream()); resize_dataframe_buffer(edge_buffer, old_num_edge_inserts + max_pushes, handle.get_stream()); - auto new_frontier_tagged_vertex_buffer = transform_reduce_v_frontier_outgoing_e_by_dst( - handle, - level_graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), 
- e_op_t{ - GraphViewType::is_multi_gpu - ? detail::edge_partition_endpoint_property_device_view_t( - edge_dst_components.mutable_view()) - : detail::edge_partition_endpoint_property_device_view_t( - detail::edge_minor_property_view_t(level_components, - vertex_t{0})), - level_graph_view.local_edge_partition_dst_range_first(), - get_dataframe_buffer_begin(edge_buffer), - num_edge_inserts.data()}, - reduce_op::null()); + auto new_frontier_tagged_vertex_buffer = + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + level_graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{ + GraphViewType::is_multi_gpu + ? detail::edge_partition_endpoint_property_device_view_t( + edge_dst_components.mutable_view()) + : detail::edge_partition_endpoint_property_device_view_t( + detail::edge_minor_property_view_t(level_components, + vertex_t{0})), + level_graph_view.local_edge_partition_dst_range_first(), + get_dataframe_buffer_begin(edge_buffer), + num_edge_inserts.data()}, + reduce_op::null()); update_v_frontier(handle, level_graph_view, diff --git a/cpp/src/cores/core_number_impl.cuh b/cpp/src/cores/core_number_impl.cuh index d807ccac5a5..a2b6f6430f0 100644 --- a/cpp/src/cores/core_number_impl.cuh +++ b/cpp/src/cores/core_number_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/reduce_v.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -222,14 +222,15 @@ void core_number(raft::handle_t const& handle, if (graph_view.is_symmetric() || ((degree_type == k_core_degree_type_t::IN) || (degree_type == k_core_degree_type_t::INOUT))) { auto [new_frontier_vertex_buffer, delta_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - dst_core_numbers.view(), - edge_dummy_property_t{}.view(), - e_op_t{k, delta}, - reduce_op::plus()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + dst_core_numbers.view(), + edge_dummy_property_t{}.view(), + e_op_t{k, delta}, + reduce_op::plus()); update_v_frontier( handle, diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 183e85765c5..8aaa91ca9e6 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -119,7 +119,6 @@ __device__ void warp_push_buffer_elements( } template buffer_idx(*buffer_idx_ptr); - int32_t constexpr shared_array_size = max_one_e_per_frontier_key - ? 
int32_t{1} /* dummy */ - : extract_transform_v_frontier_e_kernel_block_size; - __shared__ std::conditional_t - warp_local_degree_inclusive_sums[shared_array_size]; - __shared__ std::conditional_t - warp_key_local_edge_offsets[shared_array_size]; + __shared__ edge_t + warp_local_degree_inclusive_sums[extract_transform_v_frontier_e_kernel_block_size]; + __shared__ edge_t warp_key_local_edge_offsets[extract_transform_v_frontier_e_kernel_block_size]; using WarpScan = cub::WarpScan; - __shared__ std:: - conditional_t - temp_storage; + __shared__ typename WarpScan::TempStorage temp_storage; auto indices = edge_partition.indices(); @@ -217,98 +210,74 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } - if constexpr (max_one_e_per_frontier_key) { - // each thread processes one frontier key, exits if any edge returns a valid output + auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive + auto max_key_idx = + static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), + static_cast(num_keys))); // exclusive - e_op_result_t e_op_result{thrust::nullopt}; - auto key = *(key_first + idx); + // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - if (edge_partition_e_mask) { - for (edge_t i = 0; i < local_degree; ++i) { - if ((*edge_partition_e_mask).get(edge_offset + i)) { - e_op_result = call_e_op(key, edge_offset + i); - if (e_op_result) { break; } + warp_key_local_edge_offsets[threadIdx.x] = edge_offset; + WarpScan(temp_storage) + .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); + __syncwarp(); + + // all the threads in a warp collectively process local edges for the keys in [key_first + + // min_key_idx, key_first + max_key_idx) + + auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + + (max_key_idx - min_key_idx) - 1]; + auto rounded_up_num_edges_this_warp = + ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + + auto this_warp_inclusive_sum_first = + warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); + auto this_warp_inclusive_sum_last = this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); + + if (edge_partition_e_mask) { + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; + + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? 
edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + if ((*edge_partition_e_mask).get(local_edge_offset)) { + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); } } - } else { - for (edge_t i = 0; i < local_degree; ++i) { - e_op_result = call_e_op(key, edge_offset + i); - if (e_op_result) { break; } - } + + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } else { - auto min_key_idx = static_cast(idx - (idx % raft::warp_size())); // inclusive - auto max_key_idx = - static_cast(std::min(static_cast(min_key_idx) + raft::warp_size(), - static_cast(num_keys))); // exclusive - - // update warp_local_degree_inclusive_sums & warp_key_local_edge_offsets - - warp_key_local_edge_offsets[threadIdx.x] = edge_offset; - WarpScan(temp_storage) - .InclusiveSum(local_degree, warp_local_degree_inclusive_sums[threadIdx.x]); - __syncwarp(); - - // all the threads in a warp collectively process local edges for the keys in [key_first + - // min_key_idx, key_first + max_key_idx) - - auto num_edges_this_warp = warp_local_degree_inclusive_sums[warp_id * raft::warp_size() + - (max_key_idx - min_key_idx) - 1]; - auto rounded_up_num_edges_this_warp = - ((static_cast(num_edges_this_warp) + (raft::warp_size() - 1)) / raft::warp_size()) * - raft::warp_size(); - - auto this_warp_inclusive_sum_first = - warp_local_degree_inclusive_sums + warp_id * raft::warp_size(); - auto this_warp_inclusive_sum_last = - this_warp_inclusive_sum_first + (max_key_idx - min_key_idx); - - if (edge_partition_e_mask) { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); - if ((*edge_partition_e_mask).get(local_edge_offset)) { - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - e_op_result = call_e_op(key, local_edge_offset); - } - } + for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { + e_op_result_t e_op_result{thrust::nullopt}; - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + if (i < static_cast(num_edges_this_warp)) { + auto key_idx_this_warp = static_cast(thrust::distance( + this_warp_inclusive_sum_first, + thrust::upper_bound( + thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); + auto local_edge_offset = + warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + + static_cast(i - ((key_idx_this_warp == 0) ? 
edge_t{0} + : *(this_warp_inclusive_sum_first + + (key_idx_this_warp - 1)))); + auto key = *(key_first + (min_key_idx + key_idx_this_warp)); + e_op_result = call_e_op(key, local_edge_offset); } - } else { - for (size_t i = lane_id; i < rounded_up_num_edges_this_warp; i += raft::warp_size()) { - e_op_result_t e_op_result{thrust::nullopt}; - - if (i < static_cast(num_edges_this_warp)) { - auto key_idx_this_warp = static_cast(thrust::distance( - this_warp_inclusive_sum_first, - thrust::upper_bound( - thrust::seq, this_warp_inclusive_sum_first, this_warp_inclusive_sum_last, i))); - auto local_edge_offset = - warp_key_local_edge_offsets[warp_id * raft::warp_size() + key_idx_this_warp] + - static_cast(i - ((key_idx_this_warp == 0) ? edge_t{0} - : *(this_warp_inclusive_sum_first + - (key_idx_this_warp - 1)))); - auto key = *(key_first + (min_key_idx + key_idx_this_warp)); - e_op_result = call_e_op(key, local_edge_offset); - } - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -316,8 +285,7 @@ __global__ static void extract_transform_v_frontier_e_hypersparse_or_low_degree( } } -template buffer_idx(*buffer_idx_ptr); - using WarpReduce = cub::WarpReduce; - __shared__ std::conditional_t - temp_storage[max_one_e_per_frontier_key - ? (extract_transform_v_frontier_e_kernel_block_size / raft::warp_size()) - : int32_t{1} /* dummy */]; - while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); auto major = thrust_tuple_get_or_identity(key); @@ -404,42 +364,16 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( e_op_result = call_e_op(i); } - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } else { for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { e_op_result_t e_op_result{thrust::nullopt}; if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_lane_id = - WarpReduce(temp_storage[threadIdx.x / raft::warp_size()]) - .Reduce(e_op_result ? 
lane_id : raft::warp_size(), cub::Min()); - first_valid_lane_id = __shfl_sync(raft::warp_full_mask(), first_valid_lane_id, int{0}); - if (lane_id == first_valid_lane_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (first_valid_lane_id != raft::warp_size()) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -447,8 +381,7 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( } } -template buffer_idx(*buffer_idx_ptr); - using BlockReduce = cub::BlockReduce; - __shared__ std::conditional_t - temp_storage; - __shared__ int32_t output_thread_id; - while (idx < static_cast(thrust::distance(key_first, key_last))) { auto key = *(key_first + idx); auto major = thrust_tuple_get_or_identity(key); @@ -533,46 +459,16 @@ __global__ static void extract_transform_v_frontier_e_high_degree( e_op_result = call_e_op(i); } - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } else { for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { e_op_result_t e_op_result{thrust::nullopt}; if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - if constexpr (max_one_e_per_frontier_key) { - auto first_valid_thread_id = - BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : extract_transform_v_frontier_e_kernel_block_size, - cub::Min()); - if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } - __syncthreads(); - if (threadIdx.x == output_thread_id) { - auto push_buffer_idx = buffer_idx.fetch_add(1, cuda::std::memory_order_relaxed); - push_buffer_element( - buffer_key_output_first, buffer_value_output_first, push_buffer_idx, e_op_result); - } - if (output_thread_id != extract_transform_v_frontier_e_kernel_block_size) { break; } - } else { - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } @@ -584,10 +480,6 @@ __global__ static void extract_transform_v_frontier_e_high_degree( template ( edge_partition_frontier_key_last); - auto max_pushes = max_one_e_per_frontier_key ? 
local_frontier_sizes[i] - : edge_partition.compute_number_of_edges( - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - handle.get_stream()); + auto max_pushes = edge_partition.compute_number_of_edges( + edge_partition_frontier_major_first, edge_partition_frontier_major_last, handle.get_stream()); auto tmp_key_buffer = allocate_optional_dataframe_buffer(max_pushes, handle.get_stream()); @@ -883,7 +772,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_block_t update_grid(h_offsets[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_high_degree + extract_transform_v_frontier_e_high_degree <<>>( edge_partition, edge_partition_frontier_key_first, @@ -901,7 +790,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_warp_t update_grid(h_offsets[2] - h_offsets[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_mid_degree + extract_transform_v_frontier_e_mid_degree <<>>( edge_partition, edge_partition_frontier_key_first + h_offsets[1], @@ -919,9 +808,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree + extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, edge_partition_frontier_key_first + h_offsets[2], @@ -939,9 +826,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::grid_1d_thread_t update_grid(h_offsets[4] - h_offsets[3], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree + extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, edge_partition_frontier_key_first + h_offsets[3], @@ -961,9 +846,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree + extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, edge_partition_frontier_key_first, diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index bbbf8e54a44..0ba20f6f1ce 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1305,6 +1305,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // peak memory requirement per loop is // update_major ? (use_input_key ? aggregate key list size : V) / comm_size * sizeof(T) : 0 // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) + // FIXME: should we consider edge_partition_key_buffer as well? 
size_t num_streams = std::min(static_cast(minor_comm_size) * max_segments, diff --git a/cpp/src/prims/extract_transform_e.cuh b/cpp/src/prims/extract_transform_e.cuh index d51e03628e1..5741c98d90e 100644 --- a/cpp/src/prims/extract_transform_e.cuh +++ b/cpp/src/prims/extract_transform_e.cuh @@ -116,8 +116,8 @@ extract_transform_e(raft::handle_t const& handle, thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last())); auto value_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(std::ignore, value_buffer) = detail:: - extract_transform_v_frontier_e( + std::tie(std::ignore, value_buffer) = + detail::extract_transform_v_frontier_e( handle, graph_view, frontier, diff --git a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh index 413f46aeb57..ba227b263bc 100644 --- a/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh +++ b/cpp/src/prims/extract_transform_v_frontier_outgoing_e.cuh @@ -100,14 +100,14 @@ extract_transform_v_frontier_outgoing_e(raft::handle_t const& handle, auto value_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); std::tie(std::ignore, value_buffer) = - detail::extract_transform_v_frontier_e(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - do_expensive_check); + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + do_expensive_check); return value_buffer; } diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 5a87f8c8f33..c85b8ceae1a 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -74,8 +74,7 @@ namespace detail { int32_t constexpr update_v_frontier_from_outgoing_e_kernel_block_size = 512; -template (key) : dst; + auto reduce_by = dst; if constexpr (std::is_same_v && std::is_same_v) { return reduce_by; } else if constexpr (std::is_same_v && !std::is_same_v) { @@ -182,8 +181,7 @@ auto sort_and_reduce_buffer_elements( #define TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT 1 #endif -template ( 0, rmm::cuda_stream_view{}))>, decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, - GraphViewType const& graph_view, - KeyBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) +transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, + GraphViewType const& graph_view, + KeyBucketType const& frontier, + EdgeSrcValueInputWrapper edge_src_value_input, + EdgeDstValueInputWrapper edge_dst_value_input, + EdgeValueInputWrapper edge_value_input, + EdgeOp e_op, + ReduceOp reduce_op, + bool do_expensive_check = false) { static_assert(!GraphViewType::is_storage_transposed, "GraphViewType should support the push model."); @@ -225,8 +223,7 @@ transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time0 = std::chrono::steady_clock::now(); #endif - detail::transform_reduce_v_frontier_call_e_op_t e_op_wrapper{e_op}; - bool constexpr 
max_one_e_per_frontier_key = - reduce_by_src && std::is_same_v>; auto [key_buffer, payload_buffer] = - detail::extract_transform_v_frontier_e( - handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op_wrapper, - do_expensive_check); + detail::extract_transform_v_frontier_e(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op_wrapper, + do_expensive_check); #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time1 = std::chrono::steady_clock::now(); @@ -271,17 +265,11 @@ transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - std::vector h_vertex_lasts(reduce_by_src ? minor_comm_size : major_comm_size); + std::vector h_vertex_lasts(major_comm_size); for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { auto vertex_partition_id = - reduce_by_src - ? detail::compute_local_edge_partition_major_range_vertex_partition_id_t{major_comm_size, - minor_comm_size, - major_comm_rank, - minor_comm_rank}( - i) - : detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); h_vertex_lasts[i] = graph_view.vertex_partition_range_last(vertex_partition_id); } @@ -310,19 +298,14 @@ transform_reduce_v_frontier_outgoing_e_by_src_dst(raft::handle_t const& handle, h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, std::ignore) = shuffle_values(reduce_by_src ? minor_comm : major_comm, - get_dataframe_buffer_begin(key_buffer), - tx_counts, - handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); key_buffer = std::move(rx_key_buffer); if constexpr (!std::is_same_v) { auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_payload_buffer, std::ignore) = - shuffle_values(reduce_by_src ? minor_comm : major_comm, - get_dataframe_buffer_begin(payload_buffer), - tx_counts, - handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); payload_buffer = std::move(rx_payload_buffer); } @@ -430,93 +413,6 @@ size_t compute_num_out_nbrs_from_frontier(raft::handle_t const& handle, return ret; } -/** - * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor - * outputs by (tagged-)source ID. - * - * Edge functor outputs are thrust::optional objects and invalid if thrust::nullopt. Vertices are - * assumed to be tagged if KeyBucketType::key_type is a tuple of a vertex type and a tag - * type (KeyBucketType::key_type is identical to a vertex type otherwise). - * - * @tparam GraphViewType Type of the passed non-owning graph object. - * @tparam KeyBucketType Type of the vertex frontier bucket class which abstracts the - * current (tagged-)vertex frontier. - * @tparam EdgeSrcValueInputWrapper Type of the wrapper for edge source property values. 
- * @tparam EdgeDstValueInputWrapper Type of the wrapper for edge destination property values. - * @tparam EdgeValueInputWrapper Type of the wrapper for edge property values. - * @tparam EdgeOp Type of the quinary edge operator. - * @tparam ReduceOp Type of the binary reduction operator. - * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and - * handles to various CUDA libraries) to run graph algorithms. - * @param graph_view Non-owning graph object. - * @param frontier KeyBucketType class object for the current vertex frontier. - * @param edge_src_value_input Wrapper used to access source input property values (for the edge - * sources assigned to this process in multi-GPU). Use either cugraph::edge_src_property_t::view() - * (if @p e_op needs to access source property values) or cugraph::edge_src_dummy_property_t::view() - * (if @p e_op does not access source property values). Use update_edge_src_property to fill the - * wrapper. - * @param edge_dst_value_input Wrapper used to access destination input property values (for the - * edge destinations assigned to this process in multi-GPU). Use either - * cugraph::edge_dst_property_t::view() (if @p e_op needs to access destination property values) or - * cugraph::edge_dst_dummy_property_t::view() (if @p e_op does not access destination property - * values). Use update_edge_dst_property to fill the wrapper. - * @param edge_value_input Wrapper used to access edge input property values (for the edges assigned - * to this process in multi-GPU). Use either cugraph::edge_property_t::view() (if @p e_op needs to - * access edge property values) or cugraph::edge_dummy_property_t::view() (if @p e_op does not - * access edge property values). - * @param e_op Quinary operator takes edge (tagged-)source, edge destination, property values for - * the source, destination, and edge and returns 1) thrust::nullopt (if invalid and to be - * discarded); 2) dummy (but valid) thrust::optional object (e.g. - * thrust::optional{std::byte{0}}, if vertices are not tagged and ReduceOp::value_type is - * void); 3) a tag (if vertices are tagged and ReduceOp::value_type is void); 4) a value to be - * reduced using the @p reduce_op (if vertices are not tagged and ReduceOp::value_type is not void); - * or 5) a tuple of a tag and a value to be reduced (if vertices are tagged and ReduceOp::value_type - * is not void). - * @param reduce_op Binary operator that takes two input arguments and reduce the two values to one. - * There are pre-defined reduction operators in prims/reduce_op.cuh. It is - * recommended to use the pre-defined reduction operators whenever possible as the current (and - * future) implementations of graph primitives may check whether @p ReduceOp is a known type (or has - * known member variables) to take a more optimized code path. See the documentation in the - * reduce_op.cuh file for instructions on writing custom reduction operators. - * @return Tuple of key values and payload values (if ReduceOp::value_type is not void) or just key - * values (if ReduceOp::value_type is void). Keys in the return values are sorted in ascending order - * using a vertex ID as the primary key and a tag (if relevant) as the secondary key. 
- */ -template -std::conditional_t< - !std::is_same_v, - std::tuple( - 0, rmm::cuda_stream_view{})), - decltype(detail::allocate_optional_dataframe_buffer( - 0, rmm::cuda_stream_view{}))>, - decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))> -transform_reduce_v_frontier_outgoing_e_by_src(raft::handle_t const& handle, - GraphViewType const& graph_view, - KeyBucketType const& frontier, - EdgeSrcValueInputWrapper edge_src_value_input, - EdgeDstValueInputWrapper edge_dst_value_input, - EdgeValueInputWrapper edge_value_input, - EdgeOp e_op, - ReduceOp reduce_op, - bool do_expensive_check = false) -{ - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); -} - /** * @brief Iterate over outgoing edges from the current vertex frontier and reduce valid edge functor * outputs by (tagged-)destination ID. @@ -593,15 +489,15 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, ReduceOp reduce_op, bool do_expensive_check = false) { - return detail::transform_reduce_v_frontier_outgoing_e_by_src_dst(handle, - graph_view, - frontier, - edge_src_value_input, - edge_dst_value_input, - edge_value_input, - e_op, - reduce_op, - do_expensive_check); + return detail::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + graph_view, + frontier, + edge_src_value_input, + edge_dst_value_input, + edge_value_input, + e_op, + reduce_op, + do_expensive_check); } } // namespace cugraph diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 998335f4d03..a3cb78b037a 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -16,9 +16,9 @@ #pragma once #include "prims/fill_edge_src_dst_property.cuh" -#include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" #include "prims/per_v_transform_reduce_if_incoming_outgoing_e.cuh" +#include "prims/reduce_op.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -290,14 +290,15 @@ void bfs(raft::handle_t const& handle, e_op.dst_first = graph_view.local_edge_partition_dst_range_first(); auto [new_frontier_vertex_buffer, predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - reduce_op::any()); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( + handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + reduce_op::any()); #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown1 = std::chrono::steady_clock::now(); @@ -464,7 +465,8 @@ void bfs(raft::handle_t const& handle, invalid_vertex, reduce_op::any(), pred_op, - predecessor_buffer.begin(), true); + predecessor_buffer.begin(), + true); rmm::device_uvector new_frontier_vertex_buffer( thrust::count_if(handle.get_thrust_policy(), @@ -477,13 +479,14 @@ void bfs(raft::handle_t const& handle, handle.get_stream()); auto pair_first = thrust::make_zip_iterator(vertex_frontier.bucket(bucket_idx_cur).cbegin(), predecessor_buffer.begin()); - thrust::copy_if( - handle.get_thrust_policy(), 
- pair_first, - pair_first + vertex_frontier.bucket(bucket_idx_cur).size(), - thrust::make_zip_iterator(new_frontier_vertex_buffer.begin(), - tmp_predecessor_buffer.begin()), - cuda::proclaim_return_type([] __device__(auto pair) { return thrust::get<1>(pair) != invalid_vertex; })); + thrust::copy_if(handle.get_thrust_policy(), + pair_first, + pair_first + vertex_frontier.bucket(bucket_idx_cur).size(), + thrust::make_zip_iterator(new_frontier_vertex_buffer.begin(), + tmp_predecessor_buffer.begin()), + cuda::proclaim_return_type([] __device__(auto pair) { + return thrust::get<1>(pair) != invalid_vertex; + })); predecessor_buffer = std::move(tmp_predecessor_buffer); } #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete diff --git a/cpp/src/traversal/k_hop_nbrs_impl.cuh b/cpp/src/traversal/k_hop_nbrs_impl.cuh index acf3cfe8fc5..44fa21a5252 100644 --- a/cpp/src/traversal/k_hop_nbrs_impl.cuh +++ b/cpp/src/traversal/k_hop_nbrs_impl.cuh @@ -16,7 +16,7 @@ #pragma once #include "prims/reduce_op.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include @@ -147,15 +147,15 @@ k_hop_nbrs(raft::handle_t const& handle, rmm::device_uvector nbrs(0, handle.get_stream()); for (size_t iter = 0; iter < k; ++iter) { auto new_frontier_key_buffer = - transform_reduce_v_frontier_outgoing_e_by_dst(handle, - push_graph_view, - frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op_t{}, - reduce_op::null{}, - do_expensive_check); + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst(handle, + push_graph_view, + frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op_t{}, + reduce_op::null{}, + do_expensive_check); if (iter < (k - 1)) { frontier.bucket(bucket_idx_cur).clear(); frontier.bucket(bucket_idx_cur) diff --git a/cpp/src/traversal/od_shortest_distances_impl.cuh b/cpp/src/traversal/od_shortest_distances_impl.cuh index e1b7444b92f..594f3b933e5 100644 --- a/cpp/src/traversal/od_shortest_distances_impl.cuh +++ b/cpp/src/traversal/od_shortest_distances_impl.cuh @@ -22,7 +22,7 @@ #include "prims/kv_store.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -641,7 +641,6 @@ rmm::device_uvector od_shortest_distances( cutoff, invalid_distance}; detail::transform_reduce_v_frontier_call_e_op_t< - false, thrust::tuple, weight_t, vertex_t, @@ -653,8 +652,8 @@ rmm::device_uvector od_shortest_distances( auto new_frontier_tagged_vertex_buffer = allocate_dataframe_buffer>(0, handle.get_stream()); - std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = detail:: - extract_transform_v_frontier_e, weight_t>( + std::tie(new_frontier_tagged_vertex_buffer, distance_buffer) = + detail::extract_transform_v_frontier_e, weight_t>( handle, graph_view, vertex_frontier.bucket(bucket_idx_near), diff --git a/cpp/src/traversal/sssp_impl.cuh b/cpp/src/traversal/sssp_impl.cuh index 47908524feb..3429672b151 100644 --- a/cpp/src/traversal/sssp_impl.cuh +++ b/cpp/src/traversal/sssp_impl.cuh @@ -19,7 +19,7 @@ #include 
"prims/fill_edge_src_dst_property.cuh" #include "prims/reduce_op.cuh" #include "prims/transform_reduce_e.cuh" -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/update_edge_src_dst_property.cuh" #include "prims/update_v_frontier.cuh" #include "prims/vertex_frontier.cuh" @@ -197,7 +197,7 @@ void sssp(raft::handle_t const& handle, push_graph_view.local_vertex_partition_view()); auto [new_frontier_vertex_buffer, distance_predecessor_buffer] = - transform_reduce_v_frontier_outgoing_e_by_dst( + cugraph::transform_reduce_v_frontier_outgoing_e_by_dst( handle, push_graph_view, vertex_frontier.bucket(bucket_idx_cur_near), diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 52d257b9bea..5a47173abcb 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -667,7 +667,7 @@ if(BUILD_CUGRAPH_MG_TESTS) ############################################################################################### # - MG PRIMS TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST tests -------------------------- ConfigureTestMG(MG_TRANSFORM_REDUCE_V_FRONTIER_OUTGOING_E_BY_SRC_DST_TEST - prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu) + prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu) ############################################################################################### # - MG PRIMS REDUCE_V tests ------------------------------------------------------------------- diff --git a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu similarity index 74% rename from cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu rename to cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu index 5947dd9a560..51c536bb97f 100644 --- a/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_src_dst.cu +++ b/cpp/tests/prims/mg_transform_reduce_v_frontier_outgoing_e_by_dst.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include "prims/transform_reduce_v_frontier_outgoing_e_by_src_dst.cuh" +#include "prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh" #include "prims/vertex_frontier.cuh" #include "utilities/base_fixture.hpp" #include "utilities/conversion_utilities.hpp" @@ -203,48 +203,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (cugraph::test::g_perf) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); - } - - auto mg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto mg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - - if constexpr (std::is_same_v) { - mg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(mg_reduce_by_src_new_frontier_key_buffer, mg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - mg_graph_view, - mg_vertex_frontier.bucket(bucket_idx_cur), - mg_src_prop.view(), - mg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.stop(); - hr_timer.display_and_clear(std::cout); - } - if (cugraph::test::g_perf) { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement - handle_->get_comms().barrier(); - hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_src"); + hr_timer.start("MG transform_reduce_v_frontier_outgoing_e_by_dst"); } auto mg_reduce_by_dst_new_frontier_key_buffer = @@ -286,56 +245,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst // 3. 
compare SG & MG results if (prims_usecase.check_correctness) { - if constexpr (std::is_same_v) { - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_src_new_frontier_key_buffer.begin(), - mg_reduce_by_src_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - mg_reduce_by_dst_new_frontier_key_buffer.begin(), - mg_reduce_by_dst_new_frontier_key_buffer.size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } else { - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - - cugraph::unrenumber_int_vertices( - *handle_, - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).begin(), - std::get<0>(mg_reduce_by_dst_new_frontier_key_buffer).size(), - (*mg_renumber_map).data(), - mg_graph_view.vertex_partition_range_lasts()); - } - - auto mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - mg_reduce_by_src_aggregate_new_frontier_key_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_new_frontier_key_buffer.data(), - mg_reduce_by_src_new_frontier_key_buffer.size()); - } else { - std::get<0>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<0>(mg_reduce_by_src_new_frontier_key_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_new_frontier_key_buffer) = - cugraph::test::device_gatherv( - *handle_, - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).data(), - std::get<1>(mg_reduce_by_src_new_frontier_key_buffer).size()); - } - auto mg_reduce_by_dst_aggregate_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); if constexpr (std::is_same_v) { @@ -356,26 +265,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst std::get<1>(mg_reduce_by_dst_new_frontier_key_buffer).size()); } - [[maybe_unused]] auto mg_reduce_by_src_aggregate_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (!std::is_same_v) { - if constexpr (std::is_arithmetic_v) { - mg_reduce_by_src_aggregate_payload_buffer = - cugraph::test::device_gatherv(*handle_, - mg_reduce_by_src_payload_buffer.data(), - mg_reduce_by_src_payload_buffer.size()); - } else { - std::get<0>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<0>(mg_reduce_by_src_payload_buffer).data(), - std::get<0>(mg_reduce_by_src_payload_buffer).size()); - std::get<1>(mg_reduce_by_src_aggregate_payload_buffer) = - cugraph::test::device_gatherv(*handle_, - std::get<1>(mg_reduce_by_src_payload_buffer).data(), - std::get<1>(mg_reduce_by_src_payload_buffer).size()); - } - } - [[maybe_unused]] auto mg_reduce_by_dst_aggregate_payload_buffer = cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); if constexpr (!std::is_same_v) { @@ -409,22 +298,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if (handle_->get_comms().get_rank() == int{0}) { if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - 
cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(mg_reduce_by_dst_aggregate_new_frontier_key_buffer)); } else { - thrust::sort_by_key( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(mg_reduce_by_dst_aggregate_new_frontier_key_buffer), @@ -471,34 +349,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst .insert(cugraph::get_dataframe_buffer_begin(sg_key_buffer), cugraph::get_dataframe_buffer_end(sg_key_buffer)); - auto sg_reduce_by_src_new_frontier_key_buffer = - cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); - [[maybe_unused]] auto sg_reduce_by_src_payload_buffer = - cugraph::detail::allocate_optional_dataframe_buffer(0, handle_->get_stream()); - if constexpr (std::is_same_v) { - sg_reduce_by_src_new_frontier_key_buffer = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::null{}); - } else { - std::tie(sg_reduce_by_src_new_frontier_key_buffer, sg_reduce_by_src_payload_buffer) = - cugraph::transform_reduce_v_frontier_outgoing_e_by_src( - *handle_, - sg_graph_view, - sg_vertex_frontier.bucket(bucket_idx_cur), - sg_src_prop.view(), - sg_dst_prop.view(), - cugraph::edge_dummy_property_t{}.view(), - e_op_t{}, - cugraph::reduce_op::plus{}); - } - auto sg_reduce_by_dst_new_frontier_key_buffer = cugraph::allocate_dataframe_buffer(0, handle_->get_stream()); [[maybe_unused]] auto sg_reduce_by_dst_payload_buffer = @@ -528,22 +378,11 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst } if constexpr (std::is_same_v) { - thrust::sort( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer)); - thrust::sort( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer)); } else { - thrust::sort_by_key( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer)); - thrust::sort_by_key( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), @@ -551,14 +390,7 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer)); } - bool key_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_new_frontier_key_buffer), - cugraph::get_dataframe_buffer_end(sg_reduce_by_src_new_frontier_key_buffer), - 
cugraph::get_dataframe_buffer_begin(mg_reduce_by_src_aggregate_new_frontier_key_buffer)); - ASSERT_TRUE(key_passed); - - key_passed = thrust::equal( + auto key_passed = thrust::equal( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_new_frontier_key_buffer), cugraph::get_dataframe_buffer_end(sg_reduce_by_dst_new_frontier_key_buffer), @@ -567,13 +399,6 @@ class Tests_MGTransformReduceVFrontierOutgoingEBySrcDst if constexpr (!std::is_same_v) { bool payload_passed = thrust::equal( - handle_->get_thrust_policy(), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_begin(sg_reduce_by_src_payload_buffer), - cugraph::get_dataframe_buffer_end(mg_reduce_by_src_aggregate_payload_buffer)); - ASSERT_TRUE(payload_passed); - - payload_passed = thrust::equal( handle_->get_thrust_policy(), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), cugraph::get_dataframe_buffer_begin(sg_reduce_by_dst_payload_buffer), diff --git a/cpp/tests/sampling/detail/nbr_sampling_validate.cu b/cpp/tests/sampling/detail/nbr_sampling_validate.cu index 61731e2e15c..00d572ee8d2 100644 --- a/cpp/tests/sampling/detail/nbr_sampling_validate.cu +++ b/cpp/tests/sampling/detail/nbr_sampling_validate.cu @@ -75,6 +75,8 @@ struct ArithmeticZipLess { } else { return thrust::get<1>(left) < thrust::get<1>(right); } + } else { + return left < right; } } }; From 7b98e3a5c5af8ab176cb8327043b3b098a49b9a9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 15 Aug 2024 09:51:07 -0700 Subject: [PATCH 023/126] code cleanup, add few FIXMEs to improve performance, and add performance measurement code --- .../detail/extract_transform_v_frontier_e.cuh | 44 ++- .../prims/detail/per_v_transform_reduce_e.cuh | 310 +++++++++++------- cpp/src/prims/fill_edge_src_dst_property.cuh | 1 + cpp/src/prims/vertex_frontier.cuh | 62 ++-- cpp/src/traversal/bfs_impl.cuh | 139 ++++---- 5 files changed, 310 insertions(+), 246 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 8aaa91ca9e6..1fe81198eb8 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -551,7 +551,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust::optional, thrust::optional>>>); - constexpr bool use_bitmap = GraphViewType::is_multi_gpu && std::is_same_v && + constexpr bool try_bitmap = GraphViewType::is_multi_gpu && std::is_same_v && KeyBucketType::is_sorted_unique; if (do_expensive_check) { @@ -624,24 +624,32 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, // update frontier bitmap (used to reduce broadcast bandwidth size) std:: - conditional_t>, std::byte /* dummy */> + conditional_t>, std::byte /* dummy */> frontier_bitmap{}; - std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; - if constexpr (use_bitmap) { + std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; + if constexpr (try_bitmap) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto segment_offsets = - graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); - size_t bool_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) - : graph_view.local_vertex_partition_range_size(); - - std::tie(frontier_bitmap, use_bitmap_flags) = - compute_vertex_list_bitmap_info(minor_comm, - frontier_key_first, - frontier_key_last, - graph_view.local_vertex_partition_range_first(), - graph_view.local_vertex_partition_range_first() + bool_size, - handle.get_stream()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + auto segment_offsets = + graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); + size_t bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(); + + frontier_bitmap = + compute_vertex_list_bitmap_info(frontier_key_first, + frontier_key_last, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_first() + bool_size, + handle.get_stream()); + } + auto tmp_flags = host_scalar_allgather( + minor_comm, frontier_bitmap ? uint8_t{1} : uint8_t{0}, handle.get_stream()); + use_bitmap_flags.resize(tmp_flags.size()); + std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { + return flag == uint8_t{1}; + }); } // 2. fill the buffers @@ -687,7 +695,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, resize_dataframe_buffer( edge_partition_frontier_key_buffer, local_frontier_sizes[i], handle.get_stream()); - if constexpr (use_bitmap) { + if constexpr (try_bitmap) { std::variant, decltype(frontier_key_first)> v_list{}; if (use_bitmap_flags[i]) { v_list = (static_cast(i) == minor_comm_rank) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 0ba20f6f1ce..24abe57437c 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -943,6 +943,8 @@ rmm::device_uvector compute_keep_flags( // more than one, the GPU with (comm_rank == root) has the highest priority, the GPUs in the same // DGX node should be the next) + // FIXME: for high & mid, it will be mostly local... should I do this? Or ask for just missing + // ones??? rmm::device_uvector priorities(thrust::distance(value_first, value_last), stream_view); thrust::tabulate( @@ -1074,6 +1076,7 @@ void gather_offset_value_pairs_and_update_vertex_value_output( shrink_to_fit_dataframe_buffer(values, stream_view); if (comm_rank == root) { + // FIXME: this scatter can sequentialize GPU operations... thrust::scatter(rmm::exec_policy(stream_view), get_dataframe_buffer_begin(rx_values), get_dataframe_buffer_end(rx_values), @@ -1082,6 +1085,8 @@ void gather_offset_value_pairs_and_update_vertex_value_output( } } +#define PER_V_PERFORMANCE_MEASUREMENT 1 + template ; @@ -1149,14 +1158,33 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - constexpr bool use_bitmap = GraphViewType::is_multi_gpu && + constexpr bool try_bitmap = GraphViewType::is_multi_gpu && !std::is_same_v && std::is_same_v; [[maybe_unused]] constexpr auto max_segments = detail::num_sparse_segments_per_vertex_partition + size_t{1}; - // 1. prepare key list + /* 1. 
compute subgroup_size */ + + [[maybe_unused]] std::conditional_t>, + int, + std::byte /* dummy */> + subgroup_size{}; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::max(num_gpus_per_node / minor_comm_size, int{1}) + : std::min(minor_comm_size, num_gpus_per_node); + } + + // 2. prepare key list auto sorted_unique_nzd_key_last = sorted_unique_key_last; if constexpr (use_input_key) { @@ -1176,120 +1204,60 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - std::conditional_t, std::byte /* dummy */> - local_key_list_sizes{}; - if constexpr (use_input_key) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_key_list_sizes = host_scalar_allgather( - minor_comm, - static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), - handle.get_stream()); - } else { - local_key_list_sizes = std::vector{ - static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; - } - } + // 3. compute optional bitmap info std:: - conditional_t>, std::byte /* dummy */> + conditional_t>, std::byte /* dummy */> key_list_bitmap{}; - std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; - if constexpr (use_bitmap) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto segment_offsets = - graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); - size_t bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) - : graph_view.local_vertex_partition_range_size(); - - std::tie(key_list_bitmap, use_bitmap_flags) = - compute_vertex_list_bitmap_info(minor_comm, - sorted_unique_key_first, - sorted_unique_nzd_key_last, - graph_view.local_vertex_partition_range_first(), - graph_view.local_vertex_partition_range_first() + bool_size, - handle.get_stream()); - } - - // 2. compute subgroup_size, set-up temporary buffers & stream pool, and initialize - - [[maybe_unused]] std::conditional_t>, - int, - std::byte /* dummy */> - subgroup_size{}; - if constexpr (GraphViewType::is_multi_gpu && update_major && - std::is_same_v>) { + if constexpr (try_bitmap) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - - int num_gpus_per_node{}; - RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); - subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm - ? std::max(num_gpus_per_node / minor_comm_size, int{1}) - : std::min(minor_comm_size, num_gpus_per_node); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + auto segment_offsets = + graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); + size_t bool_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(); + + key_list_bitmap = + compute_vertex_list_bitmap_info(sorted_unique_key_first, + sorted_unique_nzd_key_last, + graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_first() + bool_size, + handle.get_stream()); + } } - using minor_tmp_buffer_type = std::conditional_t, - edge_dst_property_t>; - [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - minor_tmp_buffer = std::make_unique(handle, graph_view); - } + // 4. collect local_key_list_sizes & use_bitmap_flags - using edge_partition_minor_output_device_view_t = - std::conditional_tmutable_view().value_first())>, - void /* dummy */>; - - if constexpr (update_major) { // no vertices in the zero degree segment are visited + std::conditional_t, std::byte /* dummy */> + local_key_list_sizes{}; + std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); if constexpr (use_input_key) { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + - thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), - vertex_value_output_first + - thrust::distance(sorted_unique_key_first, sorted_unique_key_last), - init); - } else { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + *((*segment_offsets).rbegin() + 1), - vertex_value_output_first + *((*segment_offsets).rbegin()), - init); - } + local_key_list_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), + handle.get_stream()); + } + if constexpr (try_bitmap) { + auto tmp_flags = host_scalar_allgather( + minor_comm, key_list_bitmap ? uint8_t{1} : uint8_t{0}, handle.get_stream()); + use_bitmap_flags.resize(tmp_flags.size()); + std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { + return flag == uint8_t{1}; + }); } } else { - if constexpr (GraphViewType::is_multi_gpu) { - auto minor_init = init; - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may - // not store values for the entire minor range - minor_init = ReduceOp::identity_element; - } else { - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; - } - fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); - } else { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first, - vertex_value_output_first + graph_view.local_vertex_partition_range_size(), - init); + if constexpr (use_input_key) { + local_key_list_sizes = std::vector{ + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; } } + // 5. 
set-up stream pool + std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { if ((graph_view.local_edge_partition_segment_offsets(0)) && @@ -1305,7 +1273,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // peak memory requirement per loop is // update_major ? (use_input_key ? aggregate key list size : V) / comm_size * sizeof(T) : 0 // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - // FIXME: should we consider edge_partition_key_buffer as well? size_t num_streams = std::min(static_cast(minor_comm_size) * max_segments, @@ -1360,6 +1327,23 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } + // 6. set-up temporary buffers + + using minor_tmp_buffer_type = std::conditional_t, + edge_dst_property_t>; + [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + minor_tmp_buffer = std::make_unique(handle, graph_view); + } + + using edge_partition_minor_output_device_view_t = + std::conditional_tmutable_view().value_first())>, + void /* dummy */>; + std::vector> major_tmp_buffers{}; if constexpr (GraphViewType::is_multi_gpu && update_major) { std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), @@ -1423,11 +1407,53 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - if (stream_pool_indices) { handle.sync_stream(); } + // 7. initialize - // 3. proces local edge partitions + if constexpr (update_major) { // no vertices in the zero degree segment are visited + if constexpr (use_input_key) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_key_last), + init); + } else { + size_t partition_idx = 0; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + *((*segment_offsets).rbegin() + 1), + vertex_value_output_first + *((*segment_offsets).rbegin()), + init); + } + } + } else { + if constexpr (GraphViewType::is_multi_gpu) { + auto minor_init = init; + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may + // not store values for the entire minor range + minor_init = ReduceOp::identity_element; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; + } + fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); + } else { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + graph_view.local_vertex_partition_range_size(), + init); + } + } - auto edge_mask_view = graph_view.edge_mask_view(); + // 8. create key_segment_offset_vectors std::conditional_t>, @@ -1445,7 +1471,21 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } + if (stream_pool_indices) { handle.sync_stream(); } + + // 9. 
proces local edge partitions + + auto edge_mask_view = graph_view.edge_mask_view(); + +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time1 = std::chrono::steady_clock::now(); +#endif for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime0 = std::chrono::steady_clock::now(); +#endif auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); @@ -1489,6 +1529,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto edge_partition_key_buffer = allocate_optional_dataframe_buffer< std::conditional_t>(0, loop_stream); +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime1 = std::chrono::steady_clock::now(); +#endif if constexpr (GraphViewType::is_multi_gpu && use_input_key) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); @@ -1498,7 +1542,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, resize_optional_dataframe_buffer( edge_partition_key_buffer, local_key_list_sizes[i], loop_stream); - if constexpr (use_bitmap) { + if constexpr (try_bitmap) { std::variant, decltype(sorted_unique_key_first)> v_list{}; if (use_bitmap_flags[i]) { @@ -1533,6 +1577,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffer); } } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime2 = std::chrono::steady_clock::now(); +#endif std::optional> key_segment_offsets{std::nullopt}; if (segment_offsets) { @@ -1550,10 +1598,12 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, (*key_segment_offsets).begin(), [](vertex_t offset) { return static_cast(offset); }); } - } else { - key_segment_offsets = std::nullopt; } RAFT_CUDA_TRY(cudaStreamSynchronize(loop_stream)); +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime3 = std::chrono::steady_clock::now(); +#endif edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -1592,6 +1642,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, decltype(edge_partition_key_first), decltype(thrust::make_counting_iterator(vertex_t{0}))>; +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime4 = std::chrono::steady_clock::now(); +#endif if (key_segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); @@ -1663,7 +1717,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); } segment_key_first += (*key_segment_offsets)[2]; - auto num_keys = (*key_segment_offsets)[3] - (*key_segment_offsets)[2]; detail::per_v_transform_reduce_e_low_degree <<>>( edge_partition, @@ -1777,6 +1830,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, pred_op); } } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime5 = std::chrono::steady_clock::now(); +#endif if constexpr (GraphViewType::is_multi_gpu && update_major) { auto& minor_comm = 
handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -1936,9 +1993,26 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // as *segment_offsets do not necessarily coincide // in different edge partitions). } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime6 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = subtime1 - subtime0; + std::chrono::duration subdur1 = subtime2 - subtime1; + std::chrono::duration subdur2 = subtime3 - subtime2; + std::chrono::duration subdur3 = subtime4 - subtime3; + std::chrono::duration subdur4 = subtime5 - subtime4; + std::chrono::duration subdur5 = subtime6 - subtime5; + std::cout << "\t\t\tdetail::per_v i=" << i << " took (" << subdur0.count() << "," + << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," + << subdur4.count() << "," << subdur5.count() << ")" << std::endl; +#endif } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time2 = std::chrono::steady_clock::now(); +#endif - // 4. communication + // 10. communication if constexpr (GraphViewType::is_multi_gpu && update_major && std::is_same_v>) { @@ -2013,6 +2087,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time3 = std::chrono::steady_clock::now(); +#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } @@ -2102,6 +2180,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time4 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = time1 - time0; + std::chrono::duration dur1 = time2 - time1; + std::chrono::duration dur2 = time3 - time2; + std::chrono::duration dur3 = time4 - time3; + std::cout << "\t\tdetail::per_v took (" << dur0.count() << "," << dur1.count() << "," + << dur2.count() << ")" << std::endl; +#endif } } // namespace detail diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 58dbf7e74a0..a1c4000d806 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -329,6 +329,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_view(size_t{0})); auto edge_partition_keys = edge_minor_property_output.keys(); for (int i = 0; i < major_comm_size; ++i) { + // FIXME: we can optionally use bitmap for this broadcast // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() device_bcast( diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index 2915328a15f..7483907549c 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -119,9 +119,7 @@ std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, } template -std::tuple>, std::vector> -compute_vertex_list_bitmap_info( - raft::comms::comms_t const& comm, +std::optional> compute_vertex_list_bitmap_info( VertexIterator sorted_unique_vertex_first, VertexIterator sorted_unique_vertex_last, typename thrust::iterator_traits::value_type vertex_range_first, @@ -134,43 +132,29 @@ compute_vertex_list_bitmap_info( 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); std::optional> bitmap{std::nullopt}; - std::vector 
use_bitmap_flags{}; - - if (comm.get_size() > 1) { - auto v_list_size = static_cast( - thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); - auto bool_size = vertex_range_last - vertex_range_first; - - if (v_list_size > static_cast(bool_size * threshold_ratio)) { - bitmap = rmm::device_uvector(packed_bool_size(bool_size), stream_view); - thrust::fill(rmm::exec_policy(stream_view), - (*bitmap).begin(), - (*bitmap).end(), - packed_bool_empty_mask()); - thrust::for_each(rmm::exec_policy(stream_view), - sorted_unique_vertex_first, - sorted_unique_vertex_last, - [bitmap = raft::device_span((*bitmap).data(), (*bitmap).size()), - v_first = vertex_range_first] __device__(vertex_t v) { - auto v_offset = v - v_first; - cuda::atomic_ref word( - bitmap[packed_bool_offset(v_offset)]); - word.fetch_or(cugraph::packed_bool_mask(v_offset), - cuda::std::memory_order_relaxed); - }); - } - - auto tmp_flags = host_scalar_allgather(comm, bitmap ? uint8_t{1} : uint8_t{0}, stream_view); - use_bitmap_flags.resize(tmp_flags.size()); - std::transform( - tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](uint8_t tmp_flag) { - return (tmp_flag == uint8_t{1}); - }); - } else { - use_bitmap_flags = {false}; - } - return std::make_tuple(std::move(bitmap), std::move(use_bitmap_flags)); + auto v_list_size = + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + auto bool_size = vertex_range_last - vertex_range_first; + + if (v_list_size > static_cast(bool_size * threshold_ratio)) { + bitmap = rmm::device_uvector(packed_bool_size(bool_size), stream_view); + thrust::fill( + rmm::exec_policy(stream_view), (*bitmap).begin(), (*bitmap).end(), packed_bool_empty_mask()); + thrust::for_each(rmm::exec_policy(stream_view), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + [bitmap = raft::device_span((*bitmap).data(), (*bitmap).size()), + v_first = vertex_range_first] __device__(vertex_t v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(cugraph::packed_bool_mask(v_offset), + cuda::std::memory_order_relaxed); + }); + } + + return bitmap; } template diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index a3cb78b037a..26b30d87c53 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -278,7 +278,6 @@ void bfs(raft::handle_t const& handle, #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown0 = std::chrono::steady_clock::now(); - std::cout << "topdown0 " << std::endl; #endif topdown_e_op_t e_op{}; e_op.prev_visited_flags = @@ -352,6 +351,8 @@ void bfs(raft::handle_t const& handle, #endif if (direction_optimizing) { + // FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed concurrently. + // FIXME: also the above fill_edge_dst_property can be executed concurrently. 
auto m_f = thrust::transform_reduce( handle.get_thrust_policy(), vertex_frontier.bucket(bucket_idx_next).begin(), @@ -446,72 +447,57 @@ void bfs(raft::handle_t const& handle, RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto bottomup0 = std::chrono::steady_clock::now(); #endif - bottomup_e_op_t e_op{}; - bottomup_pred_op_t pred_op{}; - pred_op.prev_visited_flags = - detail::edge_partition_endpoint_property_device_view_t( - prev_dst_visited_flags.view()); - pred_op.dst_first = graph_view.local_edge_partition_dst_range_first(); - - rmm::device_uvector predecessor_buffer( - vertex_frontier.bucket(bucket_idx_cur).size(), handle.get_stream()); - per_v_transform_reduce_if_outgoing_e(handle, - graph_view, - vertex_frontier.bucket(bucket_idx_cur), - edge_src_dummy_property_t{}.view(), - edge_dst_dummy_property_t{}.view(), - edge_dummy_property_t{}.view(), - e_op, - invalid_vertex, - reduce_op::any(), - pred_op, - predecessor_buffer.begin(), - true); - - rmm::device_uvector new_frontier_vertex_buffer( - thrust::count_if(handle.get_thrust_policy(), - predecessor_buffer.begin(), - predecessor_buffer.end(), - detail::is_not_equal_t{invalid_vertex}), - handle.get_stream()); + rmm::device_uvector new_frontier_vertex_buffer(0, handle.get_stream()); { - rmm::device_uvector tmp_predecessor_buffer(new_frontier_vertex_buffer.size(), - handle.get_stream()); - auto pair_first = thrust::make_zip_iterator(vertex_frontier.bucket(bucket_idx_cur).cbegin(), - predecessor_buffer.begin()); - thrust::copy_if(handle.get_thrust_policy(), - pair_first, - pair_first + vertex_frontier.bucket(bucket_idx_cur).size(), - thrust::make_zip_iterator(new_frontier_vertex_buffer.begin(), - tmp_predecessor_buffer.begin()), - cuda::proclaim_return_type([] __device__(auto pair) { - return thrust::get<1>(pair) != invalid_vertex; - })); - predecessor_buffer = std::move(tmp_predecessor_buffer); - } -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto bottomup1 = std::chrono::steady_clock::now(); -#endif - - auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), - predecessor_buffer.begin()); - thrust::scatter( - handle.get_thrust_policy(), - input_pair_first, - input_pair_first + new_frontier_vertex_buffer.size(), - thrust::make_transform_iterator( - new_frontier_vertex_buffer.begin(), - detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), - thrust::make_zip_iterator(distances, predecessor_first)); + bottomup_e_op_t e_op{}; + bottomup_pred_op_t pred_op{}; + pred_op.prev_visited_flags = + detail::edge_partition_endpoint_property_device_view_t( + prev_dst_visited_flags.view()); + pred_op.dst_first = graph_view.local_edge_partition_dst_range_first(); + + rmm::device_uvector predecessor_buffer( + vertex_frontier.bucket(bucket_idx_cur).size(), handle.get_stream()); + per_v_transform_reduce_if_outgoing_e(handle, + graph_view, + vertex_frontier.bucket(bucket_idx_cur), + edge_src_dummy_property_t{}.view(), + edge_dst_dummy_property_t{}.view(), + edge_dummy_property_t{}.view(), + e_op, + invalid_vertex, + reduce_op::any(), + pred_op, + predecessor_buffer.begin(), + true); + auto input_pair_first = thrust::make_zip_iterator(thrust::make_constant_iterator(depth + 1), + predecessor_buffer.begin()); + + // FIXME: this scatter_if and the resize below can be concurrently executed. 
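+        // The scatter_if below writes (depth + 1, predecessor) pairs to the distance & predecessor
+        // outputs of the current-frontier vertices whose predecessor_buffer entry is valid
+        // (i.e. != invalid_vertex); the copy_if that follows collects exactly those vertices into
+        // new_frontier_vertex_buffer.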
+ thrust::scatter_if( + handle.get_thrust_policy(), + input_pair_first, + input_pair_first + predecessor_buffer.size(), + thrust::make_transform_iterator( + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + detail::shift_left_t{graph_view.local_vertex_partition_range_first()}), + predecessor_buffer.begin(), + thrust::make_zip_iterator(distances, predecessor_first), + detail::is_not_equal_t{invalid_vertex}); + + new_frontier_vertex_buffer.resize(predecessor_buffer.size(), handle.get_stream()); + new_frontier_vertex_buffer.resize( + thrust::distance(new_frontier_vertex_buffer.begin(), + thrust::copy_if(handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_cur).cbegin(), + vertex_frontier.bucket(bucket_idx_cur).cend(), + predecessor_buffer.begin(), + new_frontier_vertex_buffer.begin(), + detail::is_not_equal_t{invalid_vertex})), + handle.get_stream()); - assert(direction_optimizing); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto bottomup2 = std::chrono::steady_clock::now(); -#endif + assert(direction_optimizing); - { rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), handle.get_stream()); tmp_vertices.resize( @@ -527,7 +513,7 @@ void bfs(raft::handle_t const& handle, } #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto bottomup3 = std::chrono::steady_clock::now(); + auto bottomup1 = std::chrono::steady_clock::now(); #endif next_aggregate_vertex_frontier_size = @@ -539,17 +525,15 @@ void bfs(raft::handle_t const& handle, : static_cast(new_frontier_vertex_buffer.size()); #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto bottomup4 = std::chrono::steady_clock::now(); + auto bottomup2 = std::chrono::steady_clock::now(); #endif if (next_aggregate_vertex_frontier_size == 0) { #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; - std::chrono::duration dur2 = bottomup3 - bottomup2; - std::chrono::duration dur3 = bottomup4 - bottomup3; - std::chrono::duration dur = bottomup4 - bottomup0; + std::chrono::duration dur = bottomup2 - bottomup0; std::cout << "bottomup took " << dur.count() << " (" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << "," << dur3.count() << " s." << std::endl; + << ") s." << std::endl; #endif break; } @@ -562,9 +546,10 @@ void bfs(raft::handle_t const& handle, true); #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto bottomup5 = std::chrono::steady_clock::now(); + auto bottomup3 = std::chrono::steady_clock::now(); #endif + // FIXME: better move this right after host_scalar_allreduce??? auto aggregate_nzd_unvisted_vertices = GraphViewType::is_multi_gpu ? 
host_scalar_allreduce(handle.get_comms(), @@ -580,7 +565,7 @@ void bfs(raft::handle_t const& handle, } #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto bottomup6 = std::chrono::steady_clock::now(); + auto bottomup4 = std::chrono::steady_clock::now(); #endif if (top_down) { // swithcing to top-down @@ -596,18 +581,16 @@ void bfs(raft::handle_t const& handle, } #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto bottomup7 = std::chrono::steady_clock::now(); + auto bottomup5 = std::chrono::steady_clock::now(); std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; std::chrono::duration dur2 = bottomup3 - bottomup2; std::chrono::duration dur3 = bottomup4 - bottomup3; std::chrono::duration dur4 = bottomup5 - bottomup4; - std::chrono::duration dur5 = bottomup6 - bottomup5; - std::chrono::duration dur6 = bottomup7 - bottomup6; - std::chrono::duration dur = bottomup7 - bottomup0; + std::chrono::duration dur = bottomup5 - bottomup0; std::cout << "bottomup took " << dur.count() << " (" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << "," - << dur5.count() << "," << dur6.count() << ") s." << std::endl; + << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") s." + << std::endl; #endif } cur_aggregate_vertex_frontier_size = next_aggregate_vertex_frontier_size; From 3f77ee13310971138bd051151d3593b673f06a7a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 16 Aug 2024 16:44:02 -0700 Subject: [PATCH 024/126] performance tuning for BFS --- .../prims/detail/per_v_transform_reduce_e.cuh | 1209 ++++++++++------- 1 file changed, 705 insertions(+), 504 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 24abe57437c..0ad014fea96 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -36,6 +36,7 @@ #include #include +#include #include #include @@ -927,13 +928,14 @@ __host__ __device__ int priority_to_rank( } template -rmm::device_uvector compute_keep_flags( +std::optional> compute_keep_flags( raft::comms::comms_t const& comm, ValueIterator value_first, ValueIterator value_last, int root, int subgroup_size /* faster interconnect within a subgroup */, typename thrust::iterator_traits::value_type init, + bool ignore_local_values, rmm::cuda_stream_view stream_view) { auto const comm_rank = comm.get_rank(); @@ -943,21 +945,26 @@ rmm::device_uvector compute_keep_flags( // more than one, the GPU with (comm_rank == root) has the highest priority, the GPUs in the same // DGX node should be the next) - // FIXME: for high & mid, it will be mostly local... should I do this? Or ask for just missing - // ones??? rmm::device_uvector priorities(thrust::distance(value_first, value_last), stream_view); - thrust::tabulate( - rmm::exec_policy(stream_view), - priorities.begin(), - priorities.end(), - [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { - auto val = *(value_first + offset); - return (val != init) - ? 
rank_to_priority( - comm_rank, root, subgroup_size, comm_size, static_cast(offset)) - : std::numeric_limits::max(); // lowest priority - }); + if (ignore_local_values) { + thrust::fill(rmm::exec_policy(stream_view), + priorities.begin(), + priorities.end(), + std::numeric_limits::max()); + } else { + thrust::tabulate( + rmm::exec_policy(stream_view), + priorities.begin(), + priorities.end(), + [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { + auto val = *(value_first + offset); + return (val != init) + ? rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)) + : std::numeric_limits::max(); // lowest priority + }); + } device_allreduce(comm, priorities.data(), priorities.data(), @@ -965,20 +972,23 @@ rmm::device_uvector compute_keep_flags( raft::comms::op_t::MIN, stream_view); - rmm::device_uvector keep_flags(priorities.size(), stream_view); - auto offset_priority_pair_first = - thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); - thrust::transform(rmm::exec_policy(stream_view), - offset_priority_pair_first, - offset_priority_pair_first + priorities.size(), - keep_flags.begin(), - [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { - auto offset = thrust::get<0>(pair); - auto priority = thrust::get<1>(pair); - auto rank = - priority_to_rank(priority, root, subgroup_size, comm_size, offset); - return (rank == comm_rank); - }); + std::optional> keep_flags{std::nullopt}; + if (!ignore_local_values) { + keep_flags = rmm::device_uvector(priorities.size(), stream_view); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::transform(rmm::exec_policy(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + priorities.size(), + (*keep_flags).begin(), + [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = + priority_to_rank(priority, root, subgroup_size, comm_size, offset); + return (rank == comm_rank); + }); + } return keep_flags; } @@ -986,23 +996,24 @@ rmm::device_uvector compute_keep_flags( template std::tuple, dataframe_buffer_type_t::value_type>> -compute_offset_value_pairs(raft::comms::comms_t const& comm, - ValueIterator value_first, - ValueIterator value_last, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - typename thrust::iterator_traits::value_type init, - rmm::cuda_stream_view stream_view) +gather_offset_value_pairs(raft::comms::comms_t const& comm, + ValueIterator value_first, + ValueIterator value_last, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + bool ignore_local_values, // no valid value in [value_first, value_last) + rmm::cuda_stream_view stream_view) { using value_t = typename thrust::iterator_traits::value_type; auto const comm_rank = comm.get_rank(); auto const comm_size = comm.get_size(); - rmm::device_uvector keep_flags(0, stream_view); + std::optional> keep_flags{std::nullopt}; if (comm_size <= std::numeric_limits::max()) { // priority == uint8_t keep_flags = compute_keep_flags( - comm, value_first, value_last, root, subgroup_size, init, stream_view); + comm, value_first, value_last, root, subgroup_size, init, ignore_local_values, stream_view); } #if 0 // FIXME: this should be enabled (currently, raft does not support allreduce 
on uint16_t). else if (comm_size <= std::numeric_limits::max()) { // priority == uint16_t @@ -1012,46 +1023,37 @@ compute_offset_value_pairs(raft::comms::comms_t const& comm, #endif else { // priority_t == uint32_t keep_flags = compute_keep_flags( - comm, value_first, value_last, root, subgroup_size, init, stream_view); + comm, value_first, value_last, root, subgroup_size, init, ignore_local_values, stream_view); } - auto copy_size = thrust::count_if( - rmm::exec_policy(stream_view), keep_flags.begin(), keep_flags.end(), thrust::identity{}); - - rmm::device_uvector offsets(copy_size, stream_view); - auto values = allocate_dataframe_buffer(copy_size, stream_view); - auto offset_value_pair_first = - thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), value_first); - thrust::copy_if(rmm::exec_policy(stream_view), - offset_value_pair_first, - offset_value_pair_first + keep_flags.size(), - keep_flags.begin(), - thrust::make_zip_iterator(offsets.begin(), get_dataframe_buffer_begin(values)), - thrust::identity{}); - - return std::make_tuple(std::move(offsets), std::move(values)); -} - -template -void gather_offset_value_pairs_and_update_vertex_value_output( - raft::comms::comms_t const& comm, - rmm::device_uvector&& offsets, - dataframe_buffer_type_t&& values, - VertexValueOutputIterator vertex_value_output_first, - int root, - rmm::cuda_stream_view stream_view) -{ - auto const comm_rank = comm.get_rank(); + rmm::device_uvector offsets(0, stream_view); + auto values = allocate_dataframe_buffer(0, stream_view); + if (keep_flags) { + auto copy_size = thrust::count_if(rmm::exec_policy(stream_view), + (*keep_flags).begin(), + (*keep_flags).end(), + thrust::identity{}); + + offsets.resize(copy_size, stream_view); + resize_dataframe_buffer(values, copy_size, stream_view); + auto offset_value_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), value_first); + thrust::copy_if(rmm::exec_policy(stream_view), + offset_value_pair_first, + offset_value_pair_first + (*keep_flags).size(), + (*keep_flags).begin(), + thrust::make_zip_iterator(offsets.begin(), get_dataframe_buffer_begin(values)), + thrust::identity{}); + } auto rx_sizes = host_scalar_gather(comm, offsets.size(), root, stream_view); std::vector rx_displs{}; - rmm::device_uvector rx_offsets(0, stream_view); if (comm_rank == root) { rx_displs.resize(rx_sizes.size()); std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); - rx_offsets.resize(rx_displs.back() + rx_sizes.back(), stream_view); } + rmm::device_uvector rx_offsets(comm_rank == root ? (rx_displs.back() + rx_sizes.back()) : size_t{0}, stream_view); device_gatherv(comm, offsets.begin(), rx_offsets.begin(), @@ -1063,7 +1065,8 @@ void gather_offset_value_pairs_and_update_vertex_value_output( offsets.resize(0, stream_view); offsets.shrink_to_fit(stream_view); - auto rx_values = allocate_dataframe_buffer(rx_offsets.size(), stream_view); + auto rx_values = + allocate_dataframe_buffer(comm_rank == root ? (rx_displs.back() + rx_sizes.back()) : size_t{0}, stream_view); device_gatherv(comm, get_dataframe_buffer_begin(values), get_dataframe_buffer_begin(rx_values), @@ -1075,13 +1078,232 @@ void gather_offset_value_pairs_and_update_vertex_value_output( resize_dataframe_buffer(values, 0, stream_view); shrink_to_fit_dataframe_buffer(values, stream_view); - if (comm_rank == root) { - // FIXME: this scatter can sequentialize GPU operations... 
- thrust::scatter(rmm::exec_policy(stream_view), - get_dataframe_buffer_begin(rx_values), - get_dataframe_buffer_end(rx_values), - rx_offsets.begin(), - vertex_value_output_first); + return std::make_tuple(std::move(rx_offsets), std::move(rx_values)); +} + +template +void per_v_transform_reduce_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + OptionalKeyIterator edge_partition_key_first, + OptionalKeyIterator edge_partition_key_last, + EdgeSrcValueInputWrapper edge_partition_src_value_input, + EdgeDstValueInputWrapper edge_partition_dst_value_input, + EdgeValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + ResultValueOutputIteratorOrWrapper output_buffer, + EdgeOp e_op, + T major_init, + T major_identity_element, + ReduceOp reduce_op, + PredOp pred_op, + std::optional> const& key_segment_offsets, + std::optional> const& edge_partition_stream_pool_indices) +{ + constexpr bool use_input_key = !std::is_same_v; + + using vertex_t = typename GraphViewType::vertex_type; + using segment_key_iterator_t = + std::conditional_t; + + if (key_segment_offsets) { + static_assert(detail::num_sparse_segments_per_vertex_partition == 3); + + if (edge_partition.dcs_nzd_vertex_count()) { + auto exec_stream = + edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) + : handle.get_stream(); + + if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit + // every vertex in the hypersparse segment + thrust::fill(rmm::exec_policy(exec_stream), + output_buffer + (*key_segment_offsets)[3], + output_buffer + (*key_segment_offsets)[4], + major_init); + } + + auto segment_size = use_input_key + ? ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]) + : static_cast(*(edge_partition.dcs_nzd_vertex_count())); + if (segment_size > 0) { + raft::grid_1d_thread_t update_grid(segment_size, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[3]; } + auto segment_key_first = edge_partition_key_first; + auto segment_key_last = edge_partition_key_last; + if constexpr (use_input_key) { + segment_key_first += (*key_segment_offsets)[3]; + segment_key_last += (*key_segment_offsets)[4]; + } else { + assert(segment_key_first == nullptr); + assert(segment_key_last == nullptr); + } + detail::per_v_transform_reduce_e_hypersparse + <<>>( + edge_partition, + segment_key_first, + segment_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]) { + auto exec_stream = + edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; } + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + segment_key_first += (*key_segment_offsets)[2]; + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + segment_key_first, + segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = + edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + auto segment_output_buffer = output_buffer; + if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; } + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + segment_key_first += (*key_segment_offsets)[1]; + detail::per_v_transform_reduce_e_mid_degree + <<>>( + edge_partition, + segment_key_first, + segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + segment_output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + if ((*key_segment_offsets)[1] > 0) { + auto exec_stream = + edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]) + : handle.get_stream(); + raft::grid_1d_block_t update_grid((*key_segment_offsets)[1], + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_high_degree + <<>>( + edge_partition, + segment_key_first, + segment_key_first + (*key_segment_offsets)[1], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op); + } + } else { + size_t num_keys{}; + if constexpr (use_input_key) { + num_keys = + static_cast(thrust::distance(edge_partition_key_first, edge_partition_key_last)); + } else { + num_keys = static_cast(edge_partition.major_range_size()); + } + + if (num_keys > size_t{0}) { + raft::grid_1d_thread_t update_grid(num_keys, + detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + segment_key_iterator_t segment_key_first{}; + if constexpr (use_input_key) { + segment_key_first = edge_partition_key_first; + } else { + segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); + } + detail::per_v_transform_reduce_e_low_degree + <<>>( + edge_partition, + segment_key_first, + segment_key_first + num_keys, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + reduce_op, + pred_op); + } } } @@ -1118,6 +1340,14 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #endif constexpr bool update_major = (incoming == GraphViewType::is_storage_transposed); constexpr bool use_input_key = !std::is_same_v; + constexpr bool filter_input_key = + GraphViewType::is_multi_gpu && update_major && use_input_key && + std::is_same_v>; // if GraphViewType::is_multi_gpu && update_major && + // std::is_same_v>, for any + // vertex in the frontier, we need to visit only local edges + // if we find any valid local edge (FIXME: this is + // applicable even when use_input_key is false). static_assert(update_major || !use_input_key); static_assert( @@ -1165,26 +1395,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, [[maybe_unused]] constexpr auto max_segments = detail::num_sparse_segments_per_vertex_partition + size_t{1}; - /* 1. compute subgroup_size */ - - [[maybe_unused]] std::conditional_t>, - int, - std::byte /* dummy */> - subgroup_size{}; - if constexpr (GraphViewType::is_multi_gpu && update_major && - std::is_same_v>) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - int num_gpus_per_node{}; - RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); - subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm - ? std::max(num_gpus_per_node / minor_comm_size, int{1}) - : std::min(minor_comm_size, num_gpus_per_node); - } - - // 2. prepare key list + // 1. exclude zero degree keys auto sorted_unique_nzd_key_last = sorted_unique_key_last; if constexpr (use_input_key) { @@ -1204,7 +1415,188 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // 3. 
compute optional bitmap info + // 2. initialize vertex value output buffer + + if constexpr (update_major) { // no vertices in the zero degree segment are visited (otherwise, + // no need to initialize) + if constexpr (use_input_key) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_key_last), + init); + } else { + size_t partition_idx = 0; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first + *((*segment_offsets).rbegin() + 1), + vertex_value_output_first + *((*segment_offsets).rbegin()), + init); + } + } + } else { + if constexpr (GraphViewType::is_multi_gpu) { + /* no need to initialize (we use minor_tmp_buffer) */ + } else { + thrust::fill(handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + graph_view.local_vertex_partition_range_size(), + init); + } + } + + // 3. filter input keys + + auto edge_mask_view = graph_view.edge_mask_view(); + + auto tmp_key_buffer = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + auto tmp_output_indices = + allocate_optional_dataframe_buffer>( + 0, handle.get_stream()); + std::conditional_t, + VertexValueOutputIterator> + tmp_vertex_value_output_first{}; + if constexpr (filter_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(static_cast(minor_comm_rank))); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, static_cast(minor_comm_rank)) + : thrust::nullopt; + + auto segment_offsets = + graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); + + std::optional> edge_partition_stream_pool_indices{std::nullopt}; + if (segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + edge_partition_stream_pool_indices = std::vector(max_segments); + std::iota((*edge_partition_stream_pool_indices).begin(), + (*edge_partition_stream_pool_indices).end(), + size_t{0}); + } + + std::optional> key_segment_offsets{std::nullopt}; + if (segment_offsets) { + if constexpr (use_input_key) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + edge_partition.major_range_first(), + handle.get_stream()); + } else { + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); + } + } + + handle.sync_stream(); + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t( + edge_dst_value_input, static_cast(minor_comm_rank)); + } else { + edge_partition_src_value_input = edge_partition_src_input_device_view_t( + edge_src_value_input, static_cast(minor_comm_rank)); + edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, static_cast(minor_comm_rank)); + + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + vertex_value_output_first, + e_op, + init, + init, + reduce_op, + pred_op, + key_segment_offsets, + edge_partition_stream_pool_indices ? 
std::make_optional>( + (*edge_partition_stream_pool_indices).data(), + (*edge_partition_stream_pool_indices).size()) + : std::nullopt); + + handle.sync_stream_pool(*edge_partition_stream_pool_indices); + + auto num_tmp_keys = thrust::count( + handle.get_thrust_policy(), + vertex_value_output_first, + vertex_value_output_first + + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + init); // we allow false positives (some edge operations may actually return init) + + resize_optional_dataframe_buffer(tmp_key_buffer, num_tmp_keys, handle.get_stream()); + resize_optional_dataframe_buffer(tmp_output_indices, num_tmp_keys, handle.get_stream()); + + auto input_first = + thrust::make_zip_iterator(sorted_unique_key_first, thrust::make_counting_iterator(size_t{0})); + thrust::copy_if( + handle.get_thrust_policy(), + input_first, + input_first + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), + vertex_value_output_first, + thrust::make_zip_iterator(get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_output_indices)), + is_equal_t{init}); + + sorted_unique_key_first = get_optional_dataframe_buffer_begin(tmp_key_buffer); + sorted_unique_nzd_key_last = get_optional_dataframe_buffer_end(tmp_key_buffer); + tmp_vertex_value_output_first = thrust::make_permutation_iterator( + vertex_value_output_first, get_optional_dataframe_buffer_begin(tmp_output_indices)); + } else { + tmp_vertex_value_output_first = vertex_value_output_first; + } + + /* 4. compute subgroup_size */ + + [[maybe_unused]] std::conditional_t>, + int, + std::byte /* dummy */> + subgroup_size{}; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::max(num_gpus_per_node / minor_comm_size, int{1}) + : std::min(minor_comm_size, num_gpus_per_node); + } + + // 5. compute optional bitmap info std:: conditional_t>, std::byte /* dummy */> @@ -1216,8 +1608,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto segment_offsets = graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); - size_t bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) - : graph_view.local_vertex_partition_range_size(); + size_t bool_size = + segment_offsets + ? *((*segment_offsets).rbegin() + 1) + : graph_view + .local_vertex_partition_range_size(); // FIXME: if filtered, we can reduce bool_size key_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, @@ -1228,7 +1623,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // 4. collect local_key_list_sizes & use_bitmap_flags + // 6. collect local_key_list_sizes & use_bitmap_flags std::conditional_t, std::byte /* dummy */> local_key_list_sizes{}; @@ -1256,7 +1651,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // 5. set-up stream pool + // 7. 
set-up stream pool std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { @@ -1322,27 +1717,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, assert((num_streams % max_segments) == 0); stream_pool_indices = std::vector(num_streams); std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); } } } - // 6. set-up temporary buffers - - using minor_tmp_buffer_type = std::conditional_t, - edge_dst_property_t>; - [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; - if constexpr (GraphViewType::is_multi_gpu && !update_major) { - minor_tmp_buffer = std::make_unique(handle, graph_view); - } - - using edge_partition_minor_output_device_view_t = - std::conditional_tmutable_view().value_first())>, - void /* dummy */>; + // 8. set-up temporary buffers std::vector> major_tmp_buffers{}; if constexpr (GraphViewType::is_multi_gpu && update_major) { @@ -1387,6 +1766,32 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); } + using minor_tmp_buffer_type = std::conditional_t, + edge_dst_property_t>; + [[maybe_unused]] std::unique_ptr minor_tmp_buffer{}; + if constexpr (GraphViewType::is_multi_gpu && !update_major) { + minor_tmp_buffer = std::make_unique(handle, graph_view); + auto minor_init = init; + auto view = minor_tmp_buffer->view(); + if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer ma not + // store values for the entire minor rangey + minor_init = ReduceOp::identity_element; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; + } + fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); + } + + using edge_partition_minor_output_device_view_t = + std::conditional_tmutable_view().value_first())>, + void /* dummy */>; + std::conditional_t>, std::vector>, std::byte /* dummy */> @@ -1396,8 +1801,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::byte /* dummy */> value_vectors{}; if constexpr (update_major && std::is_same_v>) { - auto capacity = graph_view.number_of_local_edge_partitions() * - (graph_view.local_edge_partition_segment_offsets(0) ? max_segments : 1); + auto capacity = graph_view.local_edge_partition_segment_offsets(0) ? max_segments : 1; offset_vectors.reserve(capacity); value_vectors.reserve(capacity); @@ -1407,83 +1811,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // 7. 
initialize - - if constexpr (update_major) { // no vertices in the zero degree segment are visited - if constexpr (use_input_key) { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + - thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last), - vertex_value_output_first + - thrust::distance(sorted_unique_key_first, sorted_unique_key_last), - init); - } else { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + *((*segment_offsets).rbegin() + 1), - vertex_value_output_first + *((*segment_offsets).rbegin()), - init); - } - } - } else { - if constexpr (GraphViewType::is_multi_gpu) { - auto minor_init = init; - auto view = minor_tmp_buffer->view(); - if (view.keys()) { // defer applying the initial value to the end as minor_tmp_buffer may - // not store values for the entire minor range - minor_init = ReduceOp::identity_element; - } else { - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - minor_init = (major_comm_rank == 0) ? init : ReduceOp::identity_element; - } - fill_edge_minor_property(handle, graph_view, minor_tmp_buffer->mutable_view(), minor_init); - } else { - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first, - vertex_value_output_first + graph_view.local_vertex_partition_range_size(), - init); - } - } - - // 8. create key_segment_offset_vectors - - std::conditional_t>, - std::optional>>, - std::byte /* dummy */> - key_segment_offset_vectors{}; - if constexpr (GraphViewType::is_multi_gpu && update_major && - std::is_same_v>) { - if (graph_view.local_edge_partition_segment_offsets(0)) { - key_segment_offset_vectors = - std::vector>(graph_view.number_of_local_edge_partitions()); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - assert(graph_view.local_edge_partition_segment_offsets(i)); - } - } - } - if (stream_pool_indices) { handle.sync_stream(); } // 9. proces local edge partitions - auto edge_mask_view = graph_view.edge_mask_view(); - #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time1 = std::chrono::steady_clock::now(); #endif for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto subtime0 = std::chrono::steady_clock::now(); #endif auto edge_partition = @@ -1495,6 +1833,12 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, i) : thrust::nullopt; + auto edge_partition_stream_pool_indices = + stream_pool_indices + ? std::make_optional>( + (*stream_pool_indices).data() + ((i * max_segments) % (*stream_pool_indices).size()), + max_segments) + : std::nullopt; T major_init{}; T major_identity_element{}; @@ -1520,8 +1864,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); auto loop_stream = - stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) + edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) : handle.get_stream(); auto edge_partition_key_first = sorted_unique_key_first; @@ -1530,7 +1874,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::conditional_t>(0, loop_stream); #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto subtime1 = std::chrono::steady_clock::now(); #endif if constexpr (GraphViewType::is_multi_gpu && use_input_key) { @@ -1578,7 +1922,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto subtime2 = std::chrono::steady_clock::now(); #endif @@ -1600,10 +1944,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } RAFT_CUDA_TRY(cudaStreamSynchronize(loop_stream)); -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime3 = std::chrono::steady_clock::now(); -#endif edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -1634,205 +1974,42 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, output_buffer = edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); } } else { - output_buffer = vertex_value_output_first; + output_buffer = tmp_vertex_value_output_first; } - using segment_key_iterator_t = - std::conditional_t; - #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime4 = std::chrono::steady_clock::now(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime3 = std::chrono::steady_clock::now(); #endif - if (key_segment_offsets) { - static_assert(detail::num_sparse_segments_per_vertex_partition == 3); - - // FIXME: we may further improve performance by 1) individually tuning block sizes for - // different segments; and 2) adding one more segment for very high degree vertices and - // running segmented reduction - if (edge_partition.dcs_nzd_vertex_count()) { - auto exec_stream = - stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments) % (*stream_pool_indices).size()) - : handle.get_stream(); - - if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit - // every vertex in the hypersparse segment - thrust::fill(rmm::exec_policy(exec_stream), - output_buffer + (*key_segment_offsets)[3], - output_buffer + (*key_segment_offsets)[4], - major_init); - } - - auto segment_size = use_input_key - ? 
((*key_segment_offsets)[4] - (*key_segment_offsets)[3]) - : static_cast(*(edge_partition.dcs_nzd_vertex_count())); - if (segment_size > 0) { - raft::grid_1d_thread_t update_grid(segment_size, - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[3]; } - auto segment_key_first = edge_partition_key_first; - auto segment_key_last = edge_partition_key_last; - if constexpr (use_input_key) { - segment_key_first += (*key_segment_offsets)[3]; - segment_key_last += (*key_segment_offsets)[4]; - } else { - assert(segment_key_first == nullptr); - assert(segment_key_last == nullptr); - } - detail::per_v_transform_reduce_e_hypersparse - <<>>( - edge_partition, - segment_key_first, - segment_key_last, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - reduce_op, - pred_op); - } - } - if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]) { - auto exec_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; } - segment_key_iterator_t segment_key_first{}; - if constexpr (use_input_key) { - segment_key_first = edge_partition_key_first; - } else { - segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); - } - segment_key_first += (*key_segment_offsets)[2]; - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - segment_key_first, - segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]), - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - reduce_op, - pred_op); - } - if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - auto segment_output_buffer = output_buffer; - if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; } - segment_key_iterator_t segment_key_first{}; - if constexpr (use_input_key) { - segment_key_first = edge_partition_key_first; - } else { - segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); - } - segment_key_first += (*key_segment_offsets)[1]; - detail::per_v_transform_reduce_e_mid_degree - <<>>( - edge_partition, - segment_key_first, - segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]), - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - segment_output_buffer, - e_op, - major_init, - major_identity_element, - reduce_op, - pred_op); - } - if ((*key_segment_offsets)[1] > 0) { - auto exec_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()) - : handle.get_stream(); - raft::grid_1d_block_t update_grid((*key_segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - segment_key_iterator_t segment_key_first{}; - if constexpr (use_input_key) { - segment_key_first = edge_partition_key_first; - } else { - segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); - } - detail::per_v_transform_reduce_e_high_degree - <<>>( - edge_partition, - segment_key_first, - segment_key_first + (*key_segment_offsets)[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - major_identity_element, - reduce_op, - pred_op); - } - } else { - size_t num_keys{}; - if constexpr (use_input_key) { - num_keys = - static_cast(thrust::distance(edge_partition_key_first, edge_partition_key_last)); - } else { - num_keys = static_cast(edge_partition.major_range_size()); - } + bool process_local_edges = true; + if constexpr (filter_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + if (static_cast(i) == minor_comm_rank) { process_local_edges = false; } + } - if (num_keys > size_t{0}) { - raft::grid_1d_thread_t update_grid(num_keys, - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - segment_key_iterator_t segment_key_first{}; - if constexpr (use_input_key) { - segment_key_first = edge_partition_key_first; - } else { - segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); - } - detail::per_v_transform_reduce_e_low_degree - <<>>( - edge_partition, - segment_key_first, - segment_key_first + num_keys, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - reduce_op, - pred_op); - } + if (process_local_edges) { + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + edge_partition_key_first, + edge_partition_key_last, + edge_partition_src_value_input, + 
edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op, + key_segment_offsets, + edge_partition_stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime5 = std::chrono::steady_clock::now(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto subtime4 = std::chrono::steady_clock::now(); #endif if constexpr (GraphViewType::is_multi_gpu && update_major) { @@ -1840,28 +2017,31 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - if (key_segment_offsets && stream_pool_indices) { + if (key_segment_offsets && edge_partition_stream_pool_indices) { if (edge_partition.dcs_nzd_vertex_count()) { if ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments) % - (*stream_pool_indices).size()); + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]); auto segment_offset = (*key_segment_offsets)[3]; auto segment_size = (*key_segment_offsets)[4] - (*key_segment_offsets)[3]; if constexpr (std::is_same_v>) { - auto [offsets, values] = compute_offset_value_pairs( + auto [offsets, values] = gather_offset_value_pairs( minor_comm, major_buffer_first + segment_offset, major_buffer_first + (segment_offset + segment_size), static_cast(i), subgroup_size, init, + process_local_edges ? false : true, segment_stream); - offset_vectors[i * max_segments + 3] = std::move(offsets); - value_vectors[i * max_segments + 3] = std::move(values); + if (static_cast(i) == minor_comm_rank) { + offset_vectors[3] = std::move(offsets); + value_vectors[3] = std::move(values); + } } else { device_reduce(minor_comm, major_buffer_first + segment_offset, - vertex_value_output_first + segment_offset, + tmp_vertex_value_output_first + segment_offset, segment_size, ReduceOp::compatible_raft_comms_op, static_cast(i), @@ -1870,25 +2050,28 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()); + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]); auto segment_offset = (*key_segment_offsets)[2]; auto segment_size = (*key_segment_offsets)[3] - (*key_segment_offsets)[2]; if constexpr (std::is_same_v>) { - auto [offsets, values] = compute_offset_value_pairs( + auto [offsets, values] = gather_offset_value_pairs( minor_comm, major_buffer_first + segment_offset, major_buffer_first + (segment_offset + segment_size), static_cast(i), subgroup_size, init, + process_local_edges ? 
false : true, segment_stream); - offset_vectors[i * max_segments + 2] = std::move(offsets); - value_vectors[i * max_segments + 2] = std::move(values); + if (static_cast(i) == minor_comm_rank) { + offset_vectors[2] = std::move(offsets); + value_vectors[2] = std::move(values); + } } else { device_reduce(minor_comm, major_buffer_first + segment_offset, - vertex_value_output_first + segment_offset, + tmp_vertex_value_output_first + segment_offset, segment_size, ReduceOp::compatible_raft_comms_op, static_cast(i), @@ -1896,25 +2079,28 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()); + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]); auto segment_offset = (*key_segment_offsets)[1]; auto segment_size = (*key_segment_offsets)[2] - (*key_segment_offsets)[1]; if constexpr (std::is_same_v>) { - auto [offsets, values] = compute_offset_value_pairs( + auto [offsets, values] = gather_offset_value_pairs( minor_comm, major_buffer_first + segment_offset, major_buffer_first + (segment_offset + segment_size), static_cast(i), subgroup_size, init, + process_local_edges ? false : true, segment_stream); - offset_vectors[i * max_segments + 1] = std::move(offsets); - value_vectors[i * max_segments + 1] = std::move(values); + if (static_cast(i) == minor_comm_rank) { + offset_vectors[1] = std::move(offsets); + value_vectors[1] = std::move(values); + } } else { device_reduce(minor_comm, major_buffer_first + segment_offset, - vertex_value_output_first + segment_offset, + tmp_vertex_value_output_first + segment_offset, segment_size, ReduceOp::compatible_raft_comms_op, static_cast(i), @@ -1922,24 +2108,27 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } if ((*key_segment_offsets)[1] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()); - auto segment_size = (*key_segment_offsets)[1]; + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]); + auto segment_size = (*key_segment_offsets)[1]; if constexpr (std::is_same_v>) { auto [offsets, values] = - compute_offset_value_pairs(minor_comm, - major_buffer_first, - major_buffer_first + segment_size, - static_cast(i), - subgroup_size, - init, - segment_stream); - offset_vectors[i * max_segments] = std::move(offsets); - value_vectors[i * max_segments] = std::move(values); + gather_offset_value_pairs(minor_comm, + major_buffer_first, + major_buffer_first + segment_size, + static_cast(i), + subgroup_size, + init, + process_local_edges ? 
false : true, + segment_stream); + if (static_cast(i) == minor_comm_rank) { + offset_vectors[0] = std::move(offsets); + value_vectors[0] = std::move(values); + } } else { device_reduce(minor_comm, major_buffer_first, - vertex_value_output_first, + tmp_vertex_value_output_first, segment_size, ReduceOp::compatible_raft_comms_op, static_cast(i), @@ -1959,19 +2148,22 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } if constexpr (std::is_same_v>) { auto [offsets, values] = - compute_offset_value_pairs(minor_comm, - major_buffer_first, - major_buffer_first + reduction_size, - static_cast(i), - subgroup_size, - init, - handle.get_stream()); - offset_vectors[i] = std::move(offsets); - value_vectors[i] = std::move(values); + gather_offset_value_pairs(minor_comm, + major_buffer_first, + major_buffer_first + reduction_size, + static_cast(i), + subgroup_size, + init, + process_local_edges ? false : true, + handle.get_stream()); + if (static_cast(i) == minor_comm_rank) { + offset_vectors[0] = std::move(offsets); + value_vectors[0] = std::move(values); + } } else { device_reduce(minor_comm, major_buffer_first, - vertex_value_output_first, + tmp_vertex_value_output_first, reduction_size, ReduceOp::compatible_raft_comms_op, static_cast(i), @@ -1980,13 +2172,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - if constexpr (GraphViewType::is_multi_gpu && update_major && - std::is_same_v>) { - if (key_segment_offsets) { - (*key_segment_offset_vectors)[i] = std::move(*key_segment_offsets); - } - } - if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { handle.sync_stream_pool( *stream_pool_indices); // to prevent buffer over-write (this can happen @@ -1995,104 +2180,120 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime6 = std::chrono::steady_clock::now(); + auto subtime5 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; std::chrono::duration subdur1 = subtime2 - subtime1; std::chrono::duration subdur2 = subtime3 - subtime2; std::chrono::duration subdur3 = subtime4 - subtime3; std::chrono::duration subdur4 = subtime5 - subtime4; - std::chrono::duration subdur5 = subtime6 - subtime5; std::cout << "\t\t\tdetail::per_v i=" << i << " took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," - << subdur4.count() << "," << subdur5.count() << ")" << std::endl; + << subdur4.count() << ")" << std::endl; #endif } + + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time2 = std::chrono::steady_clock::now(); #endif - // 10. communication + // 10. 
scatter if constexpr (GraphViewType::is_multi_gpu && update_major && std::is_same_v>) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - if (key_segment_offset_vectors && stream_pool_indices) { - auto key_segment_offsets = (*key_segment_offset_vectors)[i]; + auto segment_offsets = + graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); + std::optional> edge_partition_stream_pool_indices{std::nullopt}; + if (segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + edge_partition_stream_pool_indices = std::vector(max_segments); + std::iota((*edge_partition_stream_pool_indices).begin(), + (*edge_partition_stream_pool_indices).end(), + size_t{0}); + } - if (edge_partition.dcs_nzd_vertex_count()) { - if (key_segment_offsets[4] - key_segment_offsets[3] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments) % - (*stream_pool_indices).size()); - auto segment_offset = key_segment_offsets[3]; - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i * max_segments + 3]), - std::move(value_vectors[i * max_segments + 3]), - vertex_value_output_first + segment_offset, - static_cast(i), - segment_stream); - } - } - if (key_segment_offsets[3] - key_segment_offsets[2] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 1) % - (*stream_pool_indices).size()); - auto segment_offset = key_segment_offsets[2]; - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i * max_segments + 2]), - std::move(value_vectors[i * max_segments + 2]), - vertex_value_output_first + segment_offset, - static_cast(i), - segment_stream); - } - if (key_segment_offsets[2] - key_segment_offsets[1] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 2) % - (*stream_pool_indices).size()); - auto segment_offset = key_segment_offsets[1]; - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i * max_segments + 1]), - std::move(value_vectors[i * max_segments + 1]), - vertex_value_output_first + segment_offset, - static_cast(i), - segment_stream); - } - if (key_segment_offsets[1] > 0) { - auto segment_stream = handle.get_stream_from_stream_pool((i * max_segments + 3) % - (*stream_pool_indices).size()); - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i * max_segments]), - std::move(value_vectors[i * max_segments]), - vertex_value_output_first, - static_cast(i), - segment_stream); - } - } else { - gather_offset_value_pairs_and_update_vertex_value_output( - minor_comm, - std::move(offset_vectors[i]), - std::move(value_vectors[i]), - vertex_value_output_first, - static_cast(i), + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(static_cast(minor_comm_rank))); + + std::optional> key_segment_offsets{std::nullopt}; + if (segment_offsets) { + if constexpr (use_input_key) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + edge_partition.major_range_first(), handle.get_stream()); + } 
else { + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); } } + + if (key_segment_offsets && edge_partition_stream_pool_indices) { + if (edge_partition.dcs_nzd_vertex_count()) { + if ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]); + auto segment_offset = (*key_segment_offsets)[3]; + thrust::scatter(rmm::exec_policy(segment_stream), + get_dataframe_buffer_begin(value_vectors[3]), + get_dataframe_buffer_end(value_vectors[3]), + offset_vectors[3].begin(), + tmp_vertex_value_output_first + segment_offset); + } + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]); + auto segment_offset = (*key_segment_offsets)[2]; + thrust::scatter(rmm::exec_policy(segment_stream), + get_dataframe_buffer_begin(value_vectors[2]), + get_dataframe_buffer_end(value_vectors[2]), + offset_vectors[2].begin(), + tmp_vertex_value_output_first + segment_offset); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]); + auto segment_offset = (*key_segment_offsets)[1]; + thrust::scatter(rmm::exec_policy(segment_stream), + get_dataframe_buffer_begin(value_vectors[1]), + get_dataframe_buffer_end(value_vectors[1]), + offset_vectors[1].begin(), + tmp_vertex_value_output_first + segment_offset); + } + if ((*key_segment_offsets)[1] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]); + thrust::scatter(rmm::exec_policy(segment_stream), + get_dataframe_buffer_begin(value_vectors[0]), + get_dataframe_buffer_end(value_vectors[0]), + offset_vectors[0].begin(), + tmp_vertex_value_output_first); + } + } else { + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(value_vectors[0]), + get_dataframe_buffer_end(value_vectors[0]), + offset_vectors[0].begin(), + tmp_vertex_value_output_first); + } } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time3 = std::chrono::steady_clock::now(); #endif - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + // 11. 
communication if constexpr (GraphViewType::is_multi_gpu && !update_major) { auto& comm = handle.get_comms(); @@ -2150,7 +2351,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, tx_buffer_first); device_reduce(major_comm, tx_buffer_first, - vertex_value_output_first, + tmp_vertex_value_output_first, static_cast( graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), ReduceOp::compatible_raft_comms_op, @@ -2171,7 +2372,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, minor_range_first; device_reduce(major_comm, view.value_first() + offset, - vertex_value_output_first, + tmp_vertex_value_output_first, static_cast( graph_view.vertex_partition_range_size(this_segment_vertex_partition_id)), ReduceOp::compatible_raft_comms_op, From bb75771924c96280746f169d26e835af22332f40 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 17 Aug 2024 13:08:18 -0700 Subject: [PATCH 025/126] add a utility to find iteator type in dataframe buffer --- .../cugraph/utilities/dataframe_buffer.hpp | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp index 55f3e8ac360..5d839d22fc5 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.hpp +++ b/cpp/include/cugraph/utilities/dataframe_buffer.hpp @@ -90,6 +90,34 @@ struct dataframe_buffer_type { template using dataframe_buffer_type_t = typename dataframe_buffer_type::type; +template +struct dataframe_buffer_iterator_type { + using type = typename rmm::device_uvector::iterator; +}; + +template +struct dataframe_buffer_iterator_type> { + using type = thrust::zip_iterator::iterator...>>; +}; + +template +using dataframe_buffer_iterator_type_t = typename dataframe_buffer_iterator_type::type; + +template +struct dataframe_buffer_const_iterator_type { + using type = typename rmm::device_uvector::const_iterator; +}; + +template +struct dataframe_buffer_const_iterator_type> { + using type = + thrust::zip_iterator::const_iterator...>>; +}; + +template +using dataframe_buffer_const_iterator_type_t = + typename dataframe_buffer_const_iterator_type::type; + template void reserve_dataframe_buffer(BufferType& buffer, size_t new_buffer_capacity, From cfce7bc188ce4220cf6bdc3b109ad4e1fd2e5d04 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 17 Aug 2024 13:10:44 -0700 Subject: [PATCH 026/126] minor performance tuning --- .../prims/detail/per_v_transform_reduce_e.cuh | 805 +++++++++--------- 1 file changed, 391 insertions(+), 414 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 0ad014fea96..9b2b46deb09 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -66,6 +66,9 @@ namespace cugraph { namespace detail { +// FIXME: on A6000 we got better performance with 128, need to tune on H100 (possibly due to wasting +// less computing power on processing high degree vertices, we may use different values for +// different kernels for exhaustive tuning) int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; template @@ -1029,10 +1032,8 @@ gather_offset_value_pairs(raft::comms::comms_t const& comm, rmm::device_uvector offsets(0, stream_view); auto values = allocate_dataframe_buffer(0, stream_view); if (keep_flags) { - auto copy_size = thrust::count_if(rmm::exec_policy(stream_view), - (*keep_flags).begin(), - (*keep_flags).end(), - thrust::identity{}); + 
auto copy_size = thrust::count( + rmm::exec_policy(stream_view), (*keep_flags).begin(), (*keep_flags).end(), true); offsets.resize(copy_size, stream_view); resize_dataframe_buffer(values, copy_size, stream_view); @@ -1053,7 +1054,12 @@ gather_offset_value_pairs(raft::comms::comms_t const& comm, std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); } - rmm::device_uvector rx_offsets(comm_rank == root ? (rx_displs.back() + rx_sizes.back()) : size_t{0}, stream_view); + // FIXME: calling the following two device_gatherv within device_group_start() and + // device_group_end() improves performance (approx. 5%) + // FIXME: or we can implement this in All-to-All after iteration over every edge partition + // FIXME: we may consdier optionally sending offsets in bitmaps + rmm::device_uvector rx_offsets( + comm_rank == root ? (rx_displs.back() + rx_sizes.back()) : size_t{0}, stream_view); device_gatherv(comm, offsets.begin(), rx_offsets.begin(), @@ -1065,8 +1071,8 @@ gather_offset_value_pairs(raft::comms::comms_t const& comm, offsets.resize(0, stream_view); offsets.shrink_to_fit(stream_view); - auto rx_values = - allocate_dataframe_buffer(comm_rank == root ? (rx_displs.back() + rx_sizes.back()) : size_t{0}, stream_view); + auto rx_values = allocate_dataframe_buffer( + comm_rank == root ? (rx_displs.back() + rx_sizes.back()) : size_t{0}, stream_view); device_gatherv(comm, get_dataframe_buffer_begin(values), get_dataframe_buffer_begin(rx_values), @@ -1271,6 +1277,7 @@ void per_v_transform_reduce_e_edge_partition( pred_op); } } else { + assert(!edge_partition_stream_pools); size_t num_keys{}; if constexpr (use_input_key) { num_keys = @@ -1307,7 +1314,7 @@ void per_v_transform_reduce_e_edge_partition( } } -#define PER_V_PERFORMANCE_MEASUREMENT 1 +#define PER_V_PERFORMANCE_MEASUREMENT 0 template > local_vertex_partition_segment_offsets{std::nullopt}; + { size_t partition_idx = 0; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); partition_idx = static_cast(minor_comm_rank); } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { - auto sorted_uniue_nzd_key_last = compute_key_lower_bound( - sorted_unique_key_first, - sorted_unique_key_last, - graph_view.local_vertex_partition_range_first() + *((*segment_offsets).rbegin() + 1), - handle.get_stream()); + local_vertex_partition_segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + } + + auto sorted_unique_nzd_key_last = sorted_unique_key_last; + if constexpr (use_input_key) { + if (local_vertex_partition_segment_offsets) { + auto sorted_uniue_nzd_key_last = + compute_key_lower_bound(sorted_unique_key_first, + sorted_unique_key_last, + graph_view.local_vertex_partition_range_first() + + *((*local_vertex_partition_segment_offsets).rbegin() + 1), + handle.get_stream()); } } @@ -1427,18 +1440,12 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, thrust::distance(sorted_unique_key_first, sorted_unique_key_last), init); } else { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) 
{ - thrust::fill(handle.get_thrust_policy(), - vertex_value_output_first + *((*segment_offsets).rbegin() + 1), - vertex_value_output_first + *((*segment_offsets).rbegin()), - init); + if (local_vertex_partition_segment_offsets) { + thrust::fill( + handle.get_thrust_policy(), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin() + 1), + vertex_value_output_first + *((*local_vertex_partition_segment_offsets).rbegin()), + init); } } } else { @@ -1480,11 +1487,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, *edge_mask_view, static_cast(minor_comm_rank)) : thrust::nullopt; - auto segment_offsets = - graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); - std::optional> edge_partition_stream_pool_indices{std::nullopt}; - if (segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { edge_partition_stream_pool_indices = std::vector(max_segments); std::iota((*edge_partition_stream_pool_indices).begin(), (*edge_partition_stream_pool_indices).end(), @@ -1492,24 +1496,25 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } std::optional> key_segment_offsets{std::nullopt}; - if (segment_offsets) { + if (local_vertex_partition_segment_offsets) { if constexpr (use_input_key) { key_segment_offsets = compute_key_segment_offsets( sorted_unique_key_first, sorted_unique_nzd_key_last, - raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), edge_partition.major_range_first(), handle.get_stream()); } else { - key_segment_offsets = std::vector((*segment_offsets).size()); - std::transform((*segment_offsets).begin(), - (*segment_offsets).end(), + key_segment_offsets = std::vector((*local_vertex_partition_segment_offsets).size()); + std::transform((*local_vertex_partition_segment_offsets).begin(), + (*local_vertex_partition_segment_offsets).end(), (*key_segment_offsets).begin(), [](vertex_t offset) { return static_cast(offset); }); } } - handle.sync_stream(); + if (edge_partition_stream_pool_indices) { handle.sync_stream(); } edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -1546,7 +1551,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, (*edge_partition_stream_pool_indices).size()) : std::nullopt); - handle.sync_stream_pool(*edge_partition_stream_pool_indices); + if (edge_partition_stream_pool_indices) { + handle.sync_stream_pool(*edge_partition_stream_pool_indices); + } auto num_tmp_keys = thrust::count( handle.get_thrust_policy(), @@ -1577,7 +1584,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, tmp_vertex_value_output_first = vertex_value_output_first; } - /* 4. compute subgroup_size */ + /* 4. compute subgroup_size (used to compute priority in device_gatherv) */ [[maybe_unused]] std::conditional_t>, @@ -1605,15 +1612,14 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); if (minor_comm_size > 1) { - auto const minor_comm_rank = minor_comm.get_rank(); - auto segment_offsets = - graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); size_t bool_size = - segment_offsets - ? 
*((*segment_offsets).rbegin() + 1) + local_vertex_partition_segment_offsets + ? *((*local_vertex_partition_segment_offsets).rbegin() + 1) : graph_view .local_vertex_partition_range_size(); // FIXME: if filtered, we can reduce bool_size + // FIXME: *sorted_unique_nzd_key_last - *sorted_unique_key_first could be smaller than + // bool_size by a non-negligible amount key_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, sorted_unique_nzd_key_last, @@ -1655,12 +1661,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - if ((graph_view.local_edge_partition_segment_offsets(0)) && - (handle.get_stream_pool_size() >= max_segments)) { - for (size_t i = 1; i < graph_view.number_of_local_edge_partitions(); ++i) { - assert(graph_view.local_edge_partition_segment_offsets(i)); - } - + if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); @@ -1668,6 +1669,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // peak memory requirement per loop is // update_major ? (use_input_key ? aggregate key list size : V) / comm_size * sizeof(T) : 0 // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) + // FIXME: what about offsets & values? size_t num_streams = std::min(static_cast(minor_comm_size) * max_segments, @@ -1723,47 +1725,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // 8. set-up temporary buffers - std::vector> major_tmp_buffers{}; - if constexpr (GraphViewType::is_multi_gpu && update_major) { - std::vector major_tmp_buffer_sizes(graph_view.number_of_local_edge_partitions(), - size_t{0}); - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - if constexpr (use_input_key) { - major_tmp_buffer_sizes = local_key_list_sizes; - } else { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - if (segment_offsets) { - major_tmp_buffer_sizes[i] = - *((*segment_offsets).rbegin() + 1); // exclude the zero degree segment - } else { - if constexpr (GraphViewType::is_storage_transposed) { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_dst_range_size(i); - } else { - major_tmp_buffer_sizes[i] = graph_view.local_edge_partition_src_range_size(i); - } - } - } - } - if (stream_pool_indices) { - auto num_concurrent_loops = (*stream_pool_indices).size() / max_segments; - major_tmp_buffers.reserve(num_concurrent_loops); - for (size_t i = 0; i < num_concurrent_loops; ++i) { - size_t max_size{0}; - for (size_t j = i; j < graph_view.number_of_local_edge_partitions(); - j += num_concurrent_loops) { - max_size = std::max(major_tmp_buffer_sizes[j], max_size); - } - major_tmp_buffers.push_back(allocate_dataframe_buffer(max_size, handle.get_stream())); - } - } else { - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer( - *std::max_element(major_tmp_buffer_sizes.begin(), major_tmp_buffer_sizes.end()), - handle.get_stream())); - } - } else { // dummy - major_tmp_buffers.reserve(1); - major_tmp_buffers.push_back(allocate_dataframe_buffer(size_t{0}, handle.get_stream())); + size_t num_concurrent_loops{1}; + if (stream_pool_indices) { + assert(((*stream_pool_indices).size() % max_segments) == 0); + num_concurrent_loops = (*stream_pool_indices).size() / max_segments; } using 
minor_tmp_buffer_type = std::conditional_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - auto edge_partition_stream_pool_indices = - stream_pool_indices - ? std::make_optional>( - (*stream_pool_indices).data() + ((i * max_segments) % (*stream_pool_indices).size()), - max_segments) - : std::nullopt; - - T major_init{}; - T major_identity_element{}; - if constexpr (update_major) { - if constexpr (std::is_same_v>) { // if any edge has a non-init value, one - // of the non-init values will be - // selected. - major_init = init; - major_identity_element = init; - } else { - major_init = ReduceOp::identity_element; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - major_init = (static_cast(i) == minor_comm_rank) ? init : ReduceOp::identity_element; - } else { - major_init = init; - } - major_identity_element = ReduceOp::identity_element; - } - } + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - auto loop_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) - : handle.get_stream(); - - auto edge_partition_key_first = sorted_unique_key_first; - auto edge_partition_key_last = sorted_unique_nzd_key_last; - auto edge_partition_key_buffer = allocate_optional_dataframe_buffer< - std::conditional_t>(0, - loop_stream); -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime1 = std::chrono::steady_clock::now(); -#endif + std::conditional_t>, + std::byte /* dummy */> + edge_partition_key_buffers{}; if constexpr (GraphViewType::is_multi_gpu && use_input_key) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - auto const minor_comm_rank = minor_comm.get_rank(); + edge_partition_key_buffers.reserve(loop_count); + } + std::vector>> key_segment_offset_vectors{}; + key_segment_offset_vectors.reserve(loop_count); + std::conditional_t>, + std::byte /* dummy */> + major_output_buffers{}; + if constexpr (GraphViewType::is_multi_gpu && update_major) { + major_output_buffers.reserve(loop_count); + } + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto edge_partition_key_first = sorted_unique_key_first; + auto edge_partition_key_last = sorted_unique_nzd_key_last; + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto edge_partition_key_buffer = allocate_dataframe_buffer( + minor_comm_size > 1 ? 
local_key_list_sizes[partition_idx] : size_t{0}, loop_stream); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); - resize_optional_dataframe_buffer( - edge_partition_key_buffer, local_key_list_sizes[i], loop_stream); - - if constexpr (try_bitmap) { - std::variant, decltype(sorted_unique_key_first)> - v_list{}; - if (use_bitmap_flags[i]) { - v_list = (static_cast(i) == minor_comm_rank) - ? raft::device_span((*key_list_bitmap).data(), - (*key_list_bitmap).size()) - : raft::device_span(static_cast(nullptr), - size_t{0}); + if constexpr (try_bitmap) { + std::variant, decltype(sorted_unique_key_first)> + v_list{}; + if (use_bitmap_flags[partition_idx]) { + v_list = (static_cast(partition_idx) == minor_comm_rank) + ? raft::device_span((*key_list_bitmap).data(), + (*key_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), + size_t{0}); + } else { + v_list = sorted_unique_key_first; + } + auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : edge_partition.major_range_size(); + device_bcast_vertex_list(minor_comm, + v_list, + get_dataframe_buffer_begin(edge_partition_key_buffer), + edge_partition.major_range_first(), + edge_partition.major_range_first() + bool_size, + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + loop_stream); } else { - v_list = sorted_unique_key_first; + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffer), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + loop_stream); } - auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) - : edge_partition.major_range_size(); - device_bcast_vertex_list(minor_comm, - v_list, - get_dataframe_buffer_begin(edge_partition_key_buffer), - edge_partition.major_range_first(), - edge_partition.major_range_first() + bool_size, - local_key_list_sizes[i], - static_cast(i), - loop_stream); - } else { - device_bcast(minor_comm, - sorted_unique_key_first, - get_dataframe_buffer_begin(edge_partition_key_buffer), - local_key_list_sizes[i], - static_cast(i), - loop_stream); } + edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); + edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); + edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffers[j]); + } - edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffer); - edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffer); + std::optional> key_segment_offsets{std::nullopt}; + if (segment_offsets) { + if constexpr (use_input_key) { + key_segment_offsets = compute_key_segment_offsets( + edge_partition_key_first, + edge_partition_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + edge_partition.major_range_first(), + loop_stream); + } else { + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); + } } - } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime2 = std::chrono::steady_clock::now(); -#endif + key_segment_offset_vectors.push_back(std::move(key_segment_offsets)); - std::optional> key_segment_offsets{std::nullopt}; - if (segment_offsets) { - if constexpr (use_input_key) { - key_segment_offsets = compute_key_segment_offsets( - edge_partition_key_first, - 
edge_partition_key_last, - raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), - edge_partition.major_range_first(), - loop_stream); - } else { - key_segment_offsets = std::vector((*segment_offsets).size()); - std::transform((*segment_offsets).begin(), - (*segment_offsets).end(), - (*key_segment_offsets).begin(), - [](vertex_t offset) { return static_cast(offset); }); + if constexpr (GraphViewType::is_multi_gpu && update_major) { + size_t buffer_size{0}; + if constexpr (use_input_key) { + buffer_size = local_key_list_sizes[partition_idx]; + } else { + buffer_size = segment_offsets + ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + } + major_output_buffers.emplace_back(buffer_size, loop_stream); } } - RAFT_CUDA_TRY(cudaStreamSynchronize(loop_stream)); + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + auto edge_partition_stream_pool_indices = + stream_pool_indices ? std::make_optional>( + (*stream_pool_indices).data() + j * max_segments, max_segments) + : std::nullopt; + + T major_init{}; + T major_identity_element{}; + if constexpr (update_major) { + if constexpr (std::is_same_v>) { // if any edge has a non-init value, one + // of the non-init values will be + // selected. + major_init = init; + major_identity_element = init; + } else { + major_init = ReduceOp::identity_element; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + major_init = (static_cast(partition_idx) == minor_comm_rank) + ? 
init + : ReduceOp::identity_element; + } else { + major_init = init; + } + major_identity_element = ReduceOp::identity_element; + } + } - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); - } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); + auto edge_partition_key_first = sorted_unique_key_first; + auto edge_partition_key_last = sorted_unique_nzd_key_last; + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); + edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffers[j]); + } + } - auto major_buffer_first = - get_dataframe_buffer_begin(major_tmp_buffers[i % major_tmp_buffers.size()]); + auto const& key_segment_offsets = key_segment_offset_vectors[j]; - std::conditional_t, - VertexValueOutputIterator> - output_buffer{}; - if constexpr (GraphViewType::is_multi_gpu) { - if constexpr (update_major) { - output_buffer = major_buffer_first; + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); } else { - output_buffer = edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + std::conditional_t, + edge_partition_minor_output_device_view_t>, + VertexValueOutputIterator> + output_buffer{}; + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (update_major) { + output_buffer = get_dataframe_buffer_begin(major_output_buffers[j]); + } else { + output_buffer = + edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); + } + } else { + output_buffer = tmp_vertex_value_output_first; } - } else { - output_buffer = tmp_vertex_value_output_first; - } - -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime3 = std::chrono::steady_clock::now(); -#endif - bool process_local_edges = true; - if constexpr (filter_input_key) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - if (static_cast(i) == minor_comm_rank) { process_local_edges = false; } - } - 
if (process_local_edges) { - per_v_transform_reduce_e_edge_partition( - handle, - edge_partition, - edge_partition_key_first, - edge_partition_key_last, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - major_identity_element, - reduce_op, - pred_op, - key_segment_offsets, - edge_partition_stream_pool_indices); - } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime4 = std::chrono::steady_clock::now(); -#endif + bool process_local_edges = true; + if constexpr (filter_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } + } - if constexpr (GraphViewType::is_multi_gpu && update_major) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); + if (process_local_edges) { + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + edge_partition_key_first, + edge_partition_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op, + key_segment_offsets, + edge_partition_stream_pool_indices); + } - if (key_segment_offsets && edge_partition_stream_pool_indices) { - if (edge_partition.dcs_nzd_vertex_count()) { - if ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0) { + if constexpr (GraphViewType::is_multi_gpu && update_major) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + if (key_segment_offsets && edge_partition_stream_pool_indices) { + if (edge_partition.dcs_nzd_vertex_count()) { + if ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]); + auto segment_offset = (*key_segment_offsets)[3]; + auto segment_size = (*key_segment_offsets)[4] - (*key_segment_offsets)[3]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = gather_offset_value_pairs( + minor_comm, + output_buffer + segment_offset, + output_buffer + (segment_offset + segment_size), + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges ? 
false : true, + segment_stream); + if (static_cast(partition_idx) == minor_comm_rank) { + offset_vectors[3] = std::move(offsets); + value_vectors[3] = std::move(values); + } + } else { + device_reduce(minor_comm, + output_buffer + segment_offset, + tmp_vertex_value_output_first + segment_offset, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(partition_idx), + segment_stream); + } + } + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]); - auto segment_offset = (*key_segment_offsets)[3]; - auto segment_size = (*key_segment_offsets)[4] - (*key_segment_offsets)[3]; + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]); + auto segment_offset = (*key_segment_offsets)[2]; + auto segment_size = (*key_segment_offsets)[3] - (*key_segment_offsets)[2]; if constexpr (std::is_same_v>) { - auto [offsets, values] = gather_offset_value_pairs( - minor_comm, - major_buffer_first + segment_offset, - major_buffer_first + (segment_offset + segment_size), - static_cast(i), - subgroup_size, - init, - process_local_edges ? false : true, - segment_stream); - if (static_cast(i) == minor_comm_rank) { - offset_vectors[3] = std::move(offsets); - value_vectors[3] = std::move(values); + auto [offsets, values] = + gather_offset_value_pairs(minor_comm, + output_buffer + segment_offset, + output_buffer + (segment_offset + segment_size), + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges ? false : true, + segment_stream); + if (static_cast(partition_idx) == minor_comm_rank) { + offset_vectors[2] = std::move(offsets); + value_vectors[2] = std::move(values); } } else { device_reduce(minor_comm, - major_buffer_first + segment_offset, + output_buffer + segment_offset, tmp_vertex_value_output_first + segment_offset, segment_size, ReduceOp::compatible_raft_comms_op, - static_cast(i), + static_cast(partition_idx), segment_stream); } } - } - if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]); - auto segment_offset = (*key_segment_offsets)[2]; - auto segment_size = (*key_segment_offsets)[3] - (*key_segment_offsets)[2]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = gather_offset_value_pairs( - minor_comm, - major_buffer_first + segment_offset, - major_buffer_first + (segment_offset + segment_size), - static_cast(i), - subgroup_size, - init, - process_local_edges ? false : true, - segment_stream); - if (static_cast(i) == minor_comm_rank) { - offset_vectors[2] = std::move(offsets); - value_vectors[2] = std::move(values); + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]); + auto segment_offset = (*key_segment_offsets)[1]; + auto segment_size = (*key_segment_offsets)[2] - (*key_segment_offsets)[1]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = + gather_offset_value_pairs(minor_comm, + output_buffer + segment_offset, + output_buffer + (segment_offset + segment_size), + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges ? 
false : true, + segment_stream); + if (static_cast(partition_idx) == minor_comm_rank) { + offset_vectors[1] = std::move(offsets); + value_vectors[1] = std::move(values); + } + } else { + device_reduce(minor_comm, + output_buffer + segment_offset, + tmp_vertex_value_output_first + segment_offset, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(partition_idx), + segment_stream); } - } else { - device_reduce(minor_comm, - major_buffer_first + segment_offset, - tmp_vertex_value_output_first + segment_offset, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - segment_stream); } - } - if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]); - auto segment_offset = (*key_segment_offsets)[1]; - auto segment_size = (*key_segment_offsets)[2] - (*key_segment_offsets)[1]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = gather_offset_value_pairs( - minor_comm, - major_buffer_first + segment_offset, - major_buffer_first + (segment_offset + segment_size), - static_cast(i), - subgroup_size, - init, - process_local_edges ? false : true, - segment_stream); - if (static_cast(i) == minor_comm_rank) { - offset_vectors[1] = std::move(offsets); - value_vectors[1] = std::move(values); + if ((*key_segment_offsets)[1] > 0) { + auto segment_stream = + handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]); + auto segment_size = (*key_segment_offsets)[1]; + if constexpr (std::is_same_v>) { + auto [offsets, values] = + gather_offset_value_pairs(minor_comm, + output_buffer, + output_buffer + segment_size, + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges ? false : true, + segment_stream); + if (static_cast(partition_idx) == minor_comm_rank) { + offset_vectors[0] = std::move(offsets); + value_vectors[0] = std::move(values); + } + } else { + device_reduce(minor_comm, + output_buffer, + tmp_vertex_value_output_first, + segment_size, + ReduceOp::compatible_raft_comms_op, + static_cast(partition_idx), + segment_stream); } + } + } else { + size_t reduction_size{}; + if constexpr (use_input_key) { + reduction_size = static_cast( + thrust::distance(edge_partition_key_first, edge_partition_key_last)); } else { - device_reduce(minor_comm, - major_buffer_first + segment_offset, - tmp_vertex_value_output_first + segment_offset, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - segment_stream); + reduction_size = static_cast( + key_segment_offsets + ? *((*key_segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size()); } - } - if ((*key_segment_offsets)[1] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]); - auto segment_size = (*key_segment_offsets)[1]; if constexpr (std::is_same_v>) { auto [offsets, values] = gather_offset_value_pairs(minor_comm, - major_buffer_first, - major_buffer_first + segment_size, - static_cast(i), + output_buffer, + output_buffer + reduction_size, + static_cast(partition_idx), subgroup_size, init, process_local_edges ? 
false : true, - segment_stream); - if (static_cast(i) == minor_comm_rank) { + handle.get_stream()); + if (static_cast(partition_idx) == minor_comm_rank) { offset_vectors[0] = std::move(offsets); value_vectors[0] = std::move(values); } } else { device_reduce(minor_comm, - major_buffer_first, + output_buffer, tmp_vertex_value_output_first, - segment_size, + reduction_size, ReduceOp::compatible_raft_comms_op, - static_cast(i), - segment_stream); + static_cast(partition_idx), + handle.get_stream()); } } - } else { - size_t reduction_size{}; - if constexpr (use_input_key) { - reduction_size = static_cast( - thrust::distance(edge_partition_key_first, edge_partition_key_last)); - } else { - reduction_size = static_cast( - segment_offsets - ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size()); - } - if constexpr (std::is_same_v>) { - auto [offsets, values] = - gather_offset_value_pairs(minor_comm, - major_buffer_first, - major_buffer_first + reduction_size, - static_cast(i), - subgroup_size, - init, - process_local_edges ? false : true, - handle.get_stream()); - if (static_cast(i) == minor_comm_rank) { - offset_vectors[0] = std::move(offsets); - value_vectors[0] = std::move(values); - } - } else { - device_reduce(minor_comm, - major_buffer_first, - tmp_vertex_value_output_first, - reduction_size, - ReduceOp::compatible_raft_comms_op, - static_cast(i), - handle.get_stream()); - } } } - - if (stream_pool_indices && ((i + 1) % major_tmp_buffers.size() == 0)) { - handle.sync_stream_pool( - *stream_pool_indices); // to prevent buffer over-write (this can happen - // as *segment_offsets do not necessarily coincide - // in different edge partitions). - } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime5 = std::chrono::steady_clock::now(); - std::chrono::duration subdur0 = subtime1 - subtime0; - std::chrono::duration subdur1 = subtime2 - subtime1; - std::chrono::duration subdur2 = subtime3 - subtime2; - std::chrono::duration subdur3 = subtime4 - subtime3; - std::chrono::duration subdur4 = subtime5 - subtime4; - std::cout << "\t\t\tdetail::per_v i=" << i << " took (" << subdur0.count() << "," - << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," - << subdur4.count() << ")" << std::endl; -#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } - - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time2 = std::chrono::steady_clock::now(); From 75d6151f1c6752001e0f62efb102c1b8243c9119 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 17 Aug 2024 22:04:15 -0700 Subject: [PATCH 027/126] delete unused code --- cpp/tests/utilities/test_graphs.hpp | 35 ----------------------------- 1 file changed, 35 deletions(-) diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 0a706d1cf80..484086cfc70 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -24,7 +24,6 @@ #include #include #include -#include // legacy coo_to_csr #include @@ -762,39 +761,5 @@ construct_graph(raft::handle_t const& handle, return std::make_tuple(std::move(graph), std::move(edge_weights), std::move(renumber_map)); } -namespace legacy { - -template -std::unique_ptr> construct_graph_csr( - raft::handle_t const& handle, input_usecase_t const& input_usecase, bool 
test_weighted) -{ - auto [d_src_v, d_dst_v, d_weight_v, d_vertices_v, is_symmetric] = - input_usecase.template construct_edgelist( - handle, test_weighted, false, false); - vertex_t num_vertices{}; // assuming that vertex IDs are non-negative consecutive integers - if (d_vertices_v) { - num_vertices = - max_element( - handle, raft::device_span((*d_vertices_v).data(), (*d_vertices_v).size())) + - 1; - } else { - num_vertices = - std::max( - max_element(handle, raft::device_span(d_src_v.data(), d_src_v.size())), - max_element(handle, raft::device_span(d_dst_v.data(), d_dst_v.size()))) + - 1; - } - - cugraph::legacy::GraphCOOView cooview( - d_src_v.data(), - d_dst_v.data(), - d_weight_v ? d_weight_v->data() : nullptr, - num_vertices, - static_cast(d_src_v.size())); - - return cugraph::coo_to_csr(cooview); -} - -} // namespace legacy } // namespace test } // namespace cugraph From 0f88988c0505342a0290243e14d757fc0cac636e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 17 Aug 2024 22:11:22 -0700 Subject: [PATCH 028/126] add an option to skip edge shuffling in R-mat edge list generation --- cpp/tests/utilities/test_graphs.hpp | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index 484086cfc70..f660fed89b1 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -233,7 +233,8 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { construct_edgelist(raft::handle_t const& handle, bool test_weighted, bool store_transposed, - bool multi_gpu) const + bool multi_gpu, + bool shuffle = true) const { CUGRAPH_EXPECTS( (size_t{1} << scale_) <= static_cast(std::numeric_limits::max()), @@ -329,7 +330,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { handle, std::move(tmp_src_v), std::move(tmp_dst_v), std::move(tmp_weights_v)); } - if (multi_gpu) { + if (multi_gpu && shuffle) { std::tie(store_transposed ? tmp_dst_v : tmp_src_v, store_transposed ? tmp_src_v : tmp_dst_v, tmp_weights_v, @@ -374,7 +375,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { translate(handle, vertex_v); - if (multi_gpu) { + if (multi_gpu && shuffle) { vertex_v = cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( handle, std::move(vertex_v)); } From 2efb51ebe6a2c3c208cedbd85cbabc142345fedd Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 19 Aug 2024 15:44:06 -0700 Subject: [PATCH 029/126] fix build error --- cpp/src/prims/detail/per_v_transform_reduce_e.cuh | 2 +- cpp/src/traversal/bfs_impl.cuh | 1 + 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 9b2b46deb09..44854a02164 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1889,7 +1889,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ : edge_partition.major_range_size(); } - major_output_buffers.emplace_back(buffer_size, loop_stream); + major_output_buffers.push_back(allocate_dataframe_buffer(buffer_size, loop_stream)); } } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 26b30d87c53..51315884fd9 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -211,6 +211,7 @@ void bfs(raft::handle_t const& handle, std::optional> out_degrees{std::nullopt}; std::optional> nzd_unvisited_vertices{std::nullopt}; if (direction_optimizing) { + // FIXME: if this becomes the main performance bottleneck, we may just approximate this. out_degrees = graph_view.compute_out_degrees(handle); nzd_unvisited_vertices = rmm::device_uvector( graph_view.local_vertex_partition_range_size(), handle.get_stream()); From 106a6ad21cf0724cb0331c330de676dd1e2e1e51 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 19 Aug 2024 15:44:44 -0700 Subject: [PATCH 030/126] fix documentation error --- cpp/include/cugraph/graph_functions.hpp | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index e1364f69991..171ab0722cf 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -1171,14 +1171,16 @@ shuffle_external_vertex_value_pairs(raft::handle_t const& handle, * @param edge_ids Optional list of edge ids * @param edge_types Optional list of edge types * @return Tuple of vectors storing edge sources, destinations, optional weights, - * optional edge ids, optional edge types mapped to this GPU. + * optional edge ids, optional edge types mapped to this GPU and a vector storing the + * number of edges received from each GPU. 
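 *
 * [Editor's note -- illustrative usage sketch, not part of the original patch; the
 * edge_weights/edge_ids/edge_types parameter names are assumptions inferred from the parameter
 * list documented above, and template arguments are left to deduction.]
 *
 *   auto [srcs, dsts, weights, ids, types, rx_counts] =
 *     cugraph::shuffle_external_edges(handle,
 *                                     std::move(edge_srcs),
 *                                     std::move(edge_dsts),
 *                                     std::move(edge_weights),
 *                                     std::move(edge_ids),
 *                                     std::move(edge_types));
 *   // rx_counts[i] holds the number of edges this GPU received from GPU i.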
*/ template std::tuple, rmm::device_uvector, std::optional>, std::optional>, - std::optional>> + std::optional>, + std::vector> shuffle_external_edges(raft::handle_t const& handle, rmm::device_uvector&& edge_srcs, rmm::device_uvector&& edge_dsts, From 98419cb79d00e43916087facfda557187870ab3c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 19 Aug 2024 15:45:09 -0700 Subject: [PATCH 031/126] add a query function --- cpp/tests/utilities/test_graphs.hpp | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index f660fed89b1..b7a91985658 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -391,6 +391,10 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { void set_edge_factor(size_t edge_factor) { edge_factor_ = edge_factor; } + bool undirected() const { + return undirected_; + } + private: size_t scale_{}; size_t edge_factor_{}; From 9625e0c71cdceddd44ece33011ac5892830ed209 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 20 Aug 2024 11:44:28 -0700 Subject: [PATCH 032/126] bug fix --- cpp/src/prims/detail/per_v_transform_reduce_e.cuh | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 44854a02164..a6d80be8557 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1160,7 +1160,8 @@ void per_v_transform_reduce_e_edge_partition( auto segment_key_last = edge_partition_key_last; if constexpr (use_input_key) { segment_key_first += (*key_segment_offsets)[3]; - segment_key_last += (*key_segment_offsets)[4]; + segment_key_last = + segment_key_first + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3]); } else { assert(segment_key_first == nullptr); assert(segment_key_last == nullptr); From 03cfe0d6eeaa01d097f682445282bfdca434f967 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 20 Aug 2024 16:50:18 -0700 Subject: [PATCH 033/126] bug fix --- .../prims/detail/per_v_transform_reduce_e.cuh | 30 ++++++++++--------- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index a6d80be8557..eeaf53182b0 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -875,22 +875,23 @@ __global__ static void per_v_transform_reduce_e_high_degree( } } -template -__host__ __device__ int rank_to_priority( +template +__host__ __device__ priority_t rank_to_priority( int rank, int root, int subgroup_size /* faster interconnect within a subgroup */, int comm_size, vertex_t offset /* to evenly distribute traffic */) { + assert(comm_size <= std::numeric_limits::max()); if (rank == root) { // no need for communication (priority 0) - return int{0}; + return priority_t{0}; } else if (rank / subgroup_size == root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in // [1, subgroup_size) int modulo = subgroup_size - 1; auto rank_dist = (rank + subgroup_size - root) % subgroup_size; - return 1 + ((rank_dist - 1) + (offset % modulo)) % modulo; + return static_cast(1 + ((rank_dist - 1) + (offset % modulo)) % modulo); } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) int modulo = comm_size - subgroup_size; auto subgroup_dist = @@ -898,27 +899,27 @@ 
__host__ __device__ int rank_to_priority( (comm_size / subgroup_size); auto intra_subgroup_rank_dist = ((rank % subgroup_size) + subgroup_size - (root % subgroup_size)) % subgroup_size; - return subgroup_size + + return static_cast(subgroup_size + ((subgroup_dist * subgroup_size + intra_subgroup_rank_dist - subgroup_size) + (offset % modulo)) % - modulo; + modulo); } } -template +template __host__ __device__ int priority_to_rank( - int priority, + priority_t priority, int root, int subgroup_size /* faster interconnect within a subgroup */, int comm_size, vertex_t offset /* to evenly distribute traffict */) { - if (priority == int{0}) { + if (priority == priority_t{0}) { return root; - } else if (priority < subgroup_size) { + } else if (priority < static_cast(subgroup_size)) { int modulo = subgroup_size - 1; auto rank_dist = 1 + (priority - 1 + modulo - (offset % modulo)) % modulo; - return (root + rank_dist) % subgroup_size; + return (root - (root % subgroup_size)) + ((root + rank_dist) % subgroup_size); } else { int modulo = comm_size - subgroup_size; auto rank_dist = @@ -963,7 +964,7 @@ std::optional> compute_keep_flags( [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { auto val = *(value_first + offset); return (val != init) - ? rank_to_priority( + ? rank_to_priority( comm_rank, root, subgroup_size, comm_size, static_cast(offset)) : std::numeric_limits::max(); // lowest priority }); @@ -987,8 +988,8 @@ std::optional> compute_keep_flags( [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { auto offset = thrust::get<0>(pair); auto priority = thrust::get<1>(pair); - auto rank = - priority_to_rank(priority, root, subgroup_size, comm_size, offset); + auto rank = (priority == std::numeric_limits::max()) ? comm_size : + priority_to_rank(priority, root, subgroup_size, comm_size, offset); return (rank == comm_rank); }); } @@ -1054,6 +1055,7 @@ gather_offset_value_pairs(raft::comms::comms_t const& comm, std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); } + // FIXME: should we gatherv offsets? Can't we figure this out from priorities??? // FIXME: calling the following two device_gatherv within device_group_start() and // device_group_end() improves performance (approx. 
5%) // FIXME: or we can implement this in All-to-All after iteration over every edge partition From c15305fb6db15213b2b96838923ee2f5456b3cdc Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 20 Aug 2024 21:09:36 -0700 Subject: [PATCH 034/126] bug fixes --- .../prims/detail/per_v_transform_reduce_e.cuh | 53 ++++++++++++------- 1 file changed, 33 insertions(+), 20 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index eeaf53182b0..28b7e6d3a1c 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -876,12 +876,12 @@ __global__ static void per_v_transform_reduce_e_high_degree( } template -__host__ __device__ priority_t rank_to_priority( - int rank, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - int comm_size, - vertex_t offset /* to evenly distribute traffic */) +__host__ __device__ priority_t +rank_to_priority(int rank, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffic */) { assert(comm_size <= std::numeric_limits::max()); if (rank == root) { // no need for communication (priority 0) @@ -889,20 +889,21 @@ __host__ __device__ priority_t rank_to_priority( } else if (rank / subgroup_size == root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in // [1, subgroup_size) - int modulo = subgroup_size - 1; auto rank_dist = (rank + subgroup_size - root) % subgroup_size; + assert((rank_dist > 0) && (rank_dist < subgroup_size)); + int modulo = subgroup_size - 1; return static_cast(1 + ((rank_dist - 1) + (offset % modulo)) % modulo); } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) - int modulo = comm_size - subgroup_size; auto subgroup_dist = ((rank / subgroup_size) + (comm_size / subgroup_size) - (root / subgroup_size)) % (comm_size / subgroup_size); auto intra_subgroup_rank_dist = ((rank % subgroup_size) + subgroup_size - (root % subgroup_size)) % subgroup_size; + auto rank_dist = subgroup_dist * subgroup_size + intra_subgroup_rank_dist; + int modulo = comm_size - subgroup_size; + assert((rankd_dist >= subgroup_size) && (rank_dist < (comm_size - (root % subgroup_size)); return static_cast(subgroup_size + - ((subgroup_dist * subgroup_size + intra_subgroup_rank_dist - subgroup_size) + - (offset % modulo)) % - modulo); + ((rank_dist - subgroup_size) + (offset % modulo)) % modulo); } } @@ -919,15 +920,17 @@ __host__ __device__ int priority_to_rank( } else if (priority < static_cast(subgroup_size)) { int modulo = subgroup_size - 1; auto rank_dist = 1 + (priority - 1 + modulo - (offset % modulo)) % modulo; + assert((rank_dist >= 1) && (rank_dist < subgroup_size)); return (root - (root % subgroup_size)) + ((root + rank_dist) % subgroup_size); } else { int modulo = comm_size - subgroup_size; auto rank_dist = subgroup_size + (priority - subgroup_size + modulo - (offset % modulo)) % modulo; + assert((rank_dist >= subgroup_size) && (rank_dist < comm_size)); auto subgroup_dist = rank_dist / subgroup_size; auto intra_subgroup_rank_dist = rank_dist % subgroup_size; - return ((root / subgroup_size + subgroup_dist) % (comm_size / subgroup_size)) * subgroup_size + - (root % subgroup_size + intra_subgroup_rank_dist) % subgroup_size; + return ((root / subgroup_size) * subgroup_size) + subgroup_dist * subgroup_size + + (root + intra_subgroup_rank_dist) % subgroup_size; } } 
@@ -988,8 +991,10 @@ std::optional> compute_keep_flags( [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { auto offset = thrust::get<0>(pair); auto priority = thrust::get<1>(pair); - auto rank = (priority == std::numeric_limits::max()) ? comm_size : - priority_to_rank(priority, root, subgroup_size, comm_size, offset); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); return (rank == comm_rank); }); } @@ -1596,14 +1601,22 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, subgroup_size{}; if constexpr (GraphViewType::is_multi_gpu && update_major && std::is_same_v>) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); int num_gpus_per_node{}; RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); - subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm - ? std::max(num_gpus_per_node / minor_comm_size, int{1}) - : std::min(minor_comm_size, num_gpus_per_node); + if (comm_size <= num_gpus_per_node) { + subgroup_size = comm_size; + } else { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::max(num_gpus_per_node / major_comm_size, int{1}) + : std::min(minor_comm_size, num_gpus_per_node); + } } // 5. compute optional bitmap info From 4a1f150e1d3ce8bcfd9c5ea97f6a1c2b966b2d26 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 21 Aug 2024 00:01:37 -0700 Subject: [PATCH 035/126] bug fix --- .../prims/detail/per_v_transform_reduce_e.cuh | 64 +++++++++++++------ 1 file changed, 43 insertions(+), 21 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 28b7e6d3a1c..bf5e4c26998 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -883,27 +883,37 @@ rank_to_priority(int rank, int comm_size, vertex_t offset /* to evenly distribute traffic */) { - assert(comm_size <= std::numeric_limits::max()); - if (rank == root) { // no need for communication (priority 0) + using cast_t = + std::conditional_t, + int16_t, + std::conditional_t, + int32_t, + int64_t>>; // to prevent overflow (assuming that + // comm_size <= + // std::numeric_limits::max()) + if (rank == root) { // no need for communication (priority 0) return priority_t{0}; } else if (rank / subgroup_size == root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in // [1, subgroup_size) - auto rank_dist = (rank + subgroup_size - root) % subgroup_size; - assert((rank_dist > 0) && (rank_dist < subgroup_size)); + auto rank_dist = + static_cast(((static_cast(rank) + subgroup_size) - root) % subgroup_size); int modulo = subgroup_size - 1; - return static_cast(1 + ((rank_dist - 1) + (offset % modulo)) % modulo); + return static_cast(1 + (static_cast(rank_dist - 1) + (offset % modulo)) % + modulo); } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) auto subgroup_dist = - ((rank / subgroup_size) + (comm_size / subgroup_size) - (root / 
subgroup_size)) % - (comm_size / subgroup_size); - auto intra_subgroup_rank_dist = - ((rank % subgroup_size) + subgroup_size - (root % subgroup_size)) % subgroup_size; + static_cast(((static_cast(rank / subgroup_size) + (comm_size / subgroup_size)) - + (root / subgroup_size)) % + (comm_size / subgroup_size)); + auto intra_subgroup_rank_dist = static_cast( + ((static_cast(rank % subgroup_size) + subgroup_size) - (root % subgroup_size)) % + subgroup_size); auto rank_dist = subgroup_dist * subgroup_size + intra_subgroup_rank_dist; int modulo = comm_size - subgroup_size; - assert((rankd_dist >= subgroup_size) && (rank_dist < (comm_size - (root % subgroup_size)); - return static_cast(subgroup_size + - ((rank_dist - subgroup_size) + (offset % modulo)) % modulo); + return static_cast( + subgroup_size + + (static_cast(rank_dist - subgroup_size) + (offset % modulo)) % modulo); } } @@ -915,22 +925,34 @@ __host__ __device__ int priority_to_rank( int comm_size, vertex_t offset /* to evenly distribute traffict */) { + using cast_t = + std::conditional_t, + int16_t, + std::conditional_t, + int32_t, + int64_t>>; // to prevent overflow (assuming that + // comm_size <= + // std::numeric_limits::max()) if (priority == priority_t{0}) { return root; } else if (priority < static_cast(subgroup_size)) { int modulo = subgroup_size - 1; - auto rank_dist = 1 + (priority - 1 + modulo - (offset % modulo)) % modulo; - assert((rank_dist >= 1) && (rank_dist < subgroup_size)); - return (root - (root % subgroup_size)) + ((root + rank_dist) % subgroup_size); + auto rank_dist = static_cast( + 1 + ((static_cast(priority - 1) + modulo) - (offset % modulo)) % modulo); + return static_cast((root - (root % subgroup_size)) + + ((static_cast(root) + rank_dist) % subgroup_size)); } else { - int modulo = comm_size - subgroup_size; - auto rank_dist = - subgroup_size + (priority - subgroup_size + modulo - (offset % modulo)) % modulo; - assert((rank_dist >= subgroup_size) && (rank_dist < comm_size)); + int modulo = comm_size - subgroup_size; + auto rank_dist = static_cast( + subgroup_size + + ((static_cast(priority) - subgroup_size) + (modulo - (offset % modulo))) % modulo); auto subgroup_dist = rank_dist / subgroup_size; auto intra_subgroup_rank_dist = rank_dist % subgroup_size; - return ((root / subgroup_size) * subgroup_size) + subgroup_dist * subgroup_size + - (root + intra_subgroup_rank_dist) % subgroup_size; + return static_cast( + ((static_cast((root / subgroup_size) * subgroup_size) + + subgroup_dist * subgroup_size) + + (static_cast(root) + intra_subgroup_rank_dist) % subgroup_size) % + comm_size); } } From 29b6834362bfcbc824d31521843a15758925d0fe Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 21 Aug 2024 11:08:06 -0700 Subject: [PATCH 036/126] replace offset vector communication with local computing --- .../prims/detail/per_v_transform_reduce_e.cuh | 139 +++++++++++------- 1 file changed, 84 insertions(+), 55 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index bf5e4c26998..dce776971ee 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -956,16 +956,19 @@ __host__ __device__ int priority_to_rank( } } +// return selected ranks if root. +// otherwise, it is sufficient to just return bool flags indiciating whether this rank's values are +// selected or not. 
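// Keeping the full winning-rank vector on the root (rather than bool flags everywhere)
// is what allows the root to rebuild the gather offsets locally later on -- a
// thrust::sequence followed by a stable sort of those offsets by selected rank --
// instead of gatherv-ing an offset vector from every rank; non-root ranks still only
// need per-position flags to pack the values they won before the value gatherv.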
template -std::optional> compute_keep_flags( - raft::comms::comms_t const& comm, - ValueIterator value_first, - ValueIterator value_last, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - typename thrust::iterator_traits::value_type init, - bool ignore_local_values, - rmm::cuda_stream_view stream_view) +std::variant /* root */, std::optional>> +compute_selected_ranks(raft::comms::comms_t const& comm, + ValueIterator value_first, + ValueIterator value_last, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) { auto const comm_rank = comm.get_rank(); auto const comm_size = comm.get_size(); @@ -1001,15 +1004,14 @@ std::optional> compute_keep_flags( raft::comms::op_t::MIN, stream_view); - std::optional> keep_flags{std::nullopt}; - if (!ignore_local_values) { - keep_flags = rmm::device_uvector(priorities.size(), stream_view); + if (comm_rank == root) { + rmm::device_uvector selected_ranks(priorities.size(), stream_view); auto offset_priority_pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); thrust::transform(rmm::exec_policy(stream_view), offset_priority_pair_first, offset_priority_pair_first + priorities.size(), - (*keep_flags).begin(), + selected_ranks.begin(), [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { auto offset = thrust::get<0>(pair); auto priority = thrust::get<1>(pair); @@ -1017,11 +1019,31 @@ std::optional> compute_keep_flags( ? comm_size : priority_to_rank( priority, root, subgroup_size, comm_size, offset); - return (rank == comm_rank); + return rank; }); + return selected_ranks; + } else { + std::optional> keep_flags{std::nullopt}; + if (!ignore_local_values) { + keep_flags = rmm::device_uvector(priorities.size(), stream_view); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + thrust::transform(rmm::exec_policy(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + priorities.size(), + (*keep_flags).begin(), + [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + return (rank == comm_rank); + }); + } + return keep_flags; } - - return keep_flags; } template @@ -1041,65 +1063,63 @@ gather_offset_value_pairs(raft::comms::comms_t const& comm, auto const comm_rank = comm.get_rank(); auto const comm_size = comm.get_size(); - std::optional> keep_flags{std::nullopt}; + std::variant, std::optional>> + selected_ranks_or_flags{std::nullopt}; if (comm_size <= std::numeric_limits::max()) { // priority == uint8_t - keep_flags = compute_keep_flags( + selected_ranks_or_flags = compute_selected_ranks( comm, value_first, value_last, root, subgroup_size, init, ignore_local_values, stream_view); } #if 0 // FIXME: this should be enabled (currently, raft does not support allreduce on uint16_t). 
else if (comm_size <= std::numeric_limits::max()) { // priority == uint16_t - keep_flags = compute_keep_flags( - comm, value_first, value_last, root, subgroup_size, init, stream_view); + selected_ranks_or_flags = compute_selected_ranks( + comm, value_first, value_last, root, subgroup_size, init, ignore_local_values, stream_view); } #endif else { // priority_t == uint32_t - keep_flags = compute_keep_flags( + selected_ranks_or_flags = compute_selected_ranks( comm, value_first, value_last, root, subgroup_size, init, ignore_local_values, stream_view); } - rmm::device_uvector offsets(0, stream_view); auto values = allocate_dataframe_buffer(0, stream_view); - if (keep_flags) { - auto copy_size = thrust::count( - rmm::exec_policy(stream_view), (*keep_flags).begin(), (*keep_flags).end(), true); - - offsets.resize(copy_size, stream_view); - resize_dataframe_buffer(values, copy_size, stream_view); - auto offset_value_pair_first = - thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), value_first); - thrust::copy_if(rmm::exec_policy(stream_view), - offset_value_pair_first, - offset_value_pair_first + (*keep_flags).size(), - (*keep_flags).begin(), - thrust::make_zip_iterator(offsets.begin(), get_dataframe_buffer_begin(values)), - thrust::identity{}); + if (comm_rank == root) { + assert(selected_ranks_or_flags.index() == 0); + auto const& selected_ranks = std::get<0>(selected_ranks_or_flags); + if (!ignore_local_values) { + auto copy_size = thrust::count( + rmm::exec_policy(stream_view), selected_ranks.begin(), selected_ranks.end(), comm_rank); + resize_dataframe_buffer(values, copy_size, stream_view); + thrust::copy_if( + rmm::exec_policy(stream_view), + value_first, + value_first + selected_ranks.size(), + selected_ranks.begin(), + get_dataframe_buffer_begin(values), + is_equal_t{comm_rank}); + } + } else { + assert(selected_ranks_or_flags.index() == 1); + auto const& selected_flags = std::get<1>(selected_ranks_or_flags); + if (selected_flags) { + auto copy_size = thrust::count( + rmm::exec_policy(stream_view), (*selected_flags).begin(), (*selected_flags).end(), true); + resize_dataframe_buffer(values, copy_size, stream_view); + thrust::copy_if( + rmm::exec_policy(stream_view), + value_first, + value_first + (*selected_flags).size(), + (*selected_flags).begin(), + get_dataframe_buffer_begin(values), + thrust::identity{}); + } } - auto rx_sizes = host_scalar_gather(comm, offsets.size(), root, stream_view); + auto rx_sizes = host_scalar_gather(comm, size_dataframe_buffer(values), root, stream_view); std::vector rx_displs{}; if (comm_rank == root) { rx_displs.resize(rx_sizes.size()); std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); } - // FIXME: should we gatherv offsets? Can't we figure this out from priorities??? - // FIXME: calling the following two device_gatherv within device_group_start() and - // device_group_end() improves performance (approx. 5%) - // FIXME: or we can implement this in All-to-All after iteration over every edge partition - // FIXME: we may consdier optionally sending offsets in bitmaps - rmm::device_uvector rx_offsets( - comm_rank == root ? (rx_displs.back() + rx_sizes.back()) : size_t{0}, stream_view); - device_gatherv(comm, - offsets.begin(), - rx_offsets.begin(), - offsets.size(), - rx_sizes, - rx_displs, - root, - stream_view); - offsets.resize(0, stream_view); - offsets.shrink_to_fit(stream_view); - auto rx_values = allocate_dataframe_buffer( comm_rank == root ? 
(rx_displs.back() + rx_sizes.back()) : size_t{0}, stream_view); device_gatherv(comm, @@ -1113,6 +1133,15 @@ gather_offset_value_pairs(raft::comms::comms_t const& comm, resize_dataframe_buffer(values, 0, stream_view); shrink_to_fit_dataframe_buffer(values, stream_view); + rmm::device_uvector rx_offsets(0, stream_view); + if (comm_rank == root) { + auto& selected_ranks = std::get<0>(selected_ranks_or_flags); + rx_offsets.resize(selected_ranks.size(), stream_view); + thrust::sequence(rmm::exec_policy(stream_view), rx_offsets.begin(), rx_offsets.end(), vertex_t{0}); + thrust::stable_sort_by_key(rmm::exec_policy(stream_view), selected_ranks.begin(), selected_ranks.end(), rx_offsets.begin()); + rx_offsets.resize(rx_displs.back() + rx_sizes.back(), stream_view); + } + return std::make_tuple(std::move(rx_offsets), std::move(rx_values)); } From fcc75e08d65b3f112e95c80b506075bcb0268f0f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 21 Aug 2024 11:08:50 -0700 Subject: [PATCH 037/126] add tmp perf measurement code --- cpp/src/traversal/bfs_impl.cuh | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 51315884fd9..21f84d3cfbe 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -204,6 +204,10 @@ void bfs(raft::handle_t const& handle, thrust::fill(handle.get_thrust_policy(), output_first, output_first + n_sources, vertex_t{0}); // 3. update meta data for direction optimizing BFS +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep1 = std::chrono::steady_clock::now(); +#endif constexpr edge_t direction_optimizing_alpha = 14; constexpr vertex_t direction_optimizing_beta = 24; @@ -237,6 +241,10 @@ void bfs(raft::handle_t const& handle, } // 4. initialize BFS frontier +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep2 = std::chrono::steady_clock::now(); +#endif constexpr size_t bucket_idx_cur = 0; constexpr size_t bucket_idx_next = 1; @@ -254,6 +262,10 @@ void bfs(raft::handle_t const& handle, handle, graph_view); // this may mark some vertices visited in previous iterations as unvisited // (but this is OK as we check prev_dst_visited_flags first) fill_edge_dst_property(handle, graph_view, dst_visited_flags.mutable_view(), false); +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto prep3 = std::chrono::steady_clock::now(); +#endif fill_edge_dst_property(handle, graph_view, @@ -263,9 +275,13 @@ void bfs(raft::handle_t const& handle, true); #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto prep1 = std::chrono::steady_clock::now(); - std::chrono::duration dur = prep1 - prep0; - std::cout << "prep took " << dur.count() << " s." << std::endl; + auto prep4 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = prep1 - prep0; + std::chrono::duration dur1 = prep2 - prep1; + std::chrono::duration dur2 = prep3 - prep2; + std::chrono::duration dur3 = prep4 - prep3; + std::chrono::duration dur = prep4 - prep0; + std::cout << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." << std::endl; #endif // 4. 
BFS iteration @@ -334,7 +350,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur1 = topdown2 - topdown1; std::chrono::duration dur2 = topdown3 - topdown2; std::chrono::duration dur = topdown3 - topdown0; - std::cout << "topdown took " << dur.count() << " (" << dur0.count() << "," << dur1.count() + std::cout << "topdown (prim,vf,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ") s." << std::endl; #endif break; @@ -439,7 +455,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur4 = topdown5 - topdown4; std::chrono::duration dur5 = topdown6 - topdown5; std::chrono::duration dur = topdown6 - topdown0; - std::cout << "topdown took " << dur.count() << " (" << dur0.count() << "," << dur1.count() + std::cout << "topdown (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << "," << dur5.count() << ") s." << std::endl; #endif @@ -533,7 +549,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; std::chrono::duration dur = bottomup2 - bottomup0; - std::cout << "bottomup took " << dur.count() << " (" << dur0.count() << "," << dur1.count() + std::cout << "bottomup (prim+,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << ") s." << std::endl; #endif break; @@ -589,7 +605,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur3 = bottomup4 - bottomup3; std::chrono::duration dur4 = bottomup5 - bottomup4; std::chrono::duration dur = bottomup5 - bottomup0; - std::cout << "bottomup took " << dur.count() << " (" << dur0.count() << "," << dur1.count() + std::cout << "bottomup (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") s." << std::endl; #endif From 710eb88ea62d6a1c5a34413753435d5a195dab97 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 21 Aug 2024 11:09:25 -0700 Subject: [PATCH 038/126] map GPUs on minor_comm to consecutive GPUs --- cpp/include/cugraph/partition_manager.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp index 309b169e646..5951b6e9d77 100644 --- a/cpp/include/cugraph/partition_manager.hpp +++ b/cpp/include/cugraph/partition_manager.hpp @@ -42,7 +42,7 @@ class partition_manager { // partitioning along the major axis (major sub-communicator is responsible for this) and along // the minor axis (minor sub-communicator is responsible for this). This variable controls whether // to map the major sub-communicator to the GPU row communicator or the GPU column communicator. 
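// Illustration (hypothetical 2 x 4 partitioning of 8 GPUs across 2 nodes, assuming the
// usual rank = row * num_cols + col layout): mapping the major sub-communicator to the
// GPU row communicator leaves a GPU's minor_comm peers strided across nodes, while
// mapping it to the GPU column communicator (the false setting below) makes the
// minor_comm peers the consecutive ranks 0-3 / 4-7, i.e. GPUs that typically share a
// node's fast intra-node interconnect (cf. the "map GPUs on minor_comm to consecutive
// GPUs" commit subject above).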
- static constexpr bool map_major_comm_to_gpu_row_comm = true; + static constexpr bool map_major_comm_to_gpu_row_comm = false; // FIXME: this is for benchmarking, reset to true before merging #ifdef __CUDACC__ __host__ __device__ From d040110a5cc2d8fba739fa7eb6f5fdc521826adf Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 22 Aug 2024 15:03:45 -0700 Subject: [PATCH 039/126] additional performance tuning --- .../detail/extract_transform_v_frontier_e.cuh | 562 +++++++++++------- .../prims/detail/per_v_transform_reduce_e.cuh | 99 ++- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 4 +- 3 files changed, 394 insertions(+), 271 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 1fe81198eb8..c11727a35c1 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -15,6 +15,7 @@ */ #pragma once +#include "prims/detail/multi_stream_utils.cuh" #include "prims/detail/optional_dataframe_buffer.hpp" #include "prims/detail/prim_functors.cuh" #include "prims/property_op_utils.cuh" @@ -574,6 +575,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, "Invalid input argument: frontier includes out-of-range keys."); } + [[maybe_unused]] constexpr auto max_segments = + detail::num_sparse_segments_per_vertex_partition + size_t{1}; + // 1. pre-process frontier data auto frontier_key_first = frontier.begin(); @@ -652,6 +656,78 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, }); } + // compute max_pushes + + std::vector max_push_counts{}; + { + size_t partition_idx{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + partition_idx = static_cast(minor_comm_rank); + } + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto frontier_major_first = + thrust_tuple_get_or_identity(frontier_key_first); + auto frontier_major_last = + thrust_tuple_get_or_identity(frontier_key_last); + // for an edge-masked graph, we can pass edge mask to compute tighter bound (at the expense of + // additional computing) + auto max_pushes = edge_partition.compute_number_of_edges( + frontier_major_first, frontier_major_last, handle.get_stream()); + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + max_push_counts = host_scalar_allgather(minor_comm, max_pushes, handle.get_stream()); + } else { + max_push_counts = {max_pushes}; + } + } + + // set-up stream ppol + + std::optional> stream_pool_indices{std::nullopt}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto partition_idx = static_cast(minor_comm_rank); + + if (graph_view.local_edge_partition_segment_offsets(partition_idx) && + (handle.get_stream_pool_size() >= max_segments)) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + + auto max_tmp_buffer_size = + (graph_view.compute_number_of_edges(handle) / comm_size) * sizeof(vertex_t); + + auto aggregate_major_range_size = host_scalar_allreduce( + comm, + static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + raft::comms::op_t::SUM, + handle.get_stream()); + auto 
aggregate_max_pushes = host_scalar_allreduce( + comm, max_push_counts[partition_idx], raft::comms::op_t::SUM, handle.get_stream()); + auto approx_tmp_buffer_size_per_edge_partition = + (aggregate_major_range_size / comm_size) * sizeof(key_t) + + (aggregate_max_pushes / comm_size) * (sizeof(output_key_t) + sizeof(output_value_t)); + + stream_pool_indices = init_stream_pool_indices(handle, + max_tmp_buffer_size, + approx_tmp_buffer_size_per_edge_partition, + graph_view.number_of_local_edge_partitions(), + max_segments); + } + } + + size_t num_concurrent_loops{1}; + if (stream_pool_indices) { + assert(((*stream_pool_indices).size() % max_segments) == 0); + num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + } + + if (stream_pool_indices) { handle.sync_stream(); } + // 2. fill the buffers std::vector> key_buffers{}; @@ -665,237 +741,297 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time1 = std::chrono::steady_clock::now(); #endif - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime0 = std::chrono::steady_clock::now(); -#endif - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(i)); - auto edge_partition_e_mask = - edge_mask_view - ? thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, i) - : thrust::nullopt; - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(i); - - auto edge_partition_frontier_key_first = frontier_key_first; - auto edge_partition_frontier_key_last = frontier_key_last; - auto edge_partition_frontier_key_buffer = - allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { + auto loop_count = + std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); + + std::conditional_t>, + std::byte /* dummy */> + edge_partition_key_buffers{}; + if constexpr (GraphViewType::is_multi_gpu) { edge_partition_key_buffers.reserve(loop_count); } + std::vector>> key_segment_offset_vectors{}; + key_segment_offset_vectors.reserve(loop_count); + std::vector> output_key_buffers{}; + output_key_buffers.reserve(loop_count); + std::vector> output_value_buffers{}; + output_value_buffers.reserve(loop_count); + std::vector> output_buffer_idx_scalars{}; + output_buffer_idx_scalars.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto edge_partition_frontier_key_first = frontier_key_first; + auto edge_partition_frontier_key_last = frontier_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); - - resize_dataframe_buffer( - edge_partition_frontier_key_buffer, local_frontier_sizes[i], handle.get_stream()); - - if constexpr (try_bitmap) { - std::variant, decltype(frontier_key_first)> v_list{}; - if (use_bitmap_flags[i]) { - v_list = (static_cast(i) == minor_comm_rank) - ? raft::device_span((*frontier_bitmap).data(), - (*frontier_bitmap).size()) - : raft::device_span(static_cast(nullptr), - size_t{0}); + auto const minor_comm_size = minor_comm.get_size(); + + if (minor_comm_size > 1) { + auto edge_partition_key_buffer = + allocate_dataframe_buffer(local_frontier_sizes[partition_idx], loop_stream); + if constexpr (try_bitmap) { + std::variant, decltype(frontier_key_first)> v_list{}; + if (use_bitmap_flags[partition_idx]) { + v_list = (static_cast(partition_idx) == minor_comm_rank) + ? raft::device_span((*frontier_bitmap).data(), + (*frontier_bitmap).size()) + : raft::device_span(static_cast(nullptr), + size_t{0}); + } else { + v_list = frontier_key_first; + } + auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) + : edge_partition.major_range_size(); + device_bcast_vertex_list(minor_comm, + v_list, + get_dataframe_buffer_begin(edge_partition_key_buffer), + edge_partition.major_range_first(), + edge_partition.major_range_first() + bool_size, + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + loop_stream); } else { - v_list = frontier_key_first; + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffer), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + loop_stream); } - auto bool_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) - : edge_partition.major_range_size(); - device_bcast_vertex_list(minor_comm, - v_list, - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - edge_partition.major_range_first(), - edge_partition.major_range_first() + bool_size, - local_frontier_sizes[i], - static_cast(i), - handle.get_stream()); - } else { - device_bcast(minor_comm, - frontier_key_first, - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer), - local_frontier_sizes[i], - static_cast(i), - handle.get_stream()); + edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); + edge_partition_frontier_key_first = + get_dataframe_buffer_begin(edge_partition_key_buffers[j]); + edge_partition_frontier_key_last = + get_dataframe_buffer_end(edge_partition_key_buffers[j]); } + } - edge_partition_frontier_key_first = - get_dataframe_buffer_begin(edge_partition_frontier_key_buffer); - edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_frontier_key_buffer); + std::optional> key_segment_offsets{std::nullopt}; + if (segment_offsets) { + auto edge_partition_frontier_major_first = + thrust_tuple_get_or_identity( + edge_partition_frontier_key_first); + auto edge_partition_frontier_major_last = + thrust_tuple_get_or_identity( + edge_partition_frontier_key_last); + key_segment_offsets = compute_key_segment_offsets( + edge_partition_frontier_major_first, + edge_partition_frontier_major_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + edge_partition.major_range_first(), + loop_stream); } - } + key_segment_offset_vectors.push_back(std::move(key_segment_offsets)); - auto edge_partition_frontier_major_first = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_first); - auto edge_partition_frontier_major_last = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_last); + auto edge_partition_max_pushes = max_push_counts[partition_idx]; - auto max_pushes = edge_partition.compute_number_of_edges( - edge_partition_frontier_major_first, edge_partition_frontier_major_last, handle.get_stream()); - - auto tmp_key_buffer = - allocate_optional_dataframe_buffer(max_pushes, handle.get_stream()); - auto tmp_value_buffer = - allocate_optional_dataframe_buffer(max_pushes, handle.get_stream()); - rmm::device_scalar tmp_buffer_idx(size_t{0}, handle.get_stream()); - - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, i); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, i); - edge_partition_dst_value_input = edge_partition_dst_input_device_view_t(edge_dst_value_input); + output_key_buffers.push_back( + allocate_optional_dataframe_buffer(edge_partition_max_pushes, loop_stream)); + output_value_buffers.push_back( + allocate_optional_dataframe_buffer(edge_partition_max_pushes, loop_stream)); + output_buffer_idx_scalars.push_back(rmm::device_scalar(size_t{0}, loop_stream)); } - auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, i); + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * 
num_concurrent_loops + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + auto edge_partition_stream_pool_indices = + stream_pool_indices ? std::make_optional>( + (*stream_pool_indices).data() + j * max_segments, max_segments) + : std::nullopt; + + auto edge_partition_frontier_key_first = frontier_key_first; + auto edge_partition_frontier_key_last = frontier_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + edge_partition_frontier_key_first = + get_dataframe_buffer_begin(edge_partition_key_buffers[j]); + edge_partition_frontier_key_last = + get_dataframe_buffer_end(edge_partition_key_buffers[j]); + } + } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime1 = std::chrono::steady_clock::now(); -#endif - if (segment_offsets) { - auto h_offsets = compute_key_segment_offsets( - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), - edge_partition.major_range_first(), - handle.get_stream()); + auto const& key_segment_offsets = key_segment_offset_vectors[j]; - // FIXME: we may further improve performance by 1) concurrently running kernels on different - // segments; 2) individually tuning block sizes for different segments; and 3) adding one - // more segment for very high degree vertices and running segmented reduction - if (h_offsets[1] > 0) { - raft::grid_1d_block_t update_grid(h_offsets[1], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_high_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first, - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); - } - if (h_offsets[2] - h_offsets[1] > 0) { - raft::grid_1d_warp_t update_grid(h_offsets[2] - h_offsets[1], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_mid_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[1], - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); - } - if (h_offsets[3] - h_offsets[2] > 0) { - raft::grid_1d_thread_t update_grid(h_offsets[3] - h_offsets[2], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[2], - edge_partition_frontier_key_first + h_offsets[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, 
- edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); - } - if (edge_partition.dcs_nzd_vertex_count() && (h_offsets[4] - h_offsets[3] > 0)) { - raft::grid_1d_thread_t update_grid(h_offsets[4] - h_offsets[3], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + h_offsets[3], - edge_partition_frontier_key_first + h_offsets[4], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); + auto& tmp_key_buffer = output_key_buffers[j]; + auto& tmp_value_buffer = output_value_buffers[j]; + auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; + + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); + } else { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); } - } else { - if (local_frontier_sizes[i] > 0) { - raft::grid_1d_thread_t update_grid(local_frontier_sizes[i], + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + if (key_segment_offsets) { + if ((*key_segment_offsets)[1] > 0) { + auto exec_stream = + edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) + : handle.get_stream(); + raft::grid_1d_block_t update_grid((*key_segment_offsets)[1], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_high_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_first + (*key_segment_offsets)[1], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_value_buffer), + tmp_buffer_idx.data(), + e_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = + edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first, - edge_partition_frontier_key_last, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); + extract_transform_v_frontier_e_mid_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[1], + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_value_buffer), + tmp_buffer_idx.data(), + e_op); + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { + auto exec_stream = + edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_value_buffer), + tmp_buffer_idx.data(), + e_op); + } + if (edge_partition.dcs_nzd_vertex_count() && + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0)) { + auto exec_stream = + edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[4] - (*key_segment_offsets)[3], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_frontier_key_first + (*key_segment_offsets)[4], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_value_buffer), + tmp_buffer_idx.data(), + e_op); + } + } else { + assert(!edge_partition_stream_pool_indices); + if (local_frontier_sizes[partition_idx] > 0) { + raft::grid_1d_thread_t update_grid(local_frontier_sizes[partition_idx], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(tmp_key_buffer), + get_optional_dataframe_buffer_begin(tmp_value_buffer), + tmp_buffer_idx.data(), + e_op); + } } } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime2 = std::chrono::steady_clock::now(); -#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto tmp_buffer_size = tmp_buffer_idx.value(handle.get_stream()); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); - resize_optional_dataframe_buffer( - tmp_key_buffer, tmp_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(tmp_key_buffer, handle.get_stream()); + auto& tmp_key_buffer = output_key_buffers[j]; + auto& tmp_value_buffer = output_value_buffers[j]; + auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; - resize_optional_dataframe_buffer( - tmp_value_buffer, tmp_buffer_size, handle.get_stream()); - shrink_to_fit_optional_dataframe_buffer(tmp_value_buffer, handle.get_stream()); + auto tmp_buffer_size = tmp_buffer_idx.value(loop_stream); - key_buffers.push_back(std::move(tmp_key_buffer)); - value_buffers.push_back(std::move(tmp_value_buffer)); + resize_optional_dataframe_buffer(tmp_key_buffer, tmp_buffer_size, loop_stream); + shrink_to_fit_optional_dataframe_buffer(tmp_key_buffer, loop_stream); -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto subtime3 = std::chrono::steady_clock::now(); - std::chrono::duration subdur0 = subtime1 - subtime0; - std::chrono::duration subdur1 = subtime2 - subtime1; - std::chrono::duration subdur2 = subtime3 - subtime2; - std::cout << "\t\t\tdetail::extract i=" << i << " took (" << subdur0.count() << "," - << subdur1.count() << "," << subdur2.count() << ")" << std::endl; -#endif + resize_optional_dataframe_buffer( + tmp_value_buffer, tmp_buffer_size, loop_stream); + shrink_to_fit_optional_dataframe_buffer(tmp_value_buffer, loop_stream); + + key_buffers.push_back(std::move(tmp_key_buffer)); + value_buffers.push_back(std::move(tmp_value_buffer)); + } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -906,7 +1042,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto key_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); auto value_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); - if (key_buffers.size() == 0) { + if (key_buffers.size() == 1) { key_buffer = std::move(key_buffers[0]); value_buffer = std::move(value_buffers[0]); } else { @@ -951,8 +1087,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\t\tdetail::extract took (" << dur0.count() << "," << dur1.count() << "," - << dur2.count() << ")" << std::endl; + std::cout << "\t\tdetail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() + << "," << dur2.count() << ")" << std::endl; #endif return std::make_tuple(std::move(key_buffer), std::move(value_buffer)); diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index dce776971ee..dd94fba319a 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -16,6 +16,7 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "prims/detail/multi_stream_utils.cuh" #include "prims/detail/optional_dataframe_buffer.hpp" #include "prims/detail/prim_functors.cuh" #include "prims/fill_edge_src_dst_property.cuh" @@ -38,7 +39,6 @@ #include #include #include -#include #include @@ -1088,13 +1088,12 @@ gather_offset_value_pairs(raft::comms::comms_t const& comm, auto copy_size = thrust::count( rmm::exec_policy(stream_view), 
selected_ranks.begin(), selected_ranks.end(), comm_rank); resize_dataframe_buffer(values, copy_size, stream_view); - thrust::copy_if( - rmm::exec_policy(stream_view), - value_first, - value_first + selected_ranks.size(), - selected_ranks.begin(), - get_dataframe_buffer_begin(values), - is_equal_t{comm_rank}); + thrust::copy_if(rmm::exec_policy(stream_view), + value_first, + value_first + selected_ranks.size(), + selected_ranks.begin(), + get_dataframe_buffer_begin(values), + is_equal_t{comm_rank}); } } else { assert(selected_ranks_or_flags.index() == 1); @@ -1103,13 +1102,12 @@ gather_offset_value_pairs(raft::comms::comms_t const& comm, auto copy_size = thrust::count( rmm::exec_policy(stream_view), (*selected_flags).begin(), (*selected_flags).end(), true); resize_dataframe_buffer(values, copy_size, stream_view); - thrust::copy_if( - rmm::exec_policy(stream_view), - value_first, - value_first + (*selected_flags).size(), - (*selected_flags).begin(), - get_dataframe_buffer_begin(values), - thrust::identity{}); + thrust::copy_if(rmm::exec_policy(stream_view), + value_first, + value_first + (*selected_flags).size(), + (*selected_flags).begin(), + get_dataframe_buffer_begin(values), + thrust::identity{}); } } @@ -1137,8 +1135,12 @@ gather_offset_value_pairs(raft::comms::comms_t const& comm, if (comm_rank == root) { auto& selected_ranks = std::get<0>(selected_ranks_or_flags); rx_offsets.resize(selected_ranks.size(), stream_view); - thrust::sequence(rmm::exec_policy(stream_view), rx_offsets.begin(), rx_offsets.end(), vertex_t{0}); - thrust::stable_sort_by_key(rmm::exec_policy(stream_view), selected_ranks.begin(), selected_ranks.end(), rx_offsets.begin()); + thrust::sequence( + rmm::exec_policy(stream_view), rx_offsets.begin(), rx_offsets.end(), vertex_t{0}); + thrust::stable_sort_by_key(rmm::exec_policy(stream_view), + selected_ranks.begin(), + selected_ranks.end(), + rx_offsets.begin()); rx_offsets.resize(rx_displs.back() + rx_sizes.back(), stream_view); } @@ -1729,42 +1731,33 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is - // update_major ? (use_input_key ? aggregate key list size : V) / comm_size * sizeof(T) : 0 - // and limit memory requirement to (E / comm_size) * sizeof(vertex_t) - // FIXME: what about offsets & values? 
+ auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); - size_t num_streams = - std::min(static_cast(minor_comm_size) * max_segments, - raft::round_down_safe(handle.get_stream_pool_size(), max_segments)); + auto max_tmp_buffer_size = + static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * + sizeof(vertex_t); + size_t approx_tmp_buffer_size_per_edge_partition{0}; if constexpr (update_major) { - size_t value_size{0}; - if constexpr (is_thrust_tuple_of_arithmetic::value) { - auto elem_sizes = compute_thrust_tuple_element_sizes{}(); - value_size = std::reduce(elem_sizes.begin(), elem_sizes.end()); - } else { - value_size = sizeof(T); - } size_t key_size{0}; if constexpr (use_input_key) { - if constexpr (std::is_same_v) { - key_size = sizeof(vertex_t); + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); } else { - key_size = sizeof(thrust::tuple_element<0, key_t>::type) + - sizeof(thrust::tuple_element<1, key_t>::type); + key_size = sum_thrust_tuple_element_sizes(); } } - - auto num_edges = graph_view.compute_number_of_edges(handle); + size_t value_size{0}; + if constexpr (std::is_arithmetic_v) { + value_size = sizeof(T); + } else { + value_size = sum_thrust_tuple_element_sizes(); + } size_t aggregate_major_range_size{}; if constexpr (use_input_key) { aggregate_major_range_size = - host_scalar_allreduce(handle.get_comms(), + host_scalar_allreduce(comm, static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), raft::comms::op_t::SUM, @@ -1772,21 +1765,15 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } else { aggregate_major_range_size = graph_view.number_of_vertices(); } - num_streams = std::min( - static_cast( - (aggregate_major_range_size > 0 - ? (static_cast(num_edges) / static_cast(aggregate_major_range_size)) - : double{0}) * - (static_cast(sizeof(vertex_t)) / static_cast(value_size + key_size))) * - max_segments, - num_streams); + approx_tmp_buffer_size_per_edge_partition = + (aggregate_major_range_size / comm_size) * (key_size + value_size); } - if (num_streams >= max_segments) { - assert((num_streams % max_segments) == 0); - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - } + stream_pool_indices = init_stream_pool_indices(handle, + max_tmp_buffer_size, + approx_tmp_buffer_size_per_edge_partition, + graph_view.number_of_local_edge_partitions(), + max_segments); } } @@ -1856,7 +1843,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); std::conditional_t>, + std::vector>, std::byte /* dummy */> edge_partition_key_buffers{}; if constexpr (GraphViewType::is_multi_gpu && use_input_key) { diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index c85b8ceae1a..6efa9c6313f 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -319,8 +319,8 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\tprim took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" - << std::endl; + std::cout << "\tprim (fill,lreduce,greduce) took (" << dur0.count() << "," << 
dur1.count() << "," + << dur2.count() << ")" << std::endl; #endif if constexpr (!std::is_same_v) { From ca816dd801ec5300d1b3750b6dffe055a08a8564 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 22 Aug 2024 15:27:43 -0700 Subject: [PATCH 040/126] add a utility function --- cpp/src/prims/detail/multi_stream_utils.cuh | 54 +++++++++++++++++++++ 1 file changed, 54 insertions(+) create mode 100644 cpp/src/prims/detail/multi_stream_utils.cuh diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh new file mode 100644 index 00000000000..54d1fa0648b --- /dev/null +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include + +#include +#include + +#include +#include + +namespace cugraph { + +namespace detail { + +inline std::vector init_stream_pool_indices(raft::handle_t const& handle, + size_t max_tmp_buffer_size, + size_t approx_tmp_buffer_size_per_edge_partition, + size_t num_local_edge_partitions, + size_t num_streams_per_edge_partition) +{ + size_t num_streams = + std::min(num_local_edge_partitions * num_streams_per_edge_partition, + raft::round_down_safe(handle.get_stream_pool_size(), num_streams_per_edge_partition)); + + auto num_concurrent_loops = + (approx_tmp_buffer_size_per_edge_partition > 0) + ? 
std::max(max_tmp_buffer_size / approx_tmp_buffer_size_per_edge_partition, size_t{1}) + : num_local_edge_partitions; + num_streams = std::min(num_concurrent_loops * num_streams_per_edge_partition, num_streams); + + std::vector stream_pool_indices(num_streams); + std::iota(stream_pool_indices.begin(), stream_pool_indices.end(), size_t{0}); + + return stream_pool_indices; +} + +} // namespace detail + +} // namespace cugraph From 7712c3885063674aeafcbf300024432720af54ae Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 22 Aug 2024 15:42:32 -0700 Subject: [PATCH 041/126] fix build error --- .../detail/extract_transform_v_frontier_e.cuh | 31 ++++++++++++++++--- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index c11727a35c1..4ccdf73ab9b 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -708,9 +708,32 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, handle.get_stream()); auto aggregate_max_pushes = host_scalar_allreduce( comm, max_push_counts[partition_idx], raft::comms::op_t::SUM, handle.get_stream()); + + size_t key_size{0}; + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + size_t output_key_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_key_size = sizeof(output_key_t); + } else { + output_key_size = sum_thrust_tuple_element_sizes(); + } + } + size_t output_value_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_value_size = sizeof(output_value_t); + } else { + output_value_size = sum_thrust_tuple_element_sizes(); + } + } auto approx_tmp_buffer_size_per_edge_partition = - (aggregate_major_range_size / comm_size) * sizeof(key_t) + - (aggregate_max_pushes / comm_size) * (sizeof(output_key_t) + sizeof(output_value_t)); + (aggregate_major_range_size / comm_size) * key_size + + (aggregate_max_pushes / comm_size) * (output_key_size + output_value_size); stream_pool_indices = init_stream_pool_indices(handle, max_tmp_buffer_size, @@ -752,9 +775,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { edge_partition_key_buffers.reserve(loop_count); } std::vector>> key_segment_offset_vectors{}; key_segment_offset_vectors.reserve(loop_count); - std::vector> output_key_buffers{}; + std::vector> output_key_buffers{}; output_key_buffers.reserve(loop_count); - std::vector> output_value_buffers{}; + std::vector> output_value_buffers{}; output_value_buffers.reserve(loop_count); std::vector> output_buffer_idx_scalars{}; output_buffer_idx_scalars.reserve(loop_count); From 31a59554f9aae36a3e72456b98c4eb8041096569 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 22 Aug 2024 15:48:38 -0700 Subject: [PATCH 042/126] fix build error --- cpp/src/prims/detail/extract_transform_v_frontier_e.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 4ccdf73ab9b..d102e592df0 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -713,14 +713,14 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, if constexpr (std::is_arithmetic_v) { key_size 
= sizeof(key_t); } else { - key_size = sum_thrust_tuple_element_sizes(); + key_size = cugraph::sum_thrust_tuple_element_sizes(); } size_t output_key_size{0}; if constexpr (!std::is_same_v) { if constexpr (std::is_arithmetic_v) { output_key_size = sizeof(output_key_t); } else { - output_key_size = sum_thrust_tuple_element_sizes(); + output_key_size = cugraph::sum_thrust_tuple_element_sizes(); } } size_t output_value_size{0}; @@ -728,7 +728,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, if constexpr (std::is_arithmetic_v) { output_value_size = sizeof(output_value_t); } else { - output_value_size = sum_thrust_tuple_element_sizes(); + output_value_size = cugraph::sum_thrust_tuple_element_sizes(); } } auto approx_tmp_buffer_size_per_edge_partition = From ac33784ae20204a032bc58fbb8c542c5bfb86944 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 22 Aug 2024 18:59:42 -0700 Subject: [PATCH 043/126] bug fix --- .../detail/extract_transform_v_frontier_e.cuh | 40 +++++++++++-------- 1 file changed, 24 insertions(+), 16 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index d102e592df0..01b9ceec176 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -658,7 +658,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, // compute max_pushes - std::vector max_push_counts{}; + size_t max_pushes{}; { size_t partition_idx{}; if constexpr (GraphViewType::is_multi_gpu) { @@ -675,14 +675,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust_tuple_get_or_identity(frontier_key_last); // for an edge-masked graph, we can pass edge mask to compute tighter bound (at the expense of // additional computing) - auto max_pushes = edge_partition.compute_number_of_edges( + max_pushes = edge_partition.compute_number_of_edges( frontier_major_first, frontier_major_last, handle.get_stream()); - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - max_push_counts = host_scalar_allgather(minor_comm, max_pushes, handle.get_stream()); - } else { - max_push_counts = {max_pushes}; - } } // set-up stream ppol @@ -707,7 +701,12 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, raft::comms::op_t::SUM, handle.get_stream()); auto aggregate_max_pushes = host_scalar_allreduce( - comm, max_push_counts[partition_idx], raft::comms::op_t::SUM, handle.get_stream()); + comm, + max_pushes, + raft::comms::op_t::SUM, + handle.get_stream()); // this is approximate as we only consider local edges for + // [frontier_key_first, frontier_key_last), note that neighbor lists + // are partitioned if minor_comm_size > 1 size_t key_size{0}; if constexpr (std::is_arithmetic_v) { @@ -839,14 +838,15 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } + auto edge_partition_frontier_major_first = + thrust_tuple_get_or_identity( + edge_partition_frontier_key_first); + auto edge_partition_frontier_major_last = + thrust_tuple_get_or_identity( + edge_partition_frontier_key_last); + std::optional> key_segment_offsets{std::nullopt}; if (segment_offsets) { - auto edge_partition_frontier_major_first = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_first); - auto edge_partition_frontier_major_last = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_last); key_segment_offsets = compute_key_segment_offsets( 
edge_partition_frontier_major_first, edge_partition_frontier_major_last, @@ -856,7 +856,15 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } key_segment_offset_vectors.push_back(std::move(key_segment_offsets)); - auto edge_partition_max_pushes = max_push_counts[partition_idx]; + size_t edge_partition_max_pushes = max_pushes; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + if (static_cast(partition_idx) != minor_comm_rank) { + edge_partition_max_pushes = edge_partition.compute_number_of_edges( + edge_partition_frontier_major_first, edge_partition_frontier_major_last, loop_stream); + } + } output_key_buffers.push_back( allocate_optional_dataframe_buffer(edge_partition_max_pushes, loop_stream)); From 6d8c7ef709764cc81deb3cd052755b60161d40ac Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 22 Aug 2024 19:47:20 -0700 Subject: [PATCH 044/126] perf experiment --- .../detail/extract_transform_v_frontier_e.cuh | 42 ++++++++++--------- 1 file changed, 23 insertions(+), 19 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 01b9ceec176..1557b378bd9 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -613,24 +613,11 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } - std::vector local_frontier_sizes{}; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather( - minor_comm, - static_cast(thrust::distance(frontier_key_first, frontier_key_last)), - handle.get_stream()); - } else { - local_frontier_sizes = std::vector{static_cast( - static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; - } - // update frontier bitmap (used to reduce broadcast bandwidth size) std:: conditional_t>, std::byte /* dummy */> frontier_bitmap{}; - std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; if constexpr (try_bitmap) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); @@ -648,12 +635,6 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, graph_view.local_vertex_partition_range_first() + bool_size, handle.get_stream()); } - auto tmp_flags = host_scalar_allgather( - minor_comm, frontier_bitmap ? uint8_t{1} : uint8_t{0}, handle.get_stream()); - use_bitmap_flags.resize(tmp_flags.size()); - std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { - return flag == uint8_t{1}; - }); } // compute max_pushes @@ -679,6 +660,29 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, frontier_major_first, frontier_major_last, handle.get_stream()); } + // communication over minor_comm + + std::vector local_frontier_sizes{}; + std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + local_frontier_sizes = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + handle.get_stream()); + if constexpr (try_bitmap) { + auto tmp_flags = host_scalar_allgather( + minor_comm, frontier_bitmap ? 
uint8_t{1} : uint8_t{0}, handle.get_stream()); + use_bitmap_flags.resize(tmp_flags.size()); + std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { + return flag == uint8_t{1}; + }); + } + } else { + local_frontier_sizes = std::vector{static_cast( + static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; + } + // set-up stream ppol std::optional> stream_pool_indices{std::nullopt}; From 6bcdbe7c6d865e8665df88725d391c66838ba40f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 23 Aug 2024 13:26:21 -0700 Subject: [PATCH 045/126] perf measurement code update --- cpp/src/prims/detail/per_v_transform_reduce_e.cuh | 4 ++-- cpp/tests/traversal/mg_bfs_test.cpp | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index dd94fba319a..df4580d632c 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -2419,8 +2419,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; std::chrono::duration dur3 = time4 - time3; - std::cout << "\t\tdetail::per_v took (" << dur0.count() << "," << dur1.count() << "," - << dur2.count() << ")" << std::endl; + std::cout << "\t\tdetail::per_v (prep, ep, scatter, comm) took (" << dur0.count() << "," << dur1.count() << "," + << dur2.count() << "," << dur3.count() << ")" << std::endl; #endif } diff --git a/cpp/tests/traversal/mg_bfs_test.cpp b/cpp/tests/traversal/mg_bfs_test.cpp index c294c6d0091..810c62d5321 100644 --- a/cpp/tests/traversal/mg_bfs_test.cpp +++ b/cpp/tests/traversal/mg_bfs_test.cpp @@ -332,9 +332,9 @@ INSTANTIATE_TEST_SUITE_P( ::testing::Values( // enable correctness checks std::make_tuple(BFS_Usecase{0, false}, - cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)), + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true /* undirected */, false)), std::make_tuple(BFS_Usecase{0, true}, - cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, false, false)))); + cugraph::test::Rmat_Usecase(10, 16, 0.57, 0.19, 0.19, 0, true /* undirected */, false)))); INSTANTIATE_TEST_SUITE_P( rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with From 3a950a56e4e16778ec93ddf3301234ae53456e87 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 23 Aug 2024 15:25:14 -0700 Subject: [PATCH 046/126] rename [vertex_first, vertex_last) in fill|update_edge_src|dst_property to [sorted_unique_vertex_first,sorted_unique_vertex_last) and require them to be sorted. 
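
Callers that previously passed any set of distinct local vertices must now pass a sorted & unique range. A minimal caller-side sketch of the preparation this implies (illustrative only, not part of the diff below; assumes `vertices` is an rmm::device_uvector<vertex_t> holding this rank's possibly unsorted local vertex IDs, and that the usual graph_view / vertex_property_input_first / edge_src_property_output arguments are in scope):

    // Sort and deduplicate the local vertex list once, before calling the renamed primitives.
    thrust::sort(handle.get_thrust_policy(), vertices.begin(), vertices.end());
    vertices.resize(
      thrust::distance(vertices.begin(),
                       thrust::unique(handle.get_thrust_policy(), vertices.begin(), vertices.end())),
      handle.get_stream());

    // [vertices.begin(), vertices.end()) now satisfies the new
    // [sorted_unique_vertex_first, sorted_unique_vertex_last) requirement.
    update_edge_src_property(handle,
                             graph_view,
                             vertices.begin(),
                             vertices.end(),
                             vertex_property_input_first,
                             edge_src_property_output);

A sorted & unique range also keeps the per-rank vertex lists amenable to compact, range-bounded representations (e.g. the bitmap-based broadcast added in a later commit) without an extra sort inside these primitives.
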
--- cpp/src/prims/fill_edge_src_dst_property.cuh | 158 ++++++++------ .../prims/update_edge_src_dst_property.cuh | 198 ++++++++++-------- 2 files changed, 204 insertions(+), 152 deletions(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index a1c4000d806..7155ce23dbd 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -129,8 +129,8 @@ template void fill_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeMajorPropertyOutputWrapper edge_major_property_output, T input) { @@ -153,10 +153,10 @@ void fill_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(minor_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); + auto rx_counts = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); auto max_rx_size = std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); @@ -169,8 +169,12 @@ void fill_edge_major_property(raft::handle_t const& handle, edge_partition_device_view_t( graph_view.local_edge_partition_view(i)); - device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + rx_counts[i], + i, + handle.get_stream()); if (edge_partition_keys) { thrust::for_each( @@ -232,17 +236,18 @@ void fill_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [input, output_value_first = edge_partition_value_firsts[0]] __device__( auto v) { packed_bool_atomic_set(output_value_first, v, input); }); } else { auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -286,8 +291,8 @@ template void fill_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeMinorPropertyOutputWrapper edge_minor_property_output, T input) { @@ -307,10 +312,10 @@ void fill_edge_minor_property(raft::handle_t const& handle, auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); + auto rx_counts = host_scalar_allgather( + major_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), 
+ handle.get_stream()); auto max_rx_size = std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); @@ -332,8 +337,12 @@ void fill_edge_minor_property(raft::handle_t const& handle, // FIXME: we can optionally use bitmap for this broadcast // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(major_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + rx_counts[i], + i, + handle.get_stream()); if (edge_partition_keys) { thrust::for_each( @@ -395,18 +404,19 @@ void fill_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_src_range_size()); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [input, output_value_first = edge_partition_value_first] __device__(auto v) { fill_scalar_or_thrust_tuple(output_value_first, v, input); }); } else { auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_first); } } } @@ -452,8 +462,8 @@ void fill_edge_src_property(raft::handle_t const& handle, /** * @brief Fill graph edge source property values to the input value. * - * This version fills only a subset of graph edge source property values. [@p vertex_first, - * @p vertex_last) specifies the vertices to be filled. + * This version fills only a subset of graph edge source property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -462,10 +472,12 @@ void fill_edge_src_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled. - * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition - * assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value + * to be filled. v in [vertex_first, sorted_unique_vertex_last) should be sorted & distinct (and + * should belong to the vertex partition assigned to this process in multi-GPU), otherwise undefined + * behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to + * be filled. * @param edge_src_property_output edge_src_property_view_t class object to store source property * values (for the edge source assigned to this process in multi-GPU). * @param input Edge source property values will be set to @p input. 
@@ -477,8 +489,8 @@ template void fill_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeSrcValueOutputWrapper edge_src_property_output, T input, bool do_expensive_check = false) @@ -487,8 +499,8 @@ void fill_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -499,17 +511,25 @@ void fill_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } else { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_src_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_src_property_output, + input); } } @@ -553,8 +573,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, /** * @brief Fill graph edge destination property values to the input value. * - * This version fills only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices to be filled. + * This version fills only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices to be filled. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -564,10 +584,12 @@ void fill_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a value to be filled. - * v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex partition - * assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a value to be filled. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a value + * to be filled. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be sorted & + * distinct (and should belong to the vertex partition assigned to this process in multi-GPU), + * otherwise undefined behavior. 
+ * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a value to + * be filled. * @param edge_dst_property_output edge_dst_property_view_t class object to store destination * property values (for the edge destinations assigned to this process in multi-GPU). * @param input Edge destination property values will be set to @p input. @@ -579,8 +601,8 @@ template void fill_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, EdgeDstValueOutputWrapper edge_dst_property_output, T input, bool do_expensive_check = false) @@ -589,8 +611,8 @@ void fill_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -601,17 +623,25 @@ void fill_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { - detail::fill_edge_major_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_major_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } else { - detail::fill_edge_minor_property( - handle, graph_view, vertex_first, vertex_last, edge_dst_property_output, input); + detail::fill_edge_minor_property(handle, + graph_view, + sorted_unique_vertex_first, + sorted_unique_vertex_last, + edge_dst_property_output, + input); } } diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index 1bfdc23c66d..392e12420ad 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -265,8 +265,8 @@ template void update_edge_major_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeMajorPropertyOutputWrapper edge_major_property_output) { @@ -288,10 +288,10 @@ void update_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = - host_scalar_allgather(minor_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); + auto rx_counts = host_scalar_allgather( + minor_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); auto max_rx_size = std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); @@ 
-317,7 +317,7 @@ void update_edge_major_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -325,30 +325,36 @@ void update_edge_major_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - minor_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(minor_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + rx_counts[i], + i, + handle.get_stream()); device_bcast(minor_comm, rx_value_first, rx_value_first, @@ -420,20 +426,22 @@ void update_edge_major_property(raft::handle_t const& handle, assert(edge_partition_value_firsts.size() == size_t{1}); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_firsts[0]] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_firsts[0]); + auto val_first = + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_firsts[0]); } } } @@ -683,8 +691,8 @@ template void update_edge_minor_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, 
EdgeMinorPropertyOutputWrapper edge_minor_property_output) { @@ -706,10 +714,10 @@ void update_edge_minor_property(raft::handle_t const& handle, auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto rx_counts = - host_scalar_allgather(major_comm, - static_cast(thrust::distance(vertex_first, vertex_last)), - handle.get_stream()); + auto rx_counts = host_scalar_allgather( + major_comm, + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), + handle.get_stream()); auto max_rx_size = std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); @@ -741,7 +749,7 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_vertex_partition_view()); if constexpr (contains_packed_bool_element) { auto bool_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_property_input_first, vertex_partition] __device__(auto v) { auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); @@ -749,30 +757,36 @@ void update_edge_minor_property(raft::handle_t const& handle, *(vertex_property_input_first + packed_bool_offset(v_offset)) & packed_bool_mask(v_offset)); })); - pack_bools(handle, - bool_first, - bool_first + thrust::distance(vertex_first, vertex_last), - rx_value_first); + pack_bools( + handle, + bool_first, + bool_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + rx_value_first); } else { auto map_first = thrust::make_transform_iterator( - vertex_first, + sorted_unique_vertex_first, cuda::proclaim_return_type([vertex_partition] __device__(auto v) { return vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); })); // FIXME: this gather (and temporary buffer) is unnecessary if NCCL directly takes a // permutation iterator (and directly gathers to the internal buffer) - thrust::gather(handle.get_thrust_policy(), - map_first, - map_first + thrust::distance(vertex_first, vertex_last), - vertex_property_input_first, - rx_value_first); + thrust::gather( + handle.get_thrust_policy(), + map_first, + map_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + vertex_property_input_first, + rx_value_first); } } // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast( - major_comm, vertex_first, rx_vertices.begin(), rx_counts[i], i, handle.get_stream()); + device_bcast(major_comm, + sorted_unique_vertex_first, + rx_vertices.begin(), + rx_counts[i], + i, + handle.get_stream()); device_bcast(major_comm, rx_value_first, rx_value_first, @@ -844,20 +858,22 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_src_range_size()); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [vertex_property_input_first, output_value_first = edge_partition_value_first] __device__(auto v) { bool val = static_cast(*(vertex_property_input_first + v)); packed_bool_atomic_set(output_value_first, v, val); }); } else { - auto val_first = thrust::make_permutation_iterator(vertex_property_input_first, vertex_first); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + thrust::distance(vertex_first, vertex_last), - vertex_first, - edge_partition_value_first); + auto 
val_first = + thrust::make_permutation_iterator(vertex_property_input_first, sorted_unique_vertex_first); + thrust::scatter( + handle.get_thrust_policy(), + val_first, + val_first + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last), + sorted_unique_vertex_first, + edge_partition_value_first); } } } @@ -909,8 +925,9 @@ void update_edge_src_property(raft::handle_t const& handle, /** * @brief Update graph edge source property values from the input vertex property values. * - * This version updates only a subset of graph edge source property values. [@p vertex_first, @p - * vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge source property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -919,10 +936,12 @@ void update_edge_src_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). 
* `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -937,8 +956,8 @@ template void update_edge_src_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeSrcValueOutputWrapper edge_src_property_output, bool do_expensive_check = false) @@ -946,8 +965,8 @@ void update_edge_src_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -958,23 +977,23 @@ void update_edge_src_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } else { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_src_property_output); } @@ -1026,8 +1045,9 @@ void update_edge_dst_property(raft::handle_t const& handle, /** * @brief Update graph edge destination property values from the input vertex property values. * - * This version updates only a subset of graph edge destination property values. [@p vertex_first, - * @p vertex_last) specifies the vertices with new property values to be updated. + * This version updates only a subset of graph edge destination property values. [@p + * sorted_unique_vertex_first, @p sorted_unique_vertex_last) specifies the vertices with new + * property values to be updated. * * @tparam GraphViewType Type of the passed non-owning graph object. * @tparam VertexIterator Type of the iterator for vertex identifiers. @@ -1037,10 +1057,12 @@ void update_edge_dst_property(raft::handle_t const& handle, * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and * handles to various CUDA libraries) to run graph algorithms. * @param graph_view Non-owning graph object. - * @param vertex_first Iterator pointing to the first (inclusive) vertex with a new value to be - * updated. v in [vertex_first, vertex_last) should be distinct (and should belong to the vertex - * partition assigned to this process in multi-GPU), otherwise undefined behavior. - * @param vertex_last Iterator pointing to the last (exclusive) vertex with a new value. + * @param sorted_unique_vertex_first Iterator pointing to the first (inclusive) vertex with a new + * value to be updated. 
v in [sorted_unique_vertex_first, sorted_unique_vertex_last) should be + * sorted & distinct (and should belong to the vertex partition assigned to this process in + * multi-GPU), otherwise undefined behavior. + * @param sorted_unique_vertex_last Iterator pointing to the last (exclusive) vertex with a new + * value. * @param vertex_property_input_first Iterator pointing to the vertex property value for the first * (inclusive) vertex (of the vertex partition assigned to this process in multi-GPU). * `vertex_property_input_last` (exclusive) is deduced as @p vertex_property_input_first + @p @@ -1055,8 +1077,8 @@ template void update_edge_dst_property(raft::handle_t const& handle, GraphViewType const& graph_view, - VertexIterator vertex_first, - VertexIterator vertex_last, + VertexIterator sorted_unique_vertex_first, + VertexIterator sorted_unique_vertex_last, VertexPropertyInputIterator vertex_property_input_first, EdgeDstValueOutputWrapper edge_dst_property_output, bool do_expensive_check = false) @@ -1064,8 +1086,8 @@ void update_edge_dst_property(raft::handle_t const& handle, if (do_expensive_check) { auto num_invalids = thrust::count_if( handle.get_thrust_policy(), - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, [local_vertex_partition_range_first = graph_view.local_vertex_partition_range_first(), local_vertex_partition_range_last = graph_view.local_vertex_partition_range_last()] __device__(auto v) { @@ -1076,23 +1098,23 @@ void update_edge_dst_property(raft::handle_t const& handle, num_invalids = host_scalar_allreduce(comm, num_invalids, raft::comms::op_t::SUM, handle.get_stream()); } - CUGRAPH_EXPECTS( - num_invalids == 0, - "Invalid input argument: invalid or non-local vertices in [vertex_first, vertex_last)."); + CUGRAPH_EXPECTS(num_invalids == 0, + "Invalid input argument: invalid or non-local vertices in " + "[sorted_unique_vertex_first, sorted_unique_vertex_last)."); } if constexpr (GraphViewType::is_storage_transposed) { detail::update_edge_major_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } else { detail::update_edge_minor_property(handle, graph_view, - vertex_first, - vertex_last, + sorted_unique_vertex_first, + sorted_unique_vertex_last, vertex_property_input_first, edge_dst_property_output); } From d27a5e3325c46b4cdee9a4cabe4d8e3b8dc0e407 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 23 Aug 2024 17:39:07 -0700 Subject: [PATCH 047/126] update fill|update_edge_minor_property to optionally use bitmap to broadcast vertex list --- cpp/src/prims/fill_edge_src_dst_property.cuh | 112 ++++++++++----- .../prims/update_edge_src_dst_property.cuh | 131 ++++++++++++------ 2 files changed, 164 insertions(+), 79 deletions(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 7155ce23dbd..9f561b57ff7 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -15,6 +15,8 @@ */ #pragma once +#include "prims/vertex_frontier.cuh" + #include #include #include @@ -153,12 +155,12 @@ void fill_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = host_scalar_allgather( + auto local_v_list_sizes = host_scalar_allgather( minor_comm, static_cast(thrust::distance(sorted_unique_vertex_first, 
sorted_unique_vertex_last)), handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + auto max_rx_size = std::reduce( + local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); @@ -172,7 +174,7 @@ void fill_edge_major_property(raft::handle_t const& handle, device_bcast(minor_comm, sorted_unique_vertex_first, rx_vertices.begin(), - rx_counts[i], + local_v_list_sizes[i], i, handle.get_stream()); @@ -180,7 +182,7 @@ void fill_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), input, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -203,7 +205,7 @@ void fill_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), input, @@ -223,7 +225,7 @@ void fill_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), val_first, - val_first + rx_counts[i], + val_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -312,15 +314,41 @@ void fill_edge_minor_property(raft::handle_t const& handle, auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto rx_counts = host_scalar_allgather( - major_comm, - static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); + auto v_list_size = + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + std::array v_list_range = {vertex_t{0}, vertex_t{0}}; + if (v_list_size > 0) { + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_vertex_first, v_list_size] __device__(size_t i) { + return (i == 0) ? *sorted_unique_vertex_first + : (*(sorted_unique_vertex_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + handle.sync_stream(); + } + + auto v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + v_list_range[0], + v_list_range[1], + handle.get_stream()); + + std::vector use_bitmap_flags(major_comm_size, false); + { + auto tmp_flags = host_scalar_allgather( + major_comm, v_list_bitmap ? 
uint8_t{1} : uint8_t{0}, handle.get_stream()); + std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { + return flag == uint8_t{1}; }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); + } + auto local_v_list_sizes = host_scalar_allgather(major_comm, v_list_size, handle.get_stream()); + auto local_v_list_range_firsts = + host_scalar_allgather(major_comm, v_list_range[0], handle.get_stream()); + auto local_v_list_range_lasts = + host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -334,21 +362,33 @@ void fill_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_view(size_t{0})); auto edge_partition_keys = edge_minor_property_output.keys(); for (int i = 0; i < major_comm_size; ++i) { - // FIXME: we can optionally use bitmap for this broadcast + rmm::device_uvector rx_vertices(local_v_list_sizes[i], handle.get_stream()); // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast(major_comm, - sorted_unique_vertex_first, - rx_vertices.begin(), - rx_counts[i], - i, - handle.get_stream()); + std::variant, decltype(sorted_unique_vertex_first)> + v_list{}; + if (use_bitmap_flags[i]) { + v_list = + (i == major_comm_rank) + ? raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), size_t{0}); + } else { + v_list = sorted_unique_vertex_first; + } + device_bcast_vertex_list(major_comm, + v_list, + rx_vertices.begin(), + local_v_list_range_firsts[i], + local_v_list_range_lasts[i], + local_v_list_sizes[i], + i, + handle.get_stream()); if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), input, subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], @@ -370,18 +410,18 @@ void fill_edge_minor_property(raft::handle_t const& handle, }); } else { if constexpr (contains_packed_bool_element) { - thrust::for_each(handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), - [edge_partition, - rx_vertex_first = rx_vertices.begin(), - input, - output_value_first = edge_partition_value_first] __device__(auto i) { - auto rx_vertex = *(rx_vertex_first + i); - auto minor_offset = - edge_partition.minor_offset_from_minor_nocheck(rx_vertex); - fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); - }); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), + [edge_partition, + rx_vertex_first = rx_vertices.begin(), + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + auto rx_vertex = *(rx_vertex_first + i); + auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(rx_vertex); + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); } else { auto map_first = thrust::make_transform_iterator( rx_vertices.begin(), @@ -393,7 +433,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), val_first, - val_first + rx_counts[i], + val_first + 
local_v_list_sizes[i], map_first, edge_partition_value_first); } diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index 392e12420ad..2408dcb3d68 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -16,6 +16,7 @@ #pragma once #include "detail/graph_partition_utils.cuh" +#include "prims/vertex_frontier.cuh" #include #include @@ -288,12 +289,12 @@ void update_edge_major_property(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto rx_counts = host_scalar_allgather( + auto local_v_list_sizes = host_scalar_allgather( minor_comm, static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { + auto max_rx_size = std::reduce( + local_v_list_sizes.begin(), local_v_list_sizes.end(), size_t{0}, [](auto lhs, auto rhs) { return std::max(lhs, rhs); }); rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); @@ -352,13 +353,14 @@ void update_edge_major_property(raft::handle_t const& handle, device_bcast(minor_comm, sorted_unique_vertex_first, rx_vertices.begin(), - rx_counts[i], + local_v_list_sizes[i], i, handle.get_stream()); device_bcast(minor_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -366,7 +368,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, edge_partition_key_first = ((*edge_partition_keys)[i]).begin(), @@ -392,7 +394,7 @@ void update_edge_major_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -413,7 +415,7 @@ void update_edge_major_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_firsts[i]); } @@ -463,13 +465,11 @@ void update_edge_minor_property(raft::handle_t const& handle, auto edge_partition_value_first = edge_minor_property_output.value_first(); if constexpr (GraphViewType::is_multi_gpu) { - using vertex_t = typename GraphViewType::vertex_type; - using bcast_buffer_type = - decltype(allocate_dataframe_buffer< - std::conditional_t>( - size_t{0}, handle.get_stream())); + using vertex_t = typename GraphViewType::vertex_type; + using bcast_buffer_type = dataframe_buffer_type_t< + std::conditional_t>; auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -540,15 +540,17 @@ void update_edge_minor_property(raft::handle_t const& handle, *(graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets()); } } else { - std::vector rx_counts(major_comm_size, 
size_t{0}); + std::vector local_v_list_sizes(major_comm_size, size_t{0}); for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = compute_local_edge_partition_minor_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); - rx_counts[i] = graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); + local_v_list_sizes[i] = + graph_view.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector rx_displacements(major_comm_size, size_t{0}); - std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); + std::exclusive_scan( + local_v_list_sizes.begin(), local_v_list_sizes.end(), rx_displacements.begin(), size_t{0}); key_offsets_or_rx_displacements = std::move(rx_displacements); } @@ -714,22 +716,42 @@ void update_edge_minor_property(raft::handle_t const& handle, auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto rx_counts = host_scalar_allgather( - major_comm, - static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)), - handle.get_stream()); - auto max_rx_size = - std::reduce(rx_counts.begin(), rx_counts.end(), size_t{0}, [](auto lhs, auto rhs) { - return std::max(lhs, rhs); + auto v_list_size = + static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + std::array v_list_range = {vertex_t{0}, vertex_t{0}}; + if (v_list_size > 0) { + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_vertex_first, v_list_size] __device__(size_t i) { + return (i == 0) ? *sorted_unique_vertex_first + : (*(sorted_unique_vertex_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + handle.sync_stream(); + } + + auto v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + v_list_range[0], + v_list_range[1], + handle.get_stream()); + + std::vector use_bitmap_flags(major_comm_size, false); + { + auto tmp_flags = host_scalar_allgather( + major_comm, v_list_bitmap ? uint8_t{1} : uint8_t{0}, handle.get_stream()); + std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { + return flag == uint8_t{1}; }); - rmm::device_uvector rx_vertices(max_rx_size, handle.get_stream()); - auto rx_tmp_buffer = allocate_dataframe_buffer< - std::conditional_t>( - contains_packed_bool_element ? packed_bool_size(max_rx_size) : max_rx_size, - handle.get_stream()); - auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + } + + auto local_v_list_sizes = host_scalar_allgather(major_comm, v_list_size, handle.get_stream()); + auto local_v_list_range_firsts = + host_scalar_allgather(major_comm, v_list_range[0], handle.get_stream()); + auto local_v_list_range_lasts = + host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -743,6 +765,16 @@ void update_edge_minor_property(raft::handle_t const& handle, graph_view.local_edge_partition_view(size_t{0})); auto edge_partition_keys = edge_minor_property_output.keys(); for (int i = 0; i < major_comm_size; ++i) { + rmm::device_uvector rx_vertices(local_v_list_sizes[i], handle.get_stream()); + auto rx_tmp_buffer = allocate_dataframe_buffer< + std::conditional_t>( + contains_packed_bool_element ? 
packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], + handle.get_stream()); + auto rx_value_first = get_dataframe_buffer_begin(rx_tmp_buffer); + if (i == major_comm_rank) { auto vertex_partition = vertex_partition_device_view_t( @@ -781,16 +813,29 @@ void update_edge_minor_property(raft::handle_t const& handle, // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() - device_bcast(major_comm, - sorted_unique_vertex_first, - rx_vertices.begin(), - rx_counts[i], - i, - handle.get_stream()); + std::variant, decltype(sorted_unique_vertex_first)> + v_list{}; + if (use_bitmap_flags[i]) { + v_list = + (i == major_comm_rank) + ? raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), size_t{0}); + } else { + v_list = sorted_unique_vertex_first; + } + device_bcast_vertex_list(major_comm, + v_list, + rx_vertices.begin(), + local_v_list_range_firsts[i], + local_v_list_range_lasts[i], + local_v_list_sizes[i], + i, + handle.get_stream()); device_bcast(major_comm, rx_value_first, rx_value_first, - contains_packed_bool_element ? packed_bool_size(rx_counts[i]) : rx_counts[i], + contains_packed_bool_element ? packed_bool_size(local_v_list_sizes[i]) + : local_v_list_sizes[i], i, handle.get_stream()); @@ -798,7 +843,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_counts[i]), + thrust::make_counting_iterator(local_v_list_sizes[i]), [rx_vertex_first = rx_vertices.begin(), rx_value_first, subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], @@ -826,7 +871,7 @@ void update_edge_minor_property(raft::handle_t const& handle, thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(rx_counts[i])), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), [edge_partition, rx_vertex_first = rx_vertices.begin(), rx_value_first, @@ -847,7 +892,7 @@ void update_edge_minor_property(raft::handle_t const& handle, // directly scatters from the internal buffer) thrust::scatter(handle.get_thrust_policy(), rx_value_first, - rx_value_first + rx_counts[i], + rx_value_first + local_v_list_sizes[i], map_first, edge_partition_value_first); } From 97022f594d7006796e50b82c12672390bfcc11f5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 23 Aug 2024 17:46:25 -0700 Subject: [PATCH 048/126] add missing includes --- cpp/src/prims/vertex_frontier.cuh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index 7483907549c..70d9f16d6af 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -18,14 +18,19 @@ #include #include #include +#include #include +#include #include #include #include #include +#include +#include +#include #include #include #include From ecf76f843026687745949f8abae47ef8dadde72e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 24 Aug 2024 00:36:50 -0700 Subject: [PATCH 049/126] specialization for bool --- cpp/src/prims/fill_edge_src_dst_property.cuh | 170 ++++++++++++------- 1 file changed, 106 insertions(+), 64 deletions(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 9f561b57ff7..5f3fbbf8ede 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ 
b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -307,6 +307,13 @@ void fill_edge_minor_property(raft::handle_t const& handle, using edge_t = typename GraphViewType::edge_type; auto edge_partition_value_first = edge_minor_property_output.value_first(); + vertex_t minor_range_first{}; + if constexpr (GraphViewType::is_storage_transposed) { + minor_range_first = graph_view.local_edge_partition_src_range_first(); + } else { + minor_range_first = graph_view.local_edge_partition_dst_range_first(); + } + if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); auto const comm_rank = comm.get_rank(); @@ -328,6 +335,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, }); raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); handle.sync_stream(); + v_list_range[0] -= (v_list_range[0] - minor_range_first) % packed_bools_per_word(); } auto v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, @@ -357,85 +365,119 @@ void fill_edge_minor_property(raft::handle_t const& handle, key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); } - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(size_t{0})); auto edge_partition_keys = edge_minor_property_output.keys(); for (int i = 0; i < major_comm_size; ++i) { - rmm::device_uvector rx_vertices(local_v_list_sizes[i], handle.get_stream()); - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - std::variant, decltype(sorted_unique_vertex_first)> - v_list{}; - if (use_bitmap_flags[i]) { - v_list = - (i == major_comm_rank) - ? raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) - : raft::device_span(static_cast(nullptr), size_t{0}); - } else { - v_list = sorted_unique_vertex_first; - } - device_bcast_vertex_list(major_comm, - v_list, - rx_vertices.begin(), - local_v_list_range_firsts[i], - local_v_list_range_lasts[i], - local_v_list_sizes[i], - i, - handle.get_stream()); - - if (edge_partition_keys) { + if (is_packed_bool() && + !edge_partition_keys && use_bitmap_flags[i]) { + rmm::device_uvector rx_bitmap( + packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]), + handle.get_stream()); + device_bcast( + major_comm, + (i == major_comm_rank) ? 
(*v_list_bitmap).data() : static_cast(nullptr), + rx_bitmap.data(), + rx_bitmap.size(), + i, + handle.get_stream()); thrust::for_each( handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(local_v_list_sizes[i]), - [rx_vertex_first = rx_vertices.begin(), - input, - subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], - subrange_key_last = (*edge_partition_keys).begin() + (*key_offsets)[i + 1], - edge_partition_value_first = edge_partition_value_first, - subrange_start_offset = (*key_offsets)[i]] __device__(auto i) { - auto minor = *(rx_vertex_first + i); - auto it = - thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); - if ((it != subrange_key_last) && (*it == minor)) { - auto subrange_offset = thrust::distance(subrange_key_first, it); - if constexpr (contains_packed_bool_element) { - fill_scalar_or_thrust_tuple( - edge_partition_value_first, subrange_start_offset + subrange_offset, input); + thrust::make_counting_iterator(rx_bitmap.size()), + [input, + output_value_first = + edge_partition_value_first + + packed_bool_offset(local_v_list_range_firsts[i] - minor_range_first), + rx_bitmap = raft::device_span(rx_bitmap.data(), + rx_bitmap.size())] __device__(size_t i) { + if ((i == 0) || (i == (rx_bitmap.size() - 1))) { // first or last + cuda::atomic_ref word(*(output_value_first + i)); + if (input) { + word.fetch_or(rx_bitmap[i], cuda::std::memory_order_relaxed); + } else { + word.fetch_and(~rx_bitmap[i], cuda::std::memory_order_relaxed); + } + } else { + if (input) { + *(output_value_first + i) |= rx_bitmap[i]; } else { - *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + *(output_value_first + i) &= ~rx_bitmap[i]; } } }); } else { - if constexpr (contains_packed_bool_element) { + rmm::device_uvector rx_vertices(local_v_list_sizes[i], handle.get_stream()); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + std::variant, decltype(sorted_unique_vertex_first)> + v_list{}; + if (use_bitmap_flags[i]) { + v_list = + (i == major_comm_rank) + ? 
raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), size_t{0}); + } else { + v_list = sorted_unique_vertex_first; + } + device_bcast_vertex_list(major_comm, + v_list, + rx_vertices.begin(), + local_v_list_range_firsts[i], + local_v_list_range_lasts[i], + local_v_list_sizes[i], + i, + handle.get_stream()); + + if (edge_partition_keys) { thrust::for_each( handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), - [edge_partition, - rx_vertex_first = rx_vertices.begin(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[i]), + [rx_vertex_first = rx_vertices.begin(), input, - output_value_first = edge_partition_value_first] __device__(auto i) { - auto rx_vertex = *(rx_vertex_first + i); - auto minor_offset = edge_partition.minor_offset_from_minor_nocheck(rx_vertex); - fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], + subrange_key_last = (*edge_partition_keys).begin() + (*key_offsets)[i + 1], + edge_partition_value_first = edge_partition_value_first, + subrange_start_offset = (*key_offsets)[i]] __device__(auto i) { + auto minor = *(rx_vertex_first + i); + auto it = + thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); + if ((it != subrange_key_last) && (*it == minor)) { + auto subrange_offset = thrust::distance(subrange_key_first, it); + if constexpr (contains_packed_bool_element) { + fill_scalar_or_thrust_tuple( + edge_partition_value_first, subrange_start_offset + subrange_offset, input); + } else { + *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + } + } }); } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), - cuda::proclaim_return_type([edge_partition] __device__(auto v) { - return edge_partition.minor_offset_from_minor_nocheck(v); - })); - auto val_first = thrust::make_constant_iterator(input); - // FIXME: this scatter is unnecessary if NCCL directly takes a permutation iterator (and - // directly scatters from the internal buffer) - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + local_v_list_sizes[i], - map_first, - edge_partition_value_first); + if constexpr (contains_packed_bool_element) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), + [minor_range_first, + rx_vertex_first = rx_vertices.begin(), + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + auto rx_vertex = *(rx_vertex_first + i); + auto minor_offset = rx_vertex - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), + cuda::proclaim_return_type( + [minor_range_first] __device__(auto v) { return v - minor_range_first; })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + local_v_list_sizes[i], + map_first, + edge_partition_value_first); + } } } } From 350f17e305d0f42a04513322a4345fa3be93ad1b Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 27 Aug 2024 15:49:31 -0700 Subject: [PATCH 050/126] add asynchronous copy_if --- cpp/src/prims/detail/multi_stream_utils.cuh | 
44 +++++++++++++++++++++ cpp/src/prims/vertex_frontier.cuh | 30 +++++++------- 2 files changed, 60 insertions(+), 14 deletions(-) diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh index 54d1fa0648b..98c9acf6f03 100644 --- a/cpp/src/prims/detail/multi_stream_utils.cuh +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -19,6 +19,11 @@ #include #include +#include +#include +#include + +#include #include #include @@ -49,6 +54,45 @@ inline std::vector init_stream_pool_indices(raft::handle_t const& handle return stream_pool_indices; } +// this assumes that the caller already knows how many items will be copied. +template +void copy_if_nosync(InputIterator input_first, + InputIterator input_last, + FlagIterator flag_first, + OutputIterator output_first, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::copy_if_nosync relies on cub::DeviceSelect::Flagged which uses int for input " + "size, but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + rmm::device_scalar num_copied(stream_view); + + cub::DeviceSelect::Flagged(static_cast(nullptr), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + num_copied.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceSelect::Flagged(d_tmp_storage.data(), + tmp_storage_bytes, + input_first, + flag_first, + output_first, + num_copied.data(), + input_size, + stream_view); +} + } // namespace detail } // namespace cugraph diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index 70d9f16d6af..2f463a74db4 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -15,13 +15,15 @@ */ #pragma once +#include "prims/detail/multi_stream_utils.cuh" + #include #include #include #include -#include #include +#include #include #include @@ -184,19 +186,19 @@ void device_bcast_vertex_list( assert((comm.get_rank() != root) || (std::get<0>(v_list).size() == tmp_bitmap.size())); device_bcast( comm, std::get<0>(v_list).data(), tmp_bitmap.data(), tmp_bitmap.size(), root, stream_view); - thrust::copy_if(rmm::exec_policy(stream_view), - thrust::make_counting_iterator(vertex_range_first), - thrust::make_counting_iterator(vertex_range_last), - thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - cuda::proclaim_return_type( - [bitmap = raft::device_span( - tmp_bitmap.data(), tmp_bitmap.size())] __device__(vertex_t v_offset) { - return ((bitmap[packed_bool_offset(v_offset)] & - packed_bool_mask(v_offset)) != packed_bool_empty_mask()); - })), - output_v_first, - thrust::identity{}); + detail::copy_if_nosync( + thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + tmp_bitmap.data(), tmp_bitmap.size())] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & packed_bool_mask(v_offset)) != + packed_bool_empty_mask()); + })), + output_v_first, + stream_view); } else { device_bcast(comm, std::get<1>(v_list), output_v_first, v_list_size, root, stream_view); } From 
93f726ff048fef191498fabd5e52b2df4ff0a95e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 27 Aug 2024 15:50:27 -0700 Subject: [PATCH 051/126] fix implicit synchronization in multi-stream execution --- .../prims/detail/per_v_transform_reduce_e.cuh | 148 ++++++++++++++---- 1 file changed, 118 insertions(+), 30 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index df4580d632c..91167b5261e 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1375,7 +1375,7 @@ void per_v_transform_reduce_e_edge_partition( } } -#define PER_V_PERFORMANCE_MEASUREMENT 0 +#define PER_V_PERFORMANCE_MEASUREMENT 1 template edge_partition_key_buffers{}; if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + edge_partition_key_buffers.reserve(loop_count); - } - std::vector>> key_segment_offset_vectors{}; - key_segment_offset_vectors.reserve(loop_count); - std::conditional_t>, - std::byte /* dummy */> - major_output_buffers{}; - if constexpr (GraphViewType::is_multi_gpu && update_major) { - major_output_buffers.reserve(loop_count); - } - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - auto edge_partition_key_first = sorted_unique_key_first; - auto edge_partition_key_last = sorted_unique_nzd_key_last; - if constexpr (GraphViewType::is_multi_gpu && use_input_key) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); auto edge_partition_key_buffer = allocate_dataframe_buffer( minor_comm_size > 1 ? 
local_key_list_sizes[partition_idx] : size_t{0}, loop_stream); if (minor_comm_size > 1) { - auto const minor_comm_rank = minor_comm.get_rank(); - if constexpr (try_bitmap) { std::variant, decltype(sorted_unique_key_first)> v_list{}; @@ -1911,6 +1902,39 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); + } + } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime1 = std::chrono::steady_clock::now(); +#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime2 = std::chrono::steady_clock::now(); +#endif + + std::vector>> key_segment_offset_vectors{}; + key_segment_offset_vectors.reserve(loop_count); + std::conditional_t>, + std::byte /* dummy */> + major_output_buffers{}; + if constexpr (GraphViewType::is_multi_gpu && update_major) { + major_output_buffers.reserve(loop_count); + } + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto edge_partition_key_first = sorted_unique_key_first; + auto edge_partition_key_last = sorted_unique_nzd_key_last; + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffers[j]); } @@ -1918,6 +1942,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::optional> key_segment_offsets{std::nullopt}; if (segment_offsets) { if constexpr (use_input_key) { + // FIXME: compute_key_segment_offsets implicitly synchronizes to copy the results to host key_segment_offsets = compute_key_segment_offsets( edge_partition_key_first, edge_partition_key_last, @@ -1946,7 +1971,13 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, major_output_buffers.push_back(allocate_dataframe_buffer(buffer_size, loop_stream)); } } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime3 = std::chrono::steady_clock::now(); +#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime4 = std::chrono::steady_clock::now(); +#endif for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i * num_concurrent_loops + j; @@ -2061,12 +2092,52 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, key_segment_offsets, edge_partition_stream_pool_indices); } + } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime5 = std::chrono::steady_clock::now(); +#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime6 = std::chrono::steady_clock::now(); +#endif + + if constexpr (GraphViewType::is_multi_gpu && update_major) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_stream_pool_indices = + stream_pool_indices ? 
std::make_optional>( + (*stream_pool_indices).data() + j * max_segments, max_segments) + : std::nullopt; - if constexpr (GraphViewType::is_multi_gpu && update_major) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); + auto edge_partition_key_first = sorted_unique_key_first; + auto edge_partition_key_last = sorted_unique_nzd_key_last; + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); + edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffers[j]); + } + } + + bool process_local_edges = true; + if constexpr (filter_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } + } + + auto const& key_segment_offsets = key_segment_offset_vectors[j]; + auto output_buffer = get_dataframe_buffer_begin(major_output_buffers[j]); + if (key_segment_offsets && edge_partition_stream_pool_indices) { if (edge_partition.dcs_nzd_vertex_count()) { if ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0) { @@ -2222,7 +2293,24 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime7 = std::chrono::steady_clock::now(); +#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime8 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = subtime1 - subtime0; + std::chrono::duration subdur1 = subtime2 - subtime1; + std::chrono::duration subdur2 = subtime3 - subtime2; + std::chrono::duration subdur3 = subtime4 - subtime3; + std::chrono::duration subdur4 = subtime5 - subtime4; + std::chrono::duration subdur5 = subtime6 - subtime5; + std::chrono::duration subdur6 = subtime7 - subtime6; + std::chrono::duration subdur7 = subtime8 - subtime7; + std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() + << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," + << subdur6.count() << "," << subdur7.count() << ")" << std::endl; +#endif } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -2419,8 +2507,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; std::chrono::duration dur3 = time4 - time3; - std::cout << "\t\tdetail::per_v (prep, ep, scatter, comm) took (" << dur0.count() << "," << dur1.count() << "," - << dur2.count() << "," << dur3.count() << ")" << std::endl; + std::cout << "\t\tdetail::per_v (prep, ep, scatter, comm) took (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << ")" << std::endl; #endif } From d022c304adc35db12db651ffd87d92b3f6f880a2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 28 Aug 2024 12:18:13 -0700 Subject: [PATCH 052/126] fix implicit synchronizations for multi-stream execution --- cpp/src/prims/detail/multi_stream_utils.cuh | 43 +- 
.../prims/detail/per_v_transform_reduce_e.cuh | 738 ++++++++---------- cpp/src/prims/vertex_frontier.cuh | 12 +- 3 files changed, 367 insertions(+), 426 deletions(-) diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh index 98c9acf6f03..2f03a22bff5 100644 --- a/cpp/src/prims/detail/multi_stream_utils.cuh +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -19,9 +19,10 @@ #include #include + +#include #include #include -#include #include @@ -32,11 +33,12 @@ namespace cugraph { namespace detail { -inline std::vector init_stream_pool_indices(raft::handle_t const& handle, - size_t max_tmp_buffer_size, - size_t approx_tmp_buffer_size_per_edge_partition, - size_t num_local_edge_partitions, - size_t num_streams_per_edge_partition) +inline std::vector init_stream_pool_indices( + raft::handle_t const& handle, + size_t max_tmp_buffer_size, + size_t approx_tmp_buffer_size_per_edge_partition, + size_t num_local_edge_partitions, + size_t num_streams_per_edge_partition) { size_t num_streams = std::min(num_local_edge_partitions * num_streams_per_edge_partition, @@ -93,6 +95,35 @@ void copy_if_nosync(InputIterator input_first, stream_view); } +template +void count_nosync(InputIterator input_first, + InputIterator input_last, + raft::device_span count /* size = 1 */, + typename thrust::iterator_traits::value_type value, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::count_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + count.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, count.data(), input_size, stream_view); +} + } // namespace detail } // namespace cugraph diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 91167b5261e..be7b5874324 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -970,6 +970,7 @@ compute_selected_ranks(raft::comms::comms_t const& comm, bool ignore_local_values, rmm::cuda_stream_view stream_view) { + auto time0 = std::chrono::steady_clock::now(); auto const comm_rank = comm.get_rank(); auto const comm_size = comm.get_size(); @@ -980,13 +981,13 @@ compute_selected_ranks(raft::comms::comms_t const& comm, rmm::device_uvector priorities(thrust::distance(value_first, value_last), stream_view); if (ignore_local_values) { - thrust::fill(rmm::exec_policy(stream_view), + thrust::fill(rmm::exec_policy_nosync(stream_view), priorities.begin(), priorities.end(), std::numeric_limits::max()); } else { thrust::tabulate( - rmm::exec_policy(stream_view), + rmm::exec_policy_nosync(stream_view), priorities.begin(), priorities.end(), [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { @@ -997,6 +998,7 @@ compute_selected_ranks(raft::comms::comms_t const& comm, : std::numeric_limits::max(); // lowest priority }); } + auto time1 = std::chrono::steady_clock::now(); device_allreduce(comm, 
priorities.data(), priorities.data(), @@ -1004,11 +1006,12 @@ compute_selected_ranks(raft::comms::comms_t const& comm, raft::comms::op_t::MIN, stream_view); + auto time2 = std::chrono::steady_clock::now(); if (comm_rank == root) { rmm::device_uvector selected_ranks(priorities.size(), stream_view); auto offset_priority_pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); - thrust::transform(rmm::exec_policy(stream_view), + thrust::transform(rmm::exec_policy_nosync(stream_view), offset_priority_pair_first, offset_priority_pair_first + priorities.size(), selected_ranks.begin(), @@ -1021,6 +1024,12 @@ compute_selected_ranks(raft::comms::comms_t const& comm, priority, root, subgroup_size, comm_size, offset); return rank; }); + auto time3 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = time1 - time0; + std::chrono::duration dur1 = time2 - time1; + std::chrono::duration dur2 = time3 - time2; + std::cout << "root compute_selected_ranks dur=(" << dur0.count() << "," << dur1.count() << "," + << dur2.count() << ")." << std::endl; return selected_ranks; } else { std::optional> keep_flags{std::nullopt}; @@ -1028,7 +1037,7 @@ compute_selected_ranks(raft::comms::comms_t const& comm, keep_flags = rmm::device_uvector(priorities.size(), stream_view); auto offset_priority_pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); - thrust::transform(rmm::exec_policy(stream_view), + thrust::transform(rmm::exec_policy_nosync(stream_view), offset_priority_pair_first, offset_priority_pair_first + priorities.size(), (*keep_flags).begin(), @@ -1042,111 +1051,16 @@ compute_selected_ranks(raft::comms::comms_t const& comm, return (rank == comm_rank); }); } + auto time3 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = time1 - time0; + std::chrono::duration dur1 = time2 - time1; + std::chrono::duration dur2 = time3 - time2; + std::cout << "non-root compute_selected_ranks dur=(" << dur0.count() << "," << dur1.count() + << "," << dur2.count() << ")." << std::endl; return keep_flags; } } -template -std::tuple, - dataframe_buffer_type_t::value_type>> -gather_offset_value_pairs(raft::comms::comms_t const& comm, - ValueIterator value_first, - ValueIterator value_last, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - typename thrust::iterator_traits::value_type init, - bool ignore_local_values, // no valid value in [value_first, value_last) - rmm::cuda_stream_view stream_view) -{ - using value_t = typename thrust::iterator_traits::value_type; - - auto const comm_rank = comm.get_rank(); - auto const comm_size = comm.get_size(); - - std::variant, std::optional>> - selected_ranks_or_flags{std::nullopt}; - if (comm_size <= std::numeric_limits::max()) { // priority == uint8_t - selected_ranks_or_flags = compute_selected_ranks( - comm, value_first, value_last, root, subgroup_size, init, ignore_local_values, stream_view); - } -#if 0 // FIXME: this should be enabled (currently, raft does not support allreduce on uint16_t). 
- else if (comm_size <= std::numeric_limits::max()) { // priority == uint16_t - selected_ranks_or_flags = compute_selected_ranks( - comm, value_first, value_last, root, subgroup_size, init, ignore_local_values, stream_view); - } -#endif - else { // priority_t == uint32_t - selected_ranks_or_flags = compute_selected_ranks( - comm, value_first, value_last, root, subgroup_size, init, ignore_local_values, stream_view); - } - - auto values = allocate_dataframe_buffer(0, stream_view); - if (comm_rank == root) { - assert(selected_ranks_or_flags.index() == 0); - auto const& selected_ranks = std::get<0>(selected_ranks_or_flags); - if (!ignore_local_values) { - auto copy_size = thrust::count( - rmm::exec_policy(stream_view), selected_ranks.begin(), selected_ranks.end(), comm_rank); - resize_dataframe_buffer(values, copy_size, stream_view); - thrust::copy_if(rmm::exec_policy(stream_view), - value_first, - value_first + selected_ranks.size(), - selected_ranks.begin(), - get_dataframe_buffer_begin(values), - is_equal_t{comm_rank}); - } - } else { - assert(selected_ranks_or_flags.index() == 1); - auto const& selected_flags = std::get<1>(selected_ranks_or_flags); - if (selected_flags) { - auto copy_size = thrust::count( - rmm::exec_policy(stream_view), (*selected_flags).begin(), (*selected_flags).end(), true); - resize_dataframe_buffer(values, copy_size, stream_view); - thrust::copy_if(rmm::exec_policy(stream_view), - value_first, - value_first + (*selected_flags).size(), - (*selected_flags).begin(), - get_dataframe_buffer_begin(values), - thrust::identity{}); - } - } - - auto rx_sizes = host_scalar_gather(comm, size_dataframe_buffer(values), root, stream_view); - std::vector rx_displs{}; - if (comm_rank == root) { - rx_displs.resize(rx_sizes.size()); - std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); - } - - auto rx_values = allocate_dataframe_buffer( - comm_rank == root ? (rx_displs.back() + rx_sizes.back()) : size_t{0}, stream_view); - device_gatherv(comm, - get_dataframe_buffer_begin(values), - get_dataframe_buffer_begin(rx_values), - values.size(), - rx_sizes, - rx_displs, - root, - stream_view); - resize_dataframe_buffer(values, 0, stream_view); - shrink_to_fit_dataframe_buffer(values, stream_view); - - rmm::device_uvector rx_offsets(0, stream_view); - if (comm_rank == root) { - auto& selected_ranks = std::get<0>(selected_ranks_or_flags); - rx_offsets.resize(selected_ranks.size(), stream_view); - thrust::sequence( - rmm::exec_policy(stream_view), rx_offsets.begin(), rx_offsets.end(), vertex_t{0}); - thrust::stable_sort_by_key(rmm::exec_policy(stream_view), - selected_ranks.begin(), - selected_ranks.end(), - rx_offsets.begin()); - rx_offsets.resize(rx_displs.back() + rx_sizes.back(), stream_view); - } - - return std::make_tuple(std::move(rx_offsets), std::move(rx_values)); -} - template mutable_view().value_first())>, void /* dummy */>; - std::conditional_t>, - std::vector>, - std::byte /* dummy */> - offset_vectors{}; - std::conditional_t>, - std::vector>, - std::byte /* dummy */> - value_vectors{}; - if constexpr (update_major && std::is_same_v>) { - auto capacity = graph_view.local_edge_partition_segment_offsets(0) ? max_segments : 1; - offset_vectors.reserve(capacity); - value_vectors.reserve(capacity); - - for (size_t i = 0; i < capacity; ++i) { - offset_vectors.emplace_back(0, handle.get_stream()); - value_vectors.emplace_back(0, handle.get_stream()); - } - } - if (stream_pool_indices) { handle.sync_stream(); } // 9. 
proces local edge partitions @@ -1862,15 +1757,15 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - auto edge_partition_key_buffer = allocate_dataframe_buffer( minor_comm_size > 1 ? local_key_list_sizes[partition_idx] : size_t{0}, loop_stream); if (minor_comm_size > 1) { if constexpr (try_bitmap) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + std::variant, decltype(sorted_unique_key_first)> v_list{}; if (use_bitmap_flags[partition_idx]) { @@ -1932,16 +1827,20 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, graph_view.local_edge_partition_view(partition_idx)); auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - auto edge_partition_key_first = sorted_unique_key_first; - auto edge_partition_key_last = sorted_unique_nzd_key_last; - if constexpr (GraphViewType::is_multi_gpu && use_input_key) { - edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); - edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffers[j]); - } - std::optional> key_segment_offsets{std::nullopt}; if (segment_offsets) { if constexpr (use_input_key) { + auto edge_partition_key_first = sorted_unique_key_first; + auto edge_partition_key_last = sorted_unique_nzd_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); + edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffers[j]); + } + } + // FIXME: compute_key_segment_offsets implicitly synchronizes to copy the results to host key_segment_offsets = compute_key_segment_offsets( edge_partition_key_first, @@ -2102,317 +2001,327 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #endif if constexpr (GraphViewType::is_multi_gpu && update_major) { - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto edge_partition_stream_pool_indices = - stream_pool_indices ? std::make_optional>( - (*stream_pool_indices).data() + j * max_segments, max_segments) - : std::nullopt; + if constexpr (std::is_same_v>) { + std::vector< + std::variant, std::optional>>> + edge_partition_selected_ranks_or_flags{}; + edge_partition_selected_ranks_or_flags.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); + if constexpr (use_input_key) { + if (minor_comm_size > 1) { + resize_dataframe_buffer(edge_partition_key_buffers[j], 0, loop_stream); + shrink_to_fit_dataframe_buffer(edge_partition_key_buffers[j], loop_stream); + } + } - auto edge_partition_key_first = sorted_unique_key_first; - auto edge_partition_key_last = sorted_unique_nzd_key_last; - if constexpr (GraphViewType::is_multi_gpu && use_input_key) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); - edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffers[j]); + bool process_local_edges = true; + if constexpr (filter_input_key) { + if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } } - } - bool process_local_edges = true; - if constexpr (filter_input_key) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } + auto const& output_buffer = major_output_buffers[j]; + + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + auto selected_ranks_or_flags = compute_selected_ranks( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_end(output_buffer), + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges ? false : true /* ignore_local_values */, + loop_stream); + edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); + } else if (minor_comm_size <= + std::numeric_limits::max()) { // priority == uint16_t + CUGRAPH_FAIL( + "unimplemented."); // currently, raft does not support allreduce on uint16_t. + } else { // priority_t == uint32_t + auto selected_ranks_or_flags = compute_selected_ranks( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_end(output_buffer), + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges ? 
false : true /* ignore_local_values */, + loop_stream); + edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); + } } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime7 = std::chrono::steady_clock::now(); +#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime8 = std::chrono::steady_clock::now(); + auto subtime9 = std::chrono::steady_clock::now(); + auto subtime10 = std::chrono::steady_clock::now(); +#endif - auto const& key_segment_offsets = key_segment_offset_vectors[j]; - auto output_buffer = get_dataframe_buffer_begin(major_output_buffers[j]); - - if (key_segment_offsets && edge_partition_stream_pool_indices) { - if (edge_partition.dcs_nzd_vertex_count()) { - if ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]); - auto segment_offset = (*key_segment_offsets)[3]; - auto segment_size = (*key_segment_offsets)[4] - (*key_segment_offsets)[3]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = gather_offset_value_pairs( - minor_comm, - output_buffer + segment_offset, - output_buffer + (segment_offset + segment_size), - static_cast(partition_idx), - subgroup_size, - init, - process_local_edges ? false : true, - segment_stream); - if (static_cast(partition_idx) == minor_comm_rank) { - offset_vectors[3] = std::move(offsets); - value_vectors[3] = std::move(values); - } - } else { - device_reduce(minor_comm, - output_buffer + segment_offset, - tmp_vertex_value_output_first + segment_offset, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(partition_idx), - segment_stream); - } - } - } - if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]); - auto segment_offset = (*key_segment_offsets)[2]; - auto segment_size = (*key_segment_offsets)[3] - (*key_segment_offsets)[2]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = - gather_offset_value_pairs(minor_comm, - output_buffer + segment_offset, - output_buffer + (segment_offset + segment_size), - static_cast(partition_idx), - subgroup_size, - init, - process_local_edges ? false : true, - segment_stream); + std::vector copy_sizes(loop_count, 0); + { + rmm::device_uvector d_copy_sizes(loop_count, handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), d_copy_sizes.begin(), d_copy_sizes.end(), size_t{0}); + if (stream_pool_indices) { handle.sync_stream(); } + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + bool process_local_edges = true; + if constexpr (filter_input_key) { if (static_cast(partition_idx) == minor_comm_rank) { - offset_vectors[2] = std::move(offsets); - value_vectors[2] = std::move(values); + process_local_edges = false; } - } else { - device_reduce(minor_comm, - output_buffer + segment_offset, - tmp_vertex_value_output_first + segment_offset, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(partition_idx), - segment_stream); } - } - if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]); - auto segment_offset = (*key_segment_offsets)[1]; - auto segment_size = (*key_segment_offsets)[2] - (*key_segment_offsets)[1]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = - gather_offset_value_pairs(minor_comm, - output_buffer + segment_offset, - output_buffer + (segment_offset + segment_size), - static_cast(partition_idx), - subgroup_size, - init, - process_local_edges ? false : true, - segment_stream); - if (static_cast(partition_idx) == minor_comm_rank) { - offset_vectors[1] = std::move(offsets); - value_vectors[1] = std::move(values); + + if (minor_comm_rank == static_cast(partition_idx)) { + if (process_local_edges) { + assert(edge_partition_selected_ranks_or_flags[j].index() == 0); + auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + count_nosync(selected_ranks.begin(), + selected_ranks.end(), + raft::device_span(d_copy_sizes.data() + j, size_t{1}), + minor_comm_rank, + loop_stream); } } else { - device_reduce(minor_comm, - output_buffer + segment_offset, - tmp_vertex_value_output_first + segment_offset, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(partition_idx), - segment_stream); - } - } - if ((*key_segment_offsets)[1] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]); - auto segment_size = (*key_segment_offsets)[1]; - if constexpr (std::is_same_v>) { - auto [offsets, values] = - gather_offset_value_pairs(minor_comm, - output_buffer, - output_buffer + segment_size, - static_cast(partition_idx), - subgroup_size, - init, - process_local_edges ? false : true, - segment_stream); - if (static_cast(partition_idx) == minor_comm_rank) { - offset_vectors[0] = std::move(offsets); - value_vectors[0] = std::move(values); + assert(edge_partition_selected_ranks_or_flags[j].index() == 1); + auto& selected_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); + if (selected_flags) { + count_nosync((*selected_flags).begin(), + (*selected_flags).end(), + raft::device_span(d_copy_sizes.data() + j, size_t{1}), + true, + loop_stream); } - } else { - device_reduce(minor_comm, - output_buffer, - tmp_vertex_value_output_first, - segment_size, - ReduceOp::compatible_raft_comms_op, - static_cast(partition_idx), - segment_stream); } } - } else { - size_t reduction_size{}; - if constexpr (use_input_key) { - reduction_size = static_cast( - thrust::distance(edge_partition_key_first, edge_partition_key_last)); - } else { - reduction_size = static_cast( - key_segment_offsets - ? 
*((*key_segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size()); - } - if constexpr (std::is_same_v>) { - auto [offsets, values] = - gather_offset_value_pairs(minor_comm, - output_buffer, - output_buffer + reduction_size, - static_cast(partition_idx), - subgroup_size, - init, - process_local_edges ? false : true, - handle.get_stream()); - if (static_cast(partition_idx) == minor_comm_rank) { - offset_vectors[0] = std::move(offsets); - value_vectors[0] = std::move(values); +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + subtime9 = std::chrono::steady_clock::now(); +#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + raft::update_host( + copy_sizes.data(), d_copy_sizes.data(), d_copy_sizes.size(), handle.get_stream()); + handle.sync_stream(); +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + subtime10 = std::chrono::steady_clock::now(); +#endif + } + + std::vector> edge_partition_values{}; + edge_partition_values.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto& output_buffer = major_output_buffers[j]; + + auto copy_size = copy_sizes[j]; + + auto values = allocate_dataframe_buffer(0, loop_stream); + if (minor_comm_rank == static_cast(partition_idx)) { + if (copy_size > 0) { + assert(edge_partition_selected_ranks_or_flags[j].index() == 0); + auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + resize_dataframe_buffer(values, copy_size, loop_stream); + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_end(output_buffer), + thrust::make_transform_iterator( + selected_ranks.begin(), + cuda::proclaim_return_type( + [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + get_dataframe_buffer_begin(values), + loop_stream); } } else { - device_reduce(minor_comm, - output_buffer, - tmp_vertex_value_output_first, - reduction_size, - ReduceOp::compatible_raft_comms_op, - static_cast(partition_idx), - handle.get_stream()); + if (copy_size > 0) { + assert(edge_partition_selected_ranks_or_flags[j].index() == 1); + auto& selected_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); + resize_dataframe_buffer(values, copy_size, loop_stream); + copy_if_nosync(get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_end(output_buffer), + (*selected_flags).begin(), + get_dataframe_buffer_begin(values), + loop_stream); + (*selected_flags).resize(0, loop_stream); + (*selected_flags).shrink_to_fit(loop_stream); + } } + + resize_dataframe_buffer(output_buffer, 0, loop_stream); + shrink_to_fit_dataframe_buffer(output_buffer, loop_stream); + + edge_partition_values.push_back(std::move(values)); } - } - } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime7 = std::chrono::steady_clock::now(); + auto subtime11 = std::chrono::steady_clock::now(); #endif - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime8 = std::chrono::steady_clock::now(); - std::chrono::duration subdur0 = subtime1 - subtime0; - std::chrono::duration subdur1 = subtime2 - subtime1; - std::chrono::duration subdur2 = subtime3 - subtime2; - std::chrono::duration 
subdur3 = subtime4 - subtime3; - std::chrono::duration subdur4 = subtime5 - subtime4; - std::chrono::duration subdur5 = subtime6 - subtime5; - std::chrono::duration subdur6 = subtime7 - subtime6; - std::chrono::duration subdur7 = subtime8 - subtime7; - std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() - << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," - << subdur6.count() << "," << subdur7.count() << ")" << std::endl; + auto subtime12 = std::chrono::steady_clock::now(); #endif - } + + std::vector rx_sizes{}; + std::vector rx_displs{}; + auto rx_values = allocate_dataframe_buffer(0, handle.get_stream()); + if (stream_pool_indices) { handle.sync_stream(); } + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto& values = edge_partition_values[j]; + // FIXME: host_scalar_gather implicitly synchronizes to copy the results to host + auto tmp_rx_sizes = host_scalar_gather(minor_comm, + size_dataframe_buffer(values), + static_cast(partition_idx), + loop_stream); + + if (minor_comm_rank == static_cast(partition_idx)) { + rx_sizes = std::move(tmp_rx_sizes); + rx_displs.resize(rx_sizes.size()); + std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); + rx_values = + allocate_dataframe_buffer(rx_displs.back() + rx_sizes.back(), loop_stream); + } + } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time2 = std::chrono::steady_clock::now(); + auto subtime13 = std::chrono::steady_clock::now(); +#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime14 = std::chrono::steady_clock::now(); #endif - // 10. scatter - - if constexpr (GraphViewType::is_multi_gpu && update_major && - std::is_same_v>) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - - auto segment_offsets = - graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); - - std::optional> edge_partition_stream_pool_indices{std::nullopt}; - if (segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { - edge_partition_stream_pool_indices = std::vector(max_segments); - std::iota((*edge_partition_stream_pool_indices).begin(), - (*edge_partition_stream_pool_indices).end(), - size_t{0}); - } + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto& values = edge_partition_values[j]; + + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + get_dataframe_buffer_begin(rx_values), + values.size(), + rx_sizes, + rx_displs, + static_cast(partition_idx), + loop_stream); + } else { + device_gatherv(minor_comm, + get_dataframe_buffer_begin(values), + dataframe_buffer_iterator_type_t{}, + values.size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + loop_stream); + } - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(static_cast(minor_comm_rank))); + resize_dataframe_buffer(values, 0, loop_stream); + shrink_to_fit_dataframe_buffer(values, loop_stream); + } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime15 = std::chrono::steady_clock::now(); +#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime16 = std::chrono::steady_clock::now(); +#endif - std::optional> key_segment_offsets{std::nullopt}; - if (segment_offsets) { - if constexpr (use_input_key) { - key_segment_offsets = compute_key_segment_offsets( - sorted_unique_key_first, - sorted_unique_nzd_key_last, - raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), - edge_partition.major_range_first(), - handle.get_stream()); + if (size_dataframe_buffer(rx_values) > 0) { + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + + auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + rmm::device_uvector rx_offsets(selected_ranks.size(), handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_offsets.begin(), rx_offsets.end(), vertex_t{0}); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + selected_ranks.begin(), + selected_ranks.end(), + rx_offsets.begin()); + // selected_ranks[] == comm_size if no GPU in minor_comm has a non-init value + rx_offsets.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); + + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(rx_values), + get_dataframe_buffer_end(rx_values), + rx_offsets.begin(), + tmp_vertex_value_output_first); + handle.sync_stream(); + } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime17 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = subtime1 - subtime0; + std::chrono::duration subdur1 = subtime2 - subtime1; + std::chrono::duration subdur2 = subtime3 - subtime2; + std::chrono::duration subdur3 = subtime4 - subtime3; + std::chrono::duration subdur4 = subtime5 - subtime4; + std::chrono::duration subdur5 = subtime6 - subtime5; + std::chrono::duration subdur6 = subtime7 - subtime6; + std::chrono::duration subdur7 = subtime8 - subtime7; + std::chrono::duration subdur8 = subtime9 - subtime8; + std::chrono::duration subdur9 = subtime10 - subtime9; + std::chrono::duration subdur10 = subtime11 - subtime10; + std::chrono::duration subdur11 = subtime12 - subtime11; + std::chrono::duration subdur12 = subtime13 - subtime12; + std::chrono::duration subdur13 = subtime14 - subtime13; + std::chrono::duration subdur14 = subtime15 - subtime14; + std::chrono::duration subdur15 = subtime16 - subtime15; + std::chrono::duration subdur16 = subtime17 - subtime16; + std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << "," + << subdur2.count() << "," << 
subdur3.count() << "," << subdur4.count() << "," + << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," + << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," + << subdur11.count() << "," << subdur12.count() << "," << subdur13.count() << "," + << subdur14.count() << "," << subdur15.count() << "," << subdur16.count() << ")" + << std::endl; +#endif } else { - key_segment_offsets = std::vector((*segment_offsets).size()); - std::transform((*segment_offsets).begin(), - (*segment_offsets).end(), - (*key_segment_offsets).begin(), - [](vertex_t offset) { return static_cast(offset); }); - } - } - - if (key_segment_offsets && edge_partition_stream_pool_indices) { - if (edge_partition.dcs_nzd_vertex_count()) { - if ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]); - auto segment_offset = (*key_segment_offsets)[3]; - thrust::scatter(rmm::exec_policy(segment_stream), - get_dataframe_buffer_begin(value_vectors[3]), - get_dataframe_buffer_end(value_vectors[3]), - offset_vectors[3].begin(), - tmp_vertex_value_output_first + segment_offset); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + device_reduce(minor_comm, + get_dataframe_buffer_begin(major_output_buffers[j]), + tmp_vertex_value_output_first, + size_dataframe_buffer(major_output_buffers[j]), + ReduceOp::compatible_raft_comms_op, + static_cast(partition_idx), + loop_stream); } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } - if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]); - auto segment_offset = (*key_segment_offsets)[2]; - thrust::scatter(rmm::exec_policy(segment_stream), - get_dataframe_buffer_begin(value_vectors[2]), - get_dataframe_buffer_end(value_vectors[2]), - offset_vectors[2].begin(), - tmp_vertex_value_output_first + segment_offset); - } - if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]); - auto segment_offset = (*key_segment_offsets)[1]; - thrust::scatter(rmm::exec_policy(segment_stream), - get_dataframe_buffer_begin(value_vectors[1]), - get_dataframe_buffer_end(value_vectors[1]), - offset_vectors[1].begin(), - tmp_vertex_value_output_first + segment_offset); - } - if ((*key_segment_offsets)[1] > 0) { - auto segment_stream = - handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]); - thrust::scatter(rmm::exec_policy(segment_stream), - get_dataframe_buffer_begin(value_vectors[0]), - get_dataframe_buffer_end(value_vectors[0]), - offset_vectors[0].begin(), - tmp_vertex_value_output_first); - } - } else { - thrust::scatter(handle.get_thrust_policy(), - get_dataframe_buffer_begin(value_vectors[0]), - get_dataframe_buffer_end(value_vectors[0]), - offset_vectors[0].begin(), - tmp_vertex_value_output_first); } } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time3 = std::chrono::steady_clock::now(); + auto time2 = std::chrono::steady_clock::now(); #endif - // 11. communication + // 10. 
communication if constexpr (GraphViewType::is_multi_gpu && !update_major) { auto& comm = handle.get_comms(); @@ -2502,13 +2411,12 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time4 = std::chrono::steady_clock::now(); + auto time3 = std::chrono::steady_clock::now(); std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::chrono::duration dur3 = time4 - time3; - std::cout << "\t\tdetail::per_v (prep, ep, scatter, comm) took (" << dur0.count() << "," - << dur1.count() << "," << dur2.count() << "," << dur3.count() << ")" << std::endl; + std::cout << "\t\tdetail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count() + << "," << dur2.count() << ")" << std::endl; #endif } diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index 2f463a74db4..9c7c84e9719 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -99,7 +99,7 @@ std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, rmm::device_uvector d_offsets(d_thresholds.size(), stream_view); if constexpr (std::is_same_v) { - thrust::lower_bound(rmm::exec_policy(stream_view), + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), sorted_key_first, sorted_key_last, d_thresholds.begin(), @@ -108,7 +108,7 @@ std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, } else { auto sorted_vertex_first = thrust::make_transform_iterator(sorted_key_first, thrust_tuple_get{}); - thrust::lower_bound(rmm::exec_policy(stream_view), + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), sorted_vertex_first, sorted_vertex_first + thrust::distance(sorted_key_first, sorted_key_last), d_thresholds.begin(), @@ -146,9 +146,11 @@ std::optional> compute_vertex_list_bitmap_info( if (v_list_size > static_cast(bool_size * threshold_ratio)) { bitmap = rmm::device_uvector(packed_bool_size(bool_size), stream_view); - thrust::fill( - rmm::exec_policy(stream_view), (*bitmap).begin(), (*bitmap).end(), packed_bool_empty_mask()); - thrust::for_each(rmm::exec_policy(stream_view), + thrust::fill(rmm::exec_policy_nosync(stream_view), + (*bitmap).begin(), + (*bitmap).end(), + packed_bool_empty_mask()); + thrust::for_each(rmm::exec_policy_nosync(stream_view), sorted_unique_vertex_first, sorted_unique_vertex_last, [bitmap = raft::device_span((*bitmap).data(), (*bitmap).size()), From e53b3b8082d7413206fd8b597ce804527ce0dcf3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 28 Aug 2024 12:41:36 -0700 Subject: [PATCH 053/126] delete debug prints --- cpp/src/prims/detail/per_v_transform_reduce_e.cuh | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index be7b5874324..b4c2a663184 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -970,7 +970,6 @@ compute_selected_ranks(raft::comms::comms_t const& comm, bool ignore_local_values, rmm::cuda_stream_view stream_view) { - auto time0 = std::chrono::steady_clock::now(); auto const comm_rank = comm.get_rank(); auto const comm_size = comm.get_size(); @@ -998,7 +997,6 @@ compute_selected_ranks(raft::comms::comms_t const& comm, : std::numeric_limits::max(); // lowest priority }); } - auto time1 = std::chrono::steady_clock::now(); 
device_allreduce(comm, priorities.data(), priorities.data(), @@ -1006,7 +1004,6 @@ compute_selected_ranks(raft::comms::comms_t const& comm, raft::comms::op_t::MIN, stream_view); - auto time2 = std::chrono::steady_clock::now(); if (comm_rank == root) { rmm::device_uvector selected_ranks(priorities.size(), stream_view); auto offset_priority_pair_first = @@ -1024,12 +1021,6 @@ compute_selected_ranks(raft::comms::comms_t const& comm, priority, root, subgroup_size, comm_size, offset); return rank; }); - auto time3 = std::chrono::steady_clock::now(); - std::chrono::duration dur0 = time1 - time0; - std::chrono::duration dur1 = time2 - time1; - std::chrono::duration dur2 = time3 - time2; - std::cout << "root compute_selected_ranks dur=(" << dur0.count() << "," << dur1.count() << "," - << dur2.count() << ")." << std::endl; return selected_ranks; } else { std::optional> keep_flags{std::nullopt}; @@ -1051,12 +1042,6 @@ compute_selected_ranks(raft::comms::comms_t const& comm, return (rank == comm_rank); }); } - auto time3 = std::chrono::steady_clock::now(); - std::chrono::duration dur0 = time1 - time0; - std::chrono::duration dur1 = time2 - time1; - std::chrono::duration dur2 = time3 - time2; - std::cout << "non-root compute_selected_ranks dur=(" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << ")." << std::endl; return keep_flags; } } From b6e4f28a00bc435262d8b0ed5f85eae694378ad7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 29 Aug 2024 16:20:42 -0700 Subject: [PATCH 054/126] fix erroneous comments (numbering) --- cpp/src/structure/renumber_edgelist_impl.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 41f81d72ab1..de7fd0d3f62 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -535,7 +535,7 @@ std::tuple, std::vector, vertex_t> compu } } - // 4. sort local vertices by degree (descending) + // 5. sort local vertices by degree (descending) thrust::sort_by_key(handle.get_thrust_policy(), sorted_local_vertex_degrees.begin(), @@ -543,7 +543,7 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertices.begin(), thrust::greater()); - // 5. compute segment_offsets + // 6. compute segment_offsets static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && From be504cccdbef757366da2389d8e069c938c1512f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 30 Aug 2024 15:05:00 -0700 Subject: [PATCH 055/126] reduce memory footprint in graph creation --- cpp/include/cugraph/partition_manager.hpp | 27 +- cpp/src/structure/renumber_edgelist_impl.cuh | 301 +++++++++++++------ 2 files changed, 240 insertions(+), 88 deletions(-) diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp index 309b169e646..18c19d3b54d 100644 --- a/cpp/include/cugraph/partition_manager.hpp +++ b/cpp/include/cugraph/partition_manager.hpp @@ -42,7 +42,8 @@ class partition_manager { // partitioning along the major axis (major sub-communicator is responsible for this) and along // the minor axis (minor sub-communicator is responsible for this). This variable controls whether // to map the major sub-communicator to the GPU row communicator or the GPU column communicator. 
- static constexpr bool map_major_comm_to_gpu_row_comm = true; + static constexpr bool map_major_comm_to_gpu_row_comm = + false; // FIXME: this is for benchmarking, reset to true before merging #ifdef __CUDACC__ __host__ __device__ @@ -71,6 +72,30 @@ class partition_manager { : (major_comm_rank * minor_comm_size + minor_comm_rank); } +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_major_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? comm_rank % major_comm_size + : comm_rank / minor_comm_size; + } + +#ifdef __CUDACC__ + __host__ __device__ +#endif + static int + compute_minor_comm_rank_from_global_comm_rank(int major_comm_size, + int minor_comm_size, + int comm_rank) + { + return map_major_comm_to_gpu_row_comm ? comm_rank / major_comm_size + : comm_rank % minor_comm_size; + } + #ifdef __CUDACC__ __host__ __device__ #endif diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 41f81d72ab1..574163d4af5 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -51,6 +51,8 @@ #include #include +#include + #include #include #include @@ -242,119 +244,240 @@ std::tuple, std::vector, vertex_t> compu std::vector const& edgelist_minors, std::vector const& edgelist_edge_counts) { + // 1. if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to + // construct local_vertices) + rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); + if (!local_vertices) { + rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); + if (edgelist_majors.size() > 1) { + constexpr size_t num_bins{8}; // increase the number of bins to cut peak memory usage (at the + // expense of additional computing) + constexpr uint32_t hash_seed = + 1; // shouldn't be 0 (in that case this hash function will coincide with the hash function + // used to map vertices to GPUs, and we may not see the expected randomization) + + assert(multi_gpu); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + assert(static_cast(minor_comm_size) == edgelist_majors.size()); + + auto edge_count_vectors = num_bins > 1 + ? 
std::make_optional>>( + edgelist_majors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_count_vectors) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { + rmm::device_uvector d_edge_counts(num_bins, handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), d_edge_counts.begin(), d_edge_counts.end(), edge_t{0}); + thrust::for_each(handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [counts = raft::device_span( + d_edge_counts.data(), d_edge_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_count_vectors)[i].data(), + d_edge_counts.data(), + d_edge_counts.size(), + handle.get_stream()); + } + handle.sync_stream(); + } - edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); + for (size_t i = 0; i < num_bins; ++i) { + std::vector> + edge_partition_sorted_unique_majors{}; // for bin "i" + edge_partition_sorted_unique_majors.reserve(edgelist_majors.size()); + std::vector tx_counts(minor_comm_size); + for (size_t j = 0; j < edgelist_majors.size(); ++j) { + rmm::device_uvector majors(0, handle.get_stream()); + if (num_bins > 1) { + majors.resize((*edge_count_vectors)[j][i], handle.get_stream()); + thrust::copy_if(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + majors.begin(), + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + } else { + majors.resize(edgelist_edge_counts[j], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_majors[j], + edgelist_majors[j] + edgelist_edge_counts[j], + majors.begin()); + } + thrust::sort(handle.get_thrust_policy(), majors.begin(), majors.end()); + majors.resize(thrust::distance( + majors.begin(), + thrust::unique(handle.get_thrust_policy(), majors.begin(), majors.end())), + handle.get_stream()); + majors.shrink_to_fit(handle.get_stream()); - // 1. 
if local_vertices.has_value() is false, find unique vertices from edge majors (to construct - // local_vertices) + tx_counts[j] = majors.size(); + edge_partition_sorted_unique_majors.push_back(std::move(majors)); + } - rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); - if (!local_vertices) { - sorted_unique_majors.resize(num_local_edges, handle.get_stream()); - size_t major_offset{0}; - for (size_t i = 0; i < edgelist_majors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[i], - edgelist_majors[i] + edgelist_edge_counts[i], - sorted_unique_majors.begin() + major_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]); - major_offset += static_cast(thrust::distance( - sorted_unique_majors.begin() + major_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_majors.begin() + major_offset, - sorted_unique_majors.begin() + major_offset + edgelist_edge_counts[i]))); - } - sorted_unique_majors.resize(major_offset, handle.get_stream()); + rmm::device_uvector tmp_majors(std::reduce(tx_counts.begin(), tx_counts.end()), + handle.get_stream()); + size_t offset{0}; + for (size_t j = 0; j < edge_partition_sorted_unique_majors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_sorted_unique_majors[j].begin(), + edge_partition_sorted_unique_majors[j].end(), + tmp_majors.begin() + offset); + offset += edge_partition_sorted_unique_majors[j].size(); + } + edge_partition_sorted_unique_majors.clear(); - if (edgelist_majors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_majors.begin(), sorted_unique_majors.end()); - } - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - } + std::tie(tmp_majors, std::ignore) = + shuffle_values(minor_comm, tmp_majors.begin(), tx_counts, handle.get_stream()); - // 2. 
if local_vertices.has_value() is false, find unique vertices from edge minors (to construct - // local_vertices) + thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); + tmp_majors.resize( + thrust::distance( + tmp_majors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end())), + handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + if (i == 0) { + sorted_unique_majors = std::move(tmp_majors); + } else { + rmm::device_uvector merged_majors( + sorted_unique_majors.size() + tmp_majors.size(), handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + sorted_unique_majors.begin(), + sorted_unique_majors.end(), + tmp_majors.begin(), + tmp_majors.end(), + merged_majors.begin()); + sorted_unique_majors = std::move(merged_majors); + tmp_majors.resize(0, handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + sorted_unique_majors.resize(thrust::distance(sorted_unique_majors.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_unique_majors.begin(), + sorted_unique_majors.end())), + handle.get_stream()); + sorted_unique_majors.shrink_to_fit(handle.get_stream()); + } + } + } else { + rmm::device_uvector majors(edgelist_edge_counts[0], handle.get_stream()); + thrust::copy(handle.get_thrust_policy(), + edgelist_majors[0], + edgelist_majors[0] + edgelist_edge_counts[0], + majors.begin()); + thrust::sort(handle.get_thrust_policy(), majors.begin(), majors.end()); + majors.resize( + thrust::distance(majors.begin(), + thrust::unique(handle.get_thrust_policy(), majors.begin(), majors.end())), + handle.get_stream()); + majors.shrink_to_fit(handle.get_stream()); + sorted_unique_majors = std::move(majors); + } - rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); - if (!local_vertices) { - sorted_unique_minors.resize(num_local_edges, handle.get_stream()); - size_t minor_offset{0}; + rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); for (size_t i = 0; i < edgelist_minors.size(); ++i) { + rmm::device_uvector tmp_minors(edgelist_edge_counts[i], handle.get_stream()); thrust::copy(handle.get_thrust_policy(), edgelist_minors[i], edgelist_minors[i] + edgelist_edge_counts[i], - sorted_unique_minors.begin() + minor_offset); - thrust::sort(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]); - minor_offset += static_cast(thrust::distance( - sorted_unique_minors.begin() + minor_offset, - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin() + minor_offset, - sorted_unique_minors.begin() + minor_offset + edgelist_edge_counts[i]))); - } - sorted_unique_minors.resize(minor_offset, handle.get_stream()); - if (edgelist_minors.size() > 1) { - thrust::sort( - handle.get_thrust_policy(), sorted_unique_minors.begin(), sorted_unique_minors.end()); - sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end())), - handle.get_stream()); - } - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - } - - // 3. update sorted_local_vertices. 
- // if local_vertices.has_value() is false, reconstruct local_vertices first + tmp_minors.begin()); + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + + if constexpr (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + compute_gpu_id_from_ext_vertex_t gpu_id_func{ + comm_size, major_comm_size, minor_comm_size}; + std::tie(tmp_minors, std::ignore) = groupby_gpu_id_and_shuffle_values( + major_comm, + tmp_minors.begin(), + tmp_minors.end(), + [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { + auto comm_rank = gpu_id_func(v); + return partition_manager::compute_major_comm_rank_from_global_comm_rank( + major_comm_size, minor_comm_size, comm_rank); + }, + handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + } - if (local_vertices) { - sorted_local_vertices = std::move(*local_vertices); - thrust::sort( - handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - } else { + if (i == 0) { + sorted_unique_minors = std::move(tmp_minors); + } else { + rmm::device_uvector merged_minors(sorted_unique_minors.size() + tmp_minors.size(), + handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + sorted_unique_minors.begin(), + sorted_unique_minors.end(), + tmp_minors.begin(), + tmp_minors.end(), + merged_minors.begin()); + sorted_unique_minors = std::move(merged_minors); + tmp_minors.resize(0, handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_unique_minors.begin(), + sorted_unique_minors.end())), + handle.get_stream()); + sorted_unique_minors.shrink_to_fit(handle.get_stream()); + } + } sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), handle.get_stream()); - thrust::merge(handle.get_thrust_policy(), sorted_unique_majors.begin(), sorted_unique_majors.end(), sorted_unique_minors.begin(), sorted_unique_minors.end(), sorted_local_vertices.begin()); - sorted_unique_majors.resize(0, handle.get_stream()); sorted_unique_majors.shrink_to_fit(handle.get_stream()); sorted_unique_minors.resize(0, handle.get_stream()); sorted_unique_minors.shrink_to_fit(handle.get_stream()); - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), thrust::unique(handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end())), handle.get_stream()); sorted_local_vertices.shrink_to_fit(handle.get_stream()); - - if constexpr (multi_gpu) { - sorted_local_vertices = - cugraph::detail::shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning( - handle, std::move(sorted_local_vertices)); - thrust::sort( - 
handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); - } + } else { + sorted_local_vertices = std::move(*local_vertices); + thrust::sort( + handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); } + // 2. find an unused vertex ID + auto locally_unused_vertex_id = find_locally_unused_ext_vertex_id( handle, raft::device_span(sorted_local_vertices.data(), sorted_local_vertices.size()), @@ -363,7 +486,7 @@ std::tuple, std::vector, vertex_t> compu "Invalid input arguments: there is no unused value in the entire range of " "vertex_t, increase vertex_t to 64 bit."); - // 4. compute global degrees for the sorted local vertices + // 3. compute global degrees for the sorted local vertices rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); std::optional> stream_pool_indices{ @@ -387,6 +510,9 @@ std::tuple, std::vector, vertex_t> compu host_scalar_allgather(minor_comm, sorted_local_vertices.size(), handle.get_stream()); if ((minor_comm_size >= 2) && (handle.get_stream_pool_size() >= 2)) { + edge_t num_local_edges = + std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); + auto vertex_edge_counts = host_scalar_allreduce( comm, thrust::make_tuple(static_cast(sorted_local_vertices.size()), num_local_edges), @@ -857,10 +983,10 @@ renumber_edgelist( (*edgelist_intra_partition_segment_offsets).size() == static_cast(minor_comm_size), "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets).size()."); for (size_t i = 0; i < edgelist_majors.size(); ++i) { - CUGRAPH_EXPECTS( - (*edgelist_intra_partition_segment_offsets)[i].size() == - static_cast(major_comm_size + 1), - "Invalid input arguments: erroneous (*edgelist_intra_partition_segment_offsets)[].size()."); + CUGRAPH_EXPECTS((*edgelist_intra_partition_segment_offsets)[i].size() == + static_cast(major_comm_size + 1), + "Invalid input arguments: erroneous " + "(*edgelist_intra_partition_segment_offsets)[].size()."); CUGRAPH_EXPECTS( std::is_sorted((*edgelist_intra_partition_segment_offsets)[i].begin(), (*edgelist_intra_partition_segment_offsets)[i].end()), @@ -868,7 +994,8 @@ renumber_edgelist( CUGRAPH_EXPECTS( ((*edgelist_intra_partition_segment_offsets)[i][0] == 0) && ((*edgelist_intra_partition_segment_offsets)[i].back() == edgelist_edge_counts[i]), - "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 and " + "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 " + "and " "(*edgelist_intra_partition_segment_offsets)[].back() should coincide with " "edgelist_edge_counts[]."); } From 3b151e0cb7891ef1448fdb48f6e6f42cab98057c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 30 Aug 2024 15:35:11 -0700 Subject: [PATCH 056/126] undo temporary change for benchmarking --- cpp/include/cugraph/partition_manager.hpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp index 18c19d3b54d..e3bb699f00d 100644 --- a/cpp/include/cugraph/partition_manager.hpp +++ b/cpp/include/cugraph/partition_manager.hpp @@ -42,8 +42,7 @@ class partition_manager { // partitioning along the major axis (major 
sub-communicator is responsible for this) and along // the minor axis (minor sub-communicator is responsible for this). This variable controls whether // to map the major sub-communicator to the GPU row communicator or the GPU column communicator. - static constexpr bool map_major_comm_to_gpu_row_comm = - false; // FIXME: this is for benchmarking, reset to true before merging + static constexpr bool map_major_comm_to_gpu_row_comm = true; #ifdef __CUDACC__ __host__ __device__ From ad0c87994ca52b0ee7277eff8ecb159e3dde4a41 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 30 Aug 2024 15:42:02 -0700 Subject: [PATCH 057/126] update comments --- cpp/src/structure/renumber_edgelist_impl.cuh | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 574163d4af5..5ebd8323fac 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -251,8 +251,10 @@ std::tuple, std::vector, vertex_t> compu if (!local_vertices) { rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); if (edgelist_majors.size() > 1) { - constexpr size_t num_bins{8}; // increase the number of bins to cut peak memory usage (at the - // expense of additional computing) + constexpr size_t num_bins{ + 8}; // increase the number of bins to cut peak memory usage (at the expense of additional + // computing), limit the maximum temporary memory usage to "size of local edge list + // majors & minors" / "# bins". constexpr uint32_t hash_seed = 1; // shouldn't be 0 (in that case this hash function will coincide with the hash function // used to map vertices to GPUs, and we may not see the expected randomization) @@ -387,7 +389,10 @@ std::tuple, std::vector, vertex_t> compu } rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); - for (size_t i = 0; i < edgelist_minors.size(); ++i) { + for (size_t i = 0; i < edgelist_minors.size(); + ++i) { // limit the maximum temporary memory usage to "size of local edge list majors & + // minors" / "# local edge partitions" (FXIME: we can further cut peak memory usage + // by applying binning here as well; fewer bins than the edge list major case) rmm::device_uvector tmp_minors(edgelist_edge_counts[i], handle.get_stream()); thrust::copy(handle.get_thrust_policy(), edgelist_minors[i], From ce4ea93a53ca7a8316ae59a82bd69ea4f0148148 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 30 Aug 2024 15:54:45 -0700 Subject: [PATCH 058/126] cosmetic updates --- cpp/src/structure/renumber_edgelist_impl.cuh | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 5ebd8323fac..9507e0fd33b 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -260,9 +260,7 @@ std::tuple, std::vector, vertex_t> compu // used to map vertices to GPUs, and we may not see the expected randomization) assert(multi_gpu); - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); assert(static_cast(minor_comm_size) == edgelist_majors.size()); @@ -373,7 +371,7 @@ std::tuple, std::vector, vertex_t> compu 
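The expanded num_bins comment above captures the memory-saving idea behind these renumbering changes: instead of staging all edge endpoints at once, vertices are hashed into bins, each bin is gathered, sorted, and de-duplicated on its own, and the per-bin unique sets are merged into the running result, so peak temporary usage is roughly the input size divided by the number of bins. A host-side sketch of that binned sort-unique-merge pattern follows; std::hash stands in for the device-side MurmurHash3_32, and the container sizes are toy values.

// Host-side sketch of the binned sort-unique-merge pattern described in the
// num_bins comment above: process one hash bin at a time so that only about
// |input| / num_bins elements are ever staged in a temporary buffer.
// std::hash is a stand-in for the device-side MurmurHash3_32.
#include <algorithm>
#include <cstdint>
#include <functional>
#include <iostream>
#include <iterator>
#include <utility>
#include <vector>

int main()
{
  constexpr size_t num_bins = 4;
  std::vector<int64_t> vertices{5, 3, 9, 3, 7, 5, 1, 9, 2, 7, 7, 0};

  std::vector<int64_t> sorted_unique{};  // grows bin by bin
  for (size_t bin = 0; bin < num_bins; ++bin) {
    // 1. gather this bin's vertices (the temporary buffer is only one bin wide)
    std::vector<int64_t> tmp{};
    std::copy_if(vertices.begin(), vertices.end(), std::back_inserter(tmp),
                 [bin](int64_t v) { return std::hash<int64_t>{}(v) % num_bins == bin; });

    // 2. sort + unique within the bin
    std::sort(tmp.begin(), tmp.end());
    tmp.erase(std::unique(tmp.begin(), tmp.end()), tmp.end());

    // 3. merge with the running result; bins are disjoint, so merging two
    //    duplicate-free sets stays duplicate-free
    std::vector<int64_t> merged(sorted_unique.size() + tmp.size());
    std::merge(sorted_unique.begin(), sorted_unique.end(), tmp.begin(), tmp.end(),
               merged.begin());
    sorted_unique = std::move(merged);
  }

  for (auto v : sorted_unique) { std::cout << v << ' '; }
  std::cout << '\n';  // prints: 0 1 2 3 5 7 9
  return 0;
}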
sorted_unique_majors.shrink_to_fit(handle.get_stream()); } } - } else { + } else { // FIXME: why not apply binning here? rmm::device_uvector majors(edgelist_edge_counts[0], handle.get_stream()); thrust::copy(handle.get_thrust_policy(), edgelist_majors[0], @@ -1000,8 +998,7 @@ renumber_edgelist( ((*edgelist_intra_partition_segment_offsets)[i][0] == 0) && ((*edgelist_intra_partition_segment_offsets)[i].back() == edgelist_edge_counts[i]), "Invalid input arguments: (*edgelist_intra_partition_segment_offsets)[][0] should be 0 " - "and " - "(*edgelist_intra_partition_segment_offsets)[].back() should coincide with " + "and (*edgelist_intra_partition_segment_offsets)[].back() should coincide with " "edgelist_edge_counts[]."); } } From 9445027410faf76df813d76d9ff78aab5ac10e1c Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 3 Sep 2024 10:51:30 -0700 Subject: [PATCH 059/126] update renumbering to use binning in more places --- cpp/src/structure/renumber_edgelist_impl.cuh | 293 +++++++++++-------- 1 file changed, 168 insertions(+), 125 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 9507e0fd33b..abfa515df05 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -249,22 +249,16 @@ std::tuple, std::vector, vertex_t> compu rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); if (!local_vertices) { - rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); - if (edgelist_majors.size() > 1) { - constexpr size_t num_bins{ - 8}; // increase the number of bins to cut peak memory usage (at the expense of additional - // computing), limit the maximum temporary memory usage to "size of local edge list - // majors & minors" / "# bins". - constexpr uint32_t hash_seed = - 1; // shouldn't be 0 (in that case this hash function will coincide with the hash function - // used to map vertices to GPUs, and we may not see the expected randomization) - - assert(multi_gpu); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - assert(static_cast(minor_comm_size) == edgelist_majors.size()); + constexpr size_t num_bins{ + 8}; // increase the number of bins to cut peak memory usage (at the expense of additional + // computing), limit the maximum temporary memory usage to "size of local edge list + // majors|minors * 2 / # bins" + constexpr uint32_t hash_seed = + 1; // shouldn't be 0 (in that case this hash function will coincide with the hash function + // used to map vertices to GPUs, and we may not see the expected randomization) + rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); + { auto edge_count_vectors = num_bins > 1 ? 
std::make_optional>>( edgelist_majors.size(), std::vector(num_bins)) @@ -296,7 +290,6 @@ std::tuple, std::vector, vertex_t> compu std::vector> edge_partition_sorted_unique_majors{}; // for bin "i" edge_partition_sorted_unique_majors.reserve(edgelist_majors.size()); - std::vector tx_counts(minor_comm_size); for (size_t j = 0; j < edgelist_majors.size(); ++j) { rmm::device_uvector majors(0, handle.get_stream()); if (num_bins > 1) { @@ -323,32 +316,51 @@ std::tuple, std::vector, vertex_t> compu handle.get_stream()); majors.shrink_to_fit(handle.get_stream()); - tx_counts[j] = majors.size(); edge_partition_sorted_unique_majors.push_back(std::move(majors)); } - rmm::device_uvector tmp_majors(std::reduce(tx_counts.begin(), tx_counts.end()), - handle.get_stream()); - size_t offset{0}; - for (size_t j = 0; j < edge_partition_sorted_unique_majors.size(); ++j) { - thrust::copy(handle.get_thrust_policy(), - edge_partition_sorted_unique_majors[j].begin(), - edge_partition_sorted_unique_majors[j].end(), - tmp_majors.begin() + offset); - offset += edge_partition_sorted_unique_majors[j].size(); + rmm::device_uvector tmp_majors(0, handle.get_stream()); + if constexpr (multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + assert(static_cast(minor_comm_size) == + edge_partition_sorted_unique_majors.size()); + + if (minor_comm_size > 1) { + std::vector tx_counts(minor_comm_size); + for (size_t j = 0; j < edge_partition_sorted_unique_majors.size(); ++j) { + tx_counts[j] = edge_partition_sorted_unique_majors[j].size(); + } + tmp_majors.resize(std::reduce(tx_counts.begin(), tx_counts.end()), handle.get_stream()); + size_t offset{0}; + for (size_t j = 0; j < edge_partition_sorted_unique_majors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_sorted_unique_majors[j].begin(), + edge_partition_sorted_unique_majors[j].end(), + tmp_majors.begin() + offset); + offset += tx_counts[j]; + } + edge_partition_sorted_unique_majors.clear(); + + std::tie(tmp_majors, std::ignore) = + shuffle_values(minor_comm, tmp_majors.begin(), tx_counts, handle.get_stream()); + + thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); + tmp_majors.resize( + thrust::distance( + tmp_majors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end())), + handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + } else { + assert(edge_partition_sorted_unique_majors.size() == 1); + tmp_majors = std::move(edge_partition_sorted_unique_majors[0]); + } + } else { + assert(edge_partition_sorted_unique_majors.size() == 1); + tmp_majors = std::move(edge_partition_sorted_unique_majors[0]); } - edge_partition_sorted_unique_majors.clear(); - std::tie(tmp_majors, std::ignore) = - shuffle_values(minor_comm, tmp_majors.begin(), tx_counts, handle.get_stream()); - - thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); - tmp_majors.resize( - thrust::distance( - tmp_majors.begin(), - thrust::unique(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end())), - handle.get_stream()); - tmp_majors.shrink_to_fit(handle.get_stream()); if (i == 0) { sorted_unique_majors = std::move(tmp_majors); } else { @@ -359,71 +371,72 @@ std::tuple, std::vector, vertex_t> compu sorted_unique_majors.end(), tmp_majors.begin(), tmp_majors.end(), - merged_majors.begin()); + merged_majors.begin()); // merging two unique sets from different hash + // bins, so 
the merged set can't have duplicates sorted_unique_majors = std::move(merged_majors); - tmp_majors.resize(0, handle.get_stream()); - tmp_majors.shrink_to_fit(handle.get_stream()); - sorted_unique_majors.resize(thrust::distance(sorted_unique_majors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_majors.begin(), - sorted_unique_majors.end())), - handle.get_stream()); - sorted_unique_majors.shrink_to_fit(handle.get_stream()); } } - } else { // FIXME: why not apply binning here? - rmm::device_uvector majors(edgelist_edge_counts[0], handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[0], - edgelist_majors[0] + edgelist_edge_counts[0], - majors.begin()); - thrust::sort(handle.get_thrust_policy(), majors.begin(), majors.end()); - majors.resize( - thrust::distance(majors.begin(), - thrust::unique(handle.get_thrust_policy(), majors.begin(), majors.end())), - handle.get_stream()); - majors.shrink_to_fit(handle.get_stream()); - sorted_unique_majors = std::move(majors); } rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); - for (size_t i = 0; i < edgelist_minors.size(); - ++i) { // limit the maximum temporary memory usage to "size of local edge list majors & - // minors" / "# local edge partitions" (FXIME: we can further cut peak memory usage - // by applying binning here as well; fewer bins than the edge list major case) - rmm::device_uvector tmp_minors(edgelist_edge_counts[i], handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - edgelist_minors[i], - edgelist_minors[i] + edgelist_edge_counts[i], - tmp_minors.begin()); - thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); - tmp_minors.resize( - thrust::distance( - tmp_minors.begin(), - thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), - handle.get_stream()); - tmp_minors.shrink_to_fit(handle.get_stream()); - - if constexpr (multi_gpu) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - compute_gpu_id_from_ext_vertex_t gpu_id_func{ - comm_size, major_comm_size, minor_comm_size}; - std::tie(tmp_minors, std::ignore) = groupby_gpu_id_and_shuffle_values( - major_comm, - tmp_minors.begin(), - tmp_minors.end(), - [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { - auto comm_rank = gpu_id_func(v); - return partition_manager::compute_major_comm_rank_from_global_comm_rank( - major_comm_size, minor_comm_size, comm_rank); - }, - handle.get_stream()); + { + auto edge_count_vectors = num_bins > 1 + ? 
std::make_optional>>( + edgelist_minors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_count_vectors) { + for (size_t i = 0; i < edgelist_minors.size(); ++i) { + rmm::device_uvector d_edge_counts(num_bins, handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), d_edge_counts.begin(), d_edge_counts.end(), edge_t{0}); + thrust::for_each(handle.get_thrust_policy(), + edgelist_minors[i], + edgelist_minors[i] + edgelist_edge_counts[i], + [counts = raft::device_span( + d_edge_counts.data(), d_edge_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_count_vectors)[i].data(), + d_edge_counts.data(), + d_edge_counts.size(), + handle.get_stream()); + } + handle.sync_stream(); + } + + for (size_t i = 0; i < num_bins; ++i) { + edge_t bin_size{0}; + if (edge_count_vectors) { + for (size_t j = 0; j < edgelist_minors.size(); ++j) { + bin_size += (*edge_count_vectors)[j][i]; + } + } else { + bin_size = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); + } + rmm::device_uvector tmp_minors(bin_size, handle.get_stream()); + edge_t offset{0}; + for (size_t j = 0; j < edgelist_minors.size(); ++j) { + if (num_bins > 1) { + thrust::copy_if(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin() + offset, + [i] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + return (static_cast(hash_func(v) % num_bins) == i); + }); + offset += (*edge_count_vectors)[j][i]; + } else { + thrust::copy(handle.get_thrust_policy(), + edgelist_minors[j], + edgelist_minors[j] + edgelist_edge_counts[j], + tmp_minors.begin() + offset); + offset += edgelist_edge_counts[j]; + } + } thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); tmp_minors.resize( thrust::distance( @@ -431,30 +444,60 @@ std::tuple, std::vector, vertex_t> compu thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), handle.get_stream()); tmp_minors.shrink_to_fit(handle.get_stream()); - } - if (i == 0) { - sorted_unique_minors = std::move(tmp_minors); - } else { - rmm::device_uvector merged_minors(sorted_unique_minors.size() + tmp_minors.size(), - handle.get_stream()); - thrust::merge(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end(), - tmp_minors.begin(), - tmp_minors.end(), - merged_minors.begin()); - sorted_unique_minors = std::move(merged_minors); - tmp_minors.resize(0, handle.get_stream()); - tmp_minors.shrink_to_fit(handle.get_stream()); - sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end())), - handle.get_stream()); - sorted_unique_minors.shrink_to_fit(handle.get_stream()); + if constexpr (multi_gpu) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + compute_gpu_id_from_ext_vertex_t gpu_id_func{ + comm_size, major_comm_size, minor_comm_size}; + std::tie(tmp_minors, std::ignore) = 
groupby_gpu_id_and_shuffle_values( + major_comm, + tmp_minors.begin(), + tmp_minors.end(), + [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { + auto comm_rank = gpu_id_func(v); + return partition_manager::compute_major_comm_rank_from_global_comm_rank( + major_comm_size, minor_comm_size, comm_rank); + }, + handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + } + + if (i == 0) { + sorted_unique_minors = std::move(tmp_minors); + } else { + rmm::device_uvector merged_minors( + sorted_unique_minors.size() + tmp_minors.size(), handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + sorted_unique_minors.begin(), + sorted_unique_minors.end(), + tmp_minors.begin(), + tmp_minors.end(), + merged_minors.begin()); + sorted_unique_minors = std::move(merged_minors); + tmp_minors.resize(0, handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + sorted_unique_minors.begin(), + sorted_unique_minors.end())), + handle.get_stream()); + sorted_unique_minors.shrink_to_fit(handle.get_stream()); + } } } + sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), handle.get_stream()); thrust::merge(handle.get_thrust_policy(), @@ -492,14 +535,12 @@ std::tuple, std::vector, vertex_t> compu // 3. compute global degrees for the sorted local vertices rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); - std::optional> stream_pool_indices{ - std::nullopt}; // FIXME: move this inside the if statement auto constexpr num_chunks = size_t{ - 2}; // tuning parameter, this trade-offs # binary searches (up to num_chunks times more binary - // searches can be necessary if num_unique_majors << edgelist_edge_counts[i]) and temporary - // buffer requirement (cut by num_chunks times), currently set to 2 to avoid peak memory - // usage happening in this part (especially when minor_comm_size is small) + 2}; // tuning parameter, this trade-offs # binary searches (up to num_chunks times more + // binary searches can be necessary if num_unique_majors << edgelist_edge_counts[i]) and + // temporary buffer requirement (cut by num_chunks times), currently set to 2 to avoid + // peak memory usage happening in this part (especially when minor_comm_size is small) if constexpr (multi_gpu) { auto& comm = handle.get_comms(); @@ -512,6 +553,8 @@ std::tuple, std::vector, vertex_t> compu auto edge_partition_major_range_sizes = host_scalar_allgather(minor_comm, sorted_local_vertices.size(), handle.get_stream()); + std::optional> stream_pool_indices{std::nullopt}; + if ((minor_comm_size >= 2) && (handle.get_stream_pool_size() >= 2)) { edge_t num_local_edges = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); @@ -664,7 +707,7 @@ std::tuple, std::vector, vertex_t> compu } } - // 4. sort local vertices by degree (descending) + // 5. sort local vertices by degree (descending) thrust::sort_by_key(handle.get_thrust_policy(), sorted_local_vertex_degrees.begin(), @@ -672,7 +715,7 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertices.begin(), thrust::greater()); - // 5. compute segment_offsets + // 6. 
compute segment_offsets static_assert(detail::num_sparse_segments_per_vertex_partition == 3); static_assert((detail::low_degree_threshold <= detail::mid_degree_threshold) && From 28641a60ae47c35148758c1cabd0c3541eb5ca67 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 3 Sep 2024 12:12:44 -0700 Subject: [PATCH 060/126] update detail::extract_transform_v_frontier_e to use multiple CUDA streams --- .../detail/extract_transform_v_frontier_e.cuh | 142 ++++++++++++++---- 1 file changed, 110 insertions(+), 32 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 1557b378bd9..beb4f5f7d68 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -637,9 +637,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } - // compute max_pushes + // compute local max_pushes - size_t max_pushes{}; + size_t local_max_pushes{}; { size_t partition_idx{}; if constexpr (GraphViewType::is_multi_gpu) { @@ -656,7 +656,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, thrust_tuple_get_or_identity(frontier_key_last); // for an edge-masked graph, we can pass edge mask to compute tighter bound (at the expense of // additional computing) - max_pushes = edge_partition.compute_number_of_edges( + local_max_pushes = edge_partition.compute_number_of_edges( frontier_major_first, frontier_major_last, handle.get_stream()); } @@ -706,7 +706,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, handle.get_stream()); auto aggregate_max_pushes = host_scalar_allreduce( comm, - max_pushes, + local_max_pushes, raft::comms::op_t::SUM, handle.get_stream()); // this is approximate as we only consider local edges for // [frontier_key_first, frontier_key_last), note that neighbor lists @@ -768,6 +768,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto time1 = std::chrono::steady_clock::now(); #endif for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime0 = std::chrono::steady_clock::now(); +#endif auto loop_count = std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); @@ -775,36 +778,26 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::vector>, std::byte /* dummy */> edge_partition_key_buffers{}; - if constexpr (GraphViewType::is_multi_gpu) { edge_partition_key_buffers.reserve(loop_count); } - std::vector>> key_segment_offset_vectors{}; - key_segment_offset_vectors.reserve(loop_count); - std::vector> output_key_buffers{}; - output_key_buffers.reserve(loop_count); - std::vector> output_value_buffers{}; - output_value_buffers.reserve(loop_count); - std::vector> output_buffer_idx_scalars{}; - output_buffer_idx_scalars.reserve(loop_count); - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if constexpr (GraphViewType::is_multi_gpu) { + edge_partition_key_buffers.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); - auto edge_partition_frontier_key_first = frontier_key_first; - auto edge_partition_frontier_key_last = frontier_key_last; - if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { auto edge_partition_key_buffer = - allocate_dataframe_buffer(local_frontier_sizes[partition_idx], loop_stream); + allocate_dataframe_buffer(minor_comm_size > 1 ? local_frontier_sizes[partition_idx] : size_t{0}, loop_stream); + if (minor_comm_size > 1) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if constexpr (try_bitmap) { std::variant, decltype(frontier_key_first)> v_list{}; if (use_bitmap_flags[partition_idx]) { @@ -834,7 +827,44 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, static_cast(partition_idx), loop_stream); } + } edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); + } + } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime1 = std::chrono::steady_clock::now(); +#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime2 = std::chrono::steady_clock::now(); +#endif + + std::vector>> key_segment_offset_vectors{}; + key_segment_offset_vectors.reserve(loop_count); + std::vector> output_key_buffers{}; + output_key_buffers.reserve(loop_count); + std::vector> output_value_buffers{}; + output_value_buffers.reserve(loop_count); + std::vector> output_buffer_idx_scalars{}; + output_buffer_idx_scalars.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i * num_concurrent_loops + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto edge_partition_frontier_key_first = frontier_key_first; + auto edge_partition_frontier_key_last = frontier_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + if (minor_comm_size > 1) { edge_partition_frontier_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); edge_partition_frontier_key_last = @@ -851,6 +881,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::optional> key_segment_offsets{std::nullopt}; if (segment_offsets) { + // FIXME: comapute_key_segment_offstes() implicitly synchronizes to copy the results to host key_segment_offsets = compute_key_segment_offsets( edge_partition_frontier_major_first, edge_partition_frontier_major_last, @@ -860,11 +891,12 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } key_segment_offset_vectors.push_back(std::move(key_segment_offsets)); - size_t edge_partition_max_pushes = max_pushes; + size_t edge_partition_max_pushes = local_max_pushes; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); if (static_cast(partition_idx) != minor_comm_rank) { + // FIXME: compute_number_of_edges() implicitly synchronizes to copy the results to host edge_partition_max_pushes = edge_partition.compute_number_of_edges( edge_partition_frontier_major_first, edge_partition_frontier_major_last, loop_stream); } @@ -876,7 +908,13 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, allocate_optional_dataframe_buffer(edge_partition_max_pushes, loop_stream)); output_buffer_idx_scalars.push_back(rmm::device_scalar(size_t{0}, loop_stream)); } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime3 = std::chrono::steady_clock::now(); +#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime4 = std::chrono::steady_clock::now(); +#endif for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i * num_concurrent_loops + j; @@ -1043,18 +1081,40 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime5 = std::chrono::steady_clock::now(); +#endif + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime6 = std::chrono::steady_clock::now(); +#endif + + std::vector tmp_buffer_sizes(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; + // FIXME: tmp_buffer_idx.value() implicitly synchronizes to copy the results to host + tmp_buffer_sizes[j] = tmp_buffer_idx.value(loop_stream); + } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime7 = std::chrono::steady_clock::now(); +#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime8 = std::chrono::steady_clock::now(); +#endif for (size_t j = 0; j < loop_count; ++j) { auto loop_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); + auto tmp_buffer_size = tmp_buffer_sizes[j]; auto& tmp_key_buffer = output_key_buffers[j]; auto& tmp_value_buffer = output_value_buffers[j]; - auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; - - auto tmp_buffer_size = tmp_buffer_idx.value(loop_stream); resize_optional_dataframe_buffer(tmp_key_buffer, tmp_buffer_size, loop_stream); shrink_to_fit_optional_dataframe_buffer(tmp_key_buffer, loop_stream); @@ -1066,7 +1126,24 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, key_buffers.push_back(std::move(tmp_key_buffer)); value_buffers.push_back(std::move(tmp_value_buffer)); } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime9 = std::chrono::steady_clock::now(); +#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime10 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = subtime1 - subtime0; + std::chrono::duration subdur1 = subtime2 - subtime1; + std::chrono::duration subdur2 = subtime3 - subtime2; + std::chrono::duration subdur3 = subtime4 - subtime3; + std::chrono::duration subdur4 = subtime5 - subtime4; + std::chrono::duration subdur5 = subtime6 - subtime5; + std::chrono::duration subdur6 = subtime7 - subtime6; + std::chrono::duration subdur7 = subtime8 - subtime7; + std::chrono::duration subdur8 = subtime9 - subtime8; + std::chrono::duration subdur9 = subtime10 - subtime9; + std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << ")" << std::endl; +#endif } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -1097,6 +1174,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::vector buffer_displacements(buffer_sizes.size()); std::exclusive_scan( buffer_sizes.begin(), buffer_sizes.end(), buffer_displacements.begin(), size_t{0}); + // FIXME: this copy can be performed in multiple streams for (size_t i = 0; i < key_buffers.size(); ++i) { if constexpr (!std::is_same_v) { thrust::copy( From 05df778b138c216664a8427719cd9ff6459c3ffc Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 3 Sep 2024 15:15:41 -0700 Subject: [PATCH 061/126] exec_policy=>exec_policy_nosync --- .../cugraph/edge_partition_device_view.cuh | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 583b0a37214..6c1ede94a5b 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ 
b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -255,7 +255,7 @@ class edge_partition_device_view_t local_degrees(this->major_range_size(), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -266,7 +266,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -284,7 +284,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); if (dcs_nzd_vertices_) { assert(major_hypersparse_first_); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -295,7 +295,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -368,7 +368,7 @@ class edge_partition_device_view_tmajor_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -394,7 +394,7 @@ class edge_partition_device_view_t compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -595,7 +595,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), @@ -638,7 +638,7 @@ class edge_partition_device_view_t local_degrees(this->major_range_size(), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(this->major_range_first()), thrust::make_counting_iterator(this->major_range_last()), local_degrees.begin(), @@ -660,7 +660,7 @@ class edge_partition_device_view_t local_degrees(thrust::distance(major_first, major_last), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), major_first, major_last, local_degrees.begin(), From 5c4e3bd7c005f2bc7655aa304df0e3a4a999e6af Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 3 Sep 2024 19:36:04 -0700 Subject: [PATCH 062/126] performance-tweak detail::extract_transform_v_frontier_e --- .../detail/extract_transform_v_frontier_e.cuh | 185 ++++++++++-------- .../prims/detail/per_v_transform_reduce_e.cuh | 7 +- 2 files changed, 110 insertions(+), 82 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index beb4f5f7d68..1caaa0c2181 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -596,20 +596,23 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, frontier_key_last = get_dataframe_buffer_end(frontier_keys); } - { // drop zero degree vertices + std::optional> key_segment_offsets{std::nullopt}; + { // drop zero degree vertices & compute key_segment_offsets size_t partition_idx{0}; if constexpr 
(GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); partition_idx = static_cast(minor_comm.get_rank()); } auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { - frontier_key_last = compute_key_lower_bound( + key_segment_offsets = compute_key_segment_offsets( frontier_key_first, frontier_key_last, - graph_view.local_vertex_partition_range_first() + *((*segment_offsets).rbegin() + 1), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + frontier_key_last = frontier_key_first + (*key_segment_offsets).back(); } } @@ -664,9 +667,11 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::vector local_frontier_sizes{}; std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; + std::optional>> key_segment_offset_vectors{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - local_frontier_sizes = host_scalar_allgather( + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + local_frontier_sizes = host_scalar_allgather( minor_comm, static_cast(thrust::distance(frontier_key_first, frontier_key_last)), handle.get_stream()); @@ -678,9 +683,44 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, return flag == uint8_t{1}; }); } + if (key_segment_offsets) { + rmm::device_uvector d_key_segment_offsets((*key_segment_offsets).size(), + handle.get_stream()); + raft::update_device(d_key_segment_offsets.data(), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + rmm::device_uvector d_aggregate_key_segment_offsets( + minor_comm_size * d_key_segment_offsets.size(), handle.get_stream()); + std::vector rx_counts(minor_comm_size, d_key_segment_offsets.size()); + std::vector rx_displacements(minor_comm_size); + std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); + device_allgatherv(minor_comm, + d_key_segment_offsets.data(), + d_aggregate_key_segment_offsets.data(), + rx_counts, + rx_displacements, + handle.get_stream()); + std::vector h_aggregate_key_segment_offsets(d_aggregate_key_segment_offsets.size()); + raft::update_host(h_aggregate_key_segment_offsets.data(), + d_aggregate_key_segment_offsets.data(), + d_aggregate_key_segment_offsets.size(), + handle.get_stream()); + handle.sync_stream(); + key_segment_offset_vectors = std::vector>(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + (*key_segment_offset_vectors)[i] = std::vector( + h_aggregate_key_segment_offsets.begin() + i * (*key_segment_offsets).size(), + h_aggregate_key_segment_offsets.begin() + (i + 1) * (*key_segment_offsets).size()); + } + } } else { local_frontier_sizes = std::vector{static_cast( static_cast(thrust::distance(frontier_key_first, frontier_key_last)))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + (*key_segment_offset_vectors)[0] = *key_segment_offsets; + } } // set-up stream ppol @@ -790,13 +830,13 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - auto edge_partition_key_buffer = - 
allocate_dataframe_buffer(minor_comm_size > 1 ? local_frontier_sizes[partition_idx] : size_t{0}, loop_stream); - if (minor_comm_size > 1) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + auto edge_partition_key_buffer = allocate_dataframe_buffer( + minor_comm_size > 1 ? local_frontier_sizes[partition_idx] : size_t{0}, loop_stream); + if (size_dataframe_buffer(edge_partition_key_buffer) > 0) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); if constexpr (try_bitmap) { std::variant, decltype(frontier_key_first)> v_list{}; @@ -828,19 +868,17 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, loop_stream); } } - edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); + edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); } } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime1 = std::chrono::steady_clock::now(); #endif - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } auto subtime2 = std::chrono::steady_clock::now(); #endif - std::vector>> key_segment_offset_vectors{}; - key_segment_offset_vectors.reserve(loop_count); std::vector> output_key_buffers{}; output_key_buffers.reserve(loop_count); std::vector> output_value_buffers{}; @@ -856,49 +894,30 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(partition_idx)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - auto edge_partition_frontier_key_first = frontier_key_first; - auto edge_partition_frontier_key_last = frontier_key_last; + auto edge_partition_max_pushes = local_max_pushes; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - edge_partition_frontier_key_first = - get_dataframe_buffer_begin(edge_partition_key_buffers[j]); - edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_key_buffers[j]); - } - } - - auto edge_partition_frontier_major_first = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_first); - auto edge_partition_frontier_major_last = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_last); - - std::optional> key_segment_offsets{std::nullopt}; - if (segment_offsets) { - // FIXME: comapute_key_segment_offstes() implicitly synchronizes to copy the results to host - key_segment_offsets = compute_key_segment_offsets( - edge_partition_frontier_major_first, - edge_partition_frontier_major_last, - raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), - edge_partition.major_range_first(), - loop_stream); - } - key_segment_offset_vectors.push_back(std::move(key_segment_offsets)); - - size_t edge_partition_max_pushes = local_max_pushes; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const 
minor_comm_rank = minor_comm.get_rank(); - if (static_cast(partition_idx) != minor_comm_rank) { - // FIXME: compute_number_of_edges() implicitly synchronizes to copy the results to host - edge_partition_max_pushes = edge_partition.compute_number_of_edges( - edge_partition_frontier_major_first, edge_partition_frontier_major_last, loop_stream); + if (static_cast(partition_idx) != minor_comm_rank) { + auto edge_partition_frontier_key_first = + get_dataframe_buffer_begin(edge_partition_key_buffers[j]); + auto edge_partition_frontier_key_last = + get_dataframe_buffer_end(edge_partition_key_buffers[j]); + auto edge_partition_frontier_major_first = + thrust_tuple_get_or_identity( + edge_partition_frontier_key_first); + auto edge_partition_frontier_major_last = + thrust_tuple_get_or_identity( + edge_partition_frontier_key_last); + edge_partition_max_pushes = edge_partition.compute_number_of_edges( + edge_partition_frontier_major_first, edge_partition_frontier_major_last, loop_stream); + // FIXME: compute_number_of_edges() implicitly synchronizes to copy the results to host + edge_partition_max_pushes = edge_partition.compute_number_of_edges( + edge_partition_frontier_major_first, edge_partition_frontier_major_last, loop_stream); + } } } @@ -946,8 +965,6 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } - auto const& key_segment_offsets = key_segment_offset_vectors[j]; - auto& tmp_key_buffer = output_key_buffers[j]; auto& tmp_value_buffer = output_value_buffers[j]; auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; @@ -968,20 +985,22 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, partition_idx); - if (key_segment_offsets) { - if ((*key_segment_offsets)[1] > 0) { + if (key_segment_offset_vectors) { + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + if (key_segment_offsets[1] > 0) { auto exec_stream = edge_partition_stream_pool_indices ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) : handle.get_stream(); - raft::grid_1d_block_t update_grid((*key_segment_offsets)[1], + raft::grid_1d_block_t update_grid(key_segment_offsets[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_high_degree <<>>( edge_partition, edge_partition_frontier_key_first, - edge_partition_frontier_key_first + (*key_segment_offsets)[1], + edge_partition_frontier_key_first + key_segment_offsets[1], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -991,19 +1010,19 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, tmp_buffer_idx.data(), e_op); } - if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + if (key_segment_offsets[2] - key_segment_offsets[1] > 0) { auto exec_stream = edge_partition_stream_pool_indices ? 
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]) : handle.get_stream(); - raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + raft::grid_1d_warp_t update_grid(key_segment_offsets[2] - key_segment_offsets[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_mid_degree <<>>( edge_partition, - edge_partition_frontier_key_first + (*key_segment_offsets)[1], - edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_frontier_key_first + key_segment_offsets[1], + edge_partition_frontier_key_first + key_segment_offsets[2], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1013,19 +1032,19 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, tmp_buffer_idx.data(), e_op); } - if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { + if (key_segment_offsets[3] - key_segment_offsets[2] > 0) { auto exec_stream = edge_partition_stream_pool_indices ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]) : handle.get_stream(); - raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + raft::grid_1d_thread_t update_grid(key_segment_offsets[3] - key_segment_offsets[2], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, - edge_partition_frontier_key_first + (*key_segment_offsets)[2], - edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_frontier_key_first + key_segment_offsets[2], + edge_partition_frontier_key_first + key_segment_offsets[3], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1036,19 +1055,19 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, e_op); } if (edge_partition.dcs_nzd_vertex_count() && - ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0)) { + (key_segment_offsets[4] - key_segment_offsets[3] > 0)) { auto exec_stream = edge_partition_stream_pool_indices ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]) : handle.get_stream(); - raft::grid_1d_thread_t update_grid((*key_segment_offsets)[4] - (*key_segment_offsets)[3], + raft::grid_1d_thread_t update_grid(key_segment_offsets[4] - key_segment_offsets[3], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_hypersparse_or_low_degree <<>>( edge_partition, - edge_partition_frontier_key_first + (*key_segment_offsets)[3], - edge_partition_frontier_key_first + (*key_segment_offsets)[4], + edge_partition_frontier_key_first + key_segment_offsets[3], + edge_partition_frontier_key_first + key_segment_offsets[4], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1095,15 +1114,15 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; + auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; // FIXME: tmp_buffer_idx.value() implicitly synchronizes to copy the results to host tmp_buffer_sizes[j] = tmp_buffer_idx.value(loop_stream); } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime7 = std::chrono::steady_clock::now(); #endif - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } auto subtime8 = std::chrono::steady_clock::now(); #endif @@ -1112,7 +1131,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - auto tmp_buffer_size = tmp_buffer_sizes[j]; + auto tmp_buffer_size = tmp_buffer_sizes[j]; + if (tmp_buffer_size > 0) { auto& tmp_key_buffer = output_key_buffers[j]; auto& tmp_value_buffer = output_value_buffers[j]; @@ -1125,13 +1145,14 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, key_buffers.push_back(std::move(tmp_key_buffer)); value_buffers.push_back(std::move(tmp_value_buffer)); + } } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime9 = std::chrono::steady_clock::now(); #endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime10 = std::chrono::steady_clock::now(); + auto subtime10 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; std::chrono::duration subdur1 = subtime2 - subtime1; std::chrono::duration subdur2 = subtime3 - subtime2; @@ -1142,7 +1163,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration subdur7 = subtime8 - subtime7; std::chrono::duration subdur8 = subtime9 - subtime8; std::chrono::duration subdur9 = subtime10 - subtime9; - std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << ")" << std::endl; + std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() + << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," + << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," + << subdur9.count() << ")" << std::endl; #endif } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -1154,7 +1178,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto key_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); auto value_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); - if (key_buffers.size() == 1) { + if (key_buffers.size() == 0) { + /* nothing to do */ + } + else if (key_buffers.size() == 1) { key_buffer = std::move(key_buffers[0]); value_buffer = std::move(value_buffers[0]); } else { diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index b4c2a663184..cbd3e2ff054 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1744,7 +1744,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto 
edge_partition_key_buffer = allocate_dataframe_buffer( minor_comm_size > 1 ? local_key_list_sizes[partition_idx] : size_t{0}, loop_stream); - if (minor_comm_size > 1) { + if (size_dataframe_buffer(edge_partition_key_buffer) > 0) { if constexpr (try_bitmap) { auto edge_partition = edge_partition_device_view_t( @@ -1787,8 +1787,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime1 = std::chrono::steady_clock::now(); #endif - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } auto subtime2 = std::chrono::steady_clock::now(); #endif @@ -1826,7 +1826,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // FIXME: compute_key_segment_offsets implicitly synchronizes to copy the results to host + // FIXME: compute_key_segment_offsets() implicitly synchronizes to copy the results to host key_segment_offsets = compute_key_segment_offsets( edge_partition_key_first, edge_partition_key_last, @@ -2203,6 +2203,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto& values = edge_partition_values[j]; + // FIXME: this if-else might be unnecessary if (minor_comm_rank == static_cast(partition_idx)) { device_gatherv(minor_comm, get_dataframe_buffer_begin(values), From dc44a7d8a24202cf8ecf975503c0294a920b8937 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 4 Sep 2024 11:51:27 -0700 Subject: [PATCH 063/126] update comments --- cpp/src/prims/fill_edge_src_dst_property.cuh | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 5f3fbbf8ede..46a12555eca 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -335,7 +335,11 @@ void fill_edge_minor_property(raft::handle_t const& handle, }); raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); handle.sync_stream(); - v_list_range[0] -= (v_list_range[0] - minor_range_first) % packed_bools_per_word(); + v_list_range[0] -= + (v_list_range[0] - minor_range_first) % + packed_bools_per_word(); // to perform bitwise AND|OR in word granularity (if edge minor + // property value type is packed bool && + // !edge_partition_keys.has_value() && v_list_bitmap.has_value()) } auto v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, From ebcbfb7bfbaacca38eb7932f583be7208d4811ad Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 4 Sep 2024 11:52:04 -0700 Subject: [PATCH 064/126] improve stream concurrency --- .../prims/detail/per_v_transform_reduce_e.cuh | 230 ++++++++++-------- 1 file changed, 126 insertions(+), 104 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index cbd3e2ff054..0719d9a8df3 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1078,7 +1078,7 @@ void per_v_transform_reduce_e_edge_partition( T major_identity_element, ReduceOp reduce_op, PredOp pred_op, - std::optional> const& key_segment_offsets, + std::optional> key_segment_offsets, std::optional> const& edge_partition_stream_pool_indices) { constexpr bool use_input_key = !std::is_same_v; @@ -1362,7 +1362,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, [[maybe_unused]] 
constexpr auto max_segments = detail::num_sparse_segments_per_vertex_partition + size_t{1}; - // 1. exclude zero degree keys + // 1. drop zero degree keys & compute key_segment_offsets std::optional> local_vertex_partition_segment_offsets{std::nullopt}; { @@ -1376,15 +1376,20 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, graph_view.local_edge_partition_segment_offsets(partition_idx); } + std::conditional_t>, std::byte /* dummy */> + key_segment_offsets{}; auto sorted_unique_nzd_key_last = sorted_unique_key_last; if constexpr (use_input_key) { if (local_vertex_partition_segment_offsets) { - auto sorted_uniue_nzd_key_last = - compute_key_lower_bound(sorted_unique_key_first, - sorted_unique_key_last, - graph_view.local_vertex_partition_range_first() + - *((*local_vertex_partition_segment_offsets).rbegin() + 1), - handle.get_stream()); + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + sorted_unique_nzd_key_last = sorted_unique_key_first + (*key_segment_offsets).back(); } } @@ -1419,7 +1424,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // 3. filter input keys + // 3. filter input keys & update key_segment_offsets auto edge_mask_view = graph_view.edge_mask_view(); @@ -1455,25 +1460,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, size_t{0}); } - std::optional> key_segment_offsets{std::nullopt}; - if (local_vertex_partition_segment_offsets) { - if constexpr (use_input_key) { - key_segment_offsets = compute_key_segment_offsets( - sorted_unique_key_first, - sorted_unique_nzd_key_last, - raft::host_span((*local_vertex_partition_segment_offsets).data(), - (*local_vertex_partition_segment_offsets).size()), - edge_partition.major_range_first(), - handle.get_stream()); - } else { - key_segment_offsets = std::vector((*local_vertex_partition_segment_offsets).size()); - std::transform((*local_vertex_partition_segment_offsets).begin(), - (*local_vertex_partition_segment_offsets).end(), - (*key_segment_offsets).begin(), - [](vertex_t offset) { return static_cast(offset); }); - } - } - if (edge_partition_stream_pool_indices) { handle.sync_stream(); } edge_partition_src_input_device_view_t edge_partition_src_value_input{}; @@ -1505,7 +1491,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, init, reduce_op, pred_op, - key_segment_offsets, + key_segment_offsets ? std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, edge_partition_stream_pool_indices ? 
std::make_optional>( (*edge_partition_stream_pool_indices).data(), (*edge_partition_stream_pool_indices).size()) @@ -1540,6 +1528,18 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, sorted_unique_nzd_key_last = get_optional_dataframe_buffer_end(tmp_key_buffer); tmp_vertex_value_output_first = thrust::make_permutation_iterator( vertex_value_output_first, get_optional_dataframe_buffer_begin(tmp_output_indices)); + + if (key_segment_offsets) { + key_segment_offsets = compute_key_segment_offsets( + sorted_unique_key_first, + sorted_unique_nzd_key_last, + raft::host_span((*local_vertex_partition_segment_offsets).data(), + (*local_vertex_partition_segment_offsets).size()), + edge_partition.major_range_first(), + handle.get_stream()); + assert((*key_segment_offsets).back() == *((*key_segment_offsets).rbegin() + 1)); + assert(sorted_uniue_nzd_key_last == sorted_unique_key_first + (*key_segment_offsets).back()); + } } else { tmp_vertex_value_output_first = vertex_value_output_first; } @@ -1576,52 +1576,98 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std:: conditional_t>, std::byte /* dummy */> key_list_bitmap{}; + std::conditional_t, std::byte /* dummy */> v_list_range{}; if constexpr (try_bitmap) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); + + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_key_first, v_list_size = static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))]__device__(size_t i) { + return (i == 0) ? *sorted_unique_key_first : (*(sorted_unique_key_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + if (minor_comm_size > 1) { - size_t bool_size = - local_vertex_partition_segment_offsets - ? *((*local_vertex_partition_segment_offsets).rbegin() + 1) - : graph_view - .local_vertex_partition_range_size(); // FIXME: if filtered, we can reduce bool_size - - // FIXME: *sorted_unique_nzd_key_last - *sorted_unique_key_first could be smaller than - // bool_size by a non-negligible amount key_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, sorted_unique_nzd_key_last, - graph_view.local_vertex_partition_range_first(), - graph_view.local_vertex_partition_range_first() + bool_size, + v_list_range[0], + v_list_range[1], handle.get_stream()); } } - // 6. collect local_key_list_sizes & use_bitmap_flags + // 6. 
collect local_key_list_sizes & use_bitmap_flags & key_segment_offsets std::conditional_t, std::byte /* dummy */> local_key_list_sizes{}; + std::conditional_t, std::byte /* dummy */> local_key_list_range_firsts{}; + std::conditional_t, std::byte /* dummy */> local_key_list_range_lasts{}; std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - if constexpr (use_input_key) { - local_key_list_sizes = host_scalar_allgather( + std::conditional_t>>, + std::byte /* dummy */> + key_segment_offset_vectors{}; + if constexpr (use_input_key) { + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + local_key_list_sizes = host_scalar_allgather( minor_comm, static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), handle.get_stream()); - } - if constexpr (try_bitmap) { - auto tmp_flags = host_scalar_allgather( - minor_comm, key_list_bitmap ? uint8_t{1} : uint8_t{0}, handle.get_stream()); - use_bitmap_flags.resize(tmp_flags.size()); - std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { - return flag == uint8_t{1}; - }); - } - } else { - if constexpr (use_input_key) { + if constexpr (try_bitmap) { + local_key_list_range_firsts = host_scalar_allgather(minor_comm, v_list_range[0], handle.get_stream()); + local_key_list_range_lasts = host_scalar_allgather(minor_comm, v_list_range[1], handle.get_stream()); + auto tmp_flags = host_scalar_allgather( + minor_comm, key_list_bitmap ? uint8_t{1} : uint8_t{0}, handle.get_stream()); + use_bitmap_flags.resize(tmp_flags.size()); + std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { + return flag == uint8_t{1}; + }); + } + if (key_segment_offsets) { + rmm::device_uvector d_key_segment_offsets((*key_segment_offsets).size(), + handle.get_stream()); + raft::update_device(d_key_segment_offsets.data(), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + rmm::device_uvector d_aggregate_key_segment_offsets( + minor_comm_size * d_key_segment_offsets.size(), handle.get_stream()); + std::vector rx_counts(minor_comm_size, d_key_segment_offsets.size()); + std::vector rx_displacements(minor_comm_size); + std::exclusive_scan( + rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); + device_allgatherv(minor_comm, + d_key_segment_offsets.data(), + d_aggregate_key_segment_offsets.data(), + rx_counts, + rx_displacements, + handle.get_stream()); + std::vector h_aggregate_key_segment_offsets(d_aggregate_key_segment_offsets.size()); + raft::update_host(h_aggregate_key_segment_offsets.data(), + d_aggregate_key_segment_offsets.data(), + d_aggregate_key_segment_offsets.size(), + handle.get_stream()); + handle.sync_stream(); + key_segment_offset_vectors = std::vector>(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + (*key_segment_offset_vectors)[i] = std::vector( + h_aggregate_key_segment_offsets.begin() + i * (*key_segment_offsets).size(), + h_aggregate_key_segment_offsets.begin() + (i + 1) * (*key_segment_offsets).size()); + } + } + } else { local_key_list_sizes = std::vector{ static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>(1); + 
(*key_segment_offset_vectors)[0] = *key_segment_offsets; + } } } @@ -1746,11 +1792,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, minor_comm_size > 1 ? local_key_list_sizes[partition_idx] : size_t{0}, loop_stream); if (size_dataframe_buffer(edge_partition_key_buffer) > 0) { if constexpr (try_bitmap) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - std::variant, decltype(sorted_unique_key_first)> v_list{}; if (use_bitmap_flags[partition_idx]) { @@ -1762,13 +1803,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } else { v_list = sorted_unique_key_first; } - auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) - : edge_partition.major_range_size(); device_bcast_vertex_list(minor_comm, v_list, get_dataframe_buffer_begin(edge_partition_key_buffer), - edge_partition.major_range_first(), - edge_partition.major_range_first() + bool_size, + local_key_list_range_firsts[partition_idx], + local_key_list_range_lasts[partition_idx], local_key_list_sizes[partition_idx], static_cast(partition_idx), loop_stream); @@ -1792,8 +1831,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto subtime2 = std::chrono::steady_clock::now(); #endif - std::vector>> key_segment_offset_vectors{}; - key_segment_offset_vectors.reserve(loop_count); std::conditional_t>, std::byte /* dummy */> @@ -1807,47 +1844,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); - - std::optional> key_segment_offsets{std::nullopt}; - if (segment_offsets) { - if constexpr (use_input_key) { - auto edge_partition_key_first = sorted_unique_key_first; - auto edge_partition_key_last = sorted_unique_nzd_key_last; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); - edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffers[j]); - } - } - - // FIXME: compute_key_segment_offsets() implicitly synchronizes to copy the results to host - key_segment_offsets = compute_key_segment_offsets( - edge_partition_key_first, - edge_partition_key_last, - raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), - edge_partition.major_range_first(), - loop_stream); - } else { - key_segment_offsets = std::vector((*segment_offsets).size()); - std::transform((*segment_offsets).begin(), - (*segment_offsets).end(), - (*key_segment_offsets).begin(), - [](vertex_t offset) { return static_cast(offset); }); - } - } - key_segment_offset_vectors.push_back(std::move(key_segment_offsets)); - if constexpr (GraphViewType::is_multi_gpu && update_major) { size_t buffer_size{0}; if constexpr (use_input_key) { buffer_size = local_key_list_sizes[partition_idx]; } else { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + buffer_size = 
segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ : edge_partition.major_range_size(); @@ -1915,7 +1921,21 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - auto const& key_segment_offsets = key_segment_offset_vectors[j]; + std::optional> key_segment_offsets{std::nullopt}; + if constexpr (use_input_key) { + if (key_segment_offset_vectors) { + key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + } + } else { + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); + } + } edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; @@ -1973,7 +1993,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, major_identity_element, reduce_op, pred_op, - key_segment_offsets, + key_segment_offsets ? std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, edge_partition_stream_pool_indices); } } From 3652c33a19644d993150436f432800fbb1beaec8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 4 Sep 2024 18:57:07 -0700 Subject: [PATCH 065/126] update copy_if_nosync to take a pointer to store the counter --- cpp/src/prims/detail/multi_stream_utils.cuh | 6 +-- .../prims/detail/per_v_transform_reduce_e.cuh | 46 ++++++++++++------- cpp/src/prims/vertex_frontier.cuh | 2 + 3 files changed, 34 insertions(+), 20 deletions(-) diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh index 2f03a22bff5..7a370d5e49f 100644 --- a/cpp/src/prims/detail/multi_stream_utils.cuh +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -62,6 +62,7 @@ void copy_if_nosync(InputIterator input_first, InputIterator input_last, FlagIterator flag_first, OutputIterator output_first, + raft::device_span count /* size = 1 */, rmm::cuda_stream_view stream_view) { CUGRAPH_EXPECTS( @@ -72,14 +73,13 @@ void copy_if_nosync(InputIterator input_first, size_t tmp_storage_bytes{0}; size_t input_size = static_cast(thrust::distance(input_first, input_last)); - rmm::device_scalar num_copied(stream_view); cub::DeviceSelect::Flagged(static_cast(nullptr), tmp_storage_bytes, input_first, flag_first, output_first, - num_copied.data(), + count.data(), input_size, stream_view); @@ -90,7 +90,7 @@ void copy_if_nosync(InputIterator input_first, input_first, flag_first, output_first, - num_copied.data(), + count.data(), input_size, stream_view); } diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 0719d9a8df3..bc15c39a4dc 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -54,6 +54,7 @@ #include #include #include +#include #include #include #include @@ -1582,21 +1583,24 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const minor_comm_size = minor_comm.get_size(); rmm::device_uvector tmps(2, handle.get_stream()); - thrust::tabulate(handle.get_thrust_policy(), - tmps.begin(), - tmps.end(), - [sorted_unique_key_first, v_list_size = static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))]__device__(size_t i) { - return (i == 0) ? 
*sorted_unique_key_first : (*(sorted_unique_key_first + (v_list_size - 1)) + 1); - }); - raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_key_first, + v_list_size = static_cast(thrust::distance( + sorted_unique_key_first, sorted_unique_nzd_key_last))] __device__(size_t i) { + return (i == 0) ? *sorted_unique_key_first + : (*(sorted_unique_key_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); if (minor_comm_size > 1) { - key_list_bitmap = - compute_vertex_list_bitmap_info(sorted_unique_key_first, - sorted_unique_nzd_key_last, - v_list_range[0], - v_list_range[1], - handle.get_stream()); + key_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, + sorted_unique_nzd_key_last, + v_list_range[0], + v_list_range[1], + handle.get_stream()); } } @@ -1604,8 +1608,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::conditional_t, std::byte /* dummy */> local_key_list_sizes{}; - std::conditional_t, std::byte /* dummy */> local_key_list_range_firsts{}; - std::conditional_t, std::byte /* dummy */> local_key_list_range_lasts{}; + std::conditional_t, std::byte /* dummy */> + local_key_list_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_key_list_range_lasts{}; std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; std::conditional_t>>, @@ -1620,8 +1626,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), handle.get_stream()); if constexpr (try_bitmap) { - local_key_list_range_firsts = host_scalar_allgather(minor_comm, v_list_range[0], handle.get_stream()); - local_key_list_range_lasts = host_scalar_allgather(minor_comm, v_list_range[1], handle.get_stream()); + local_key_list_range_firsts = + host_scalar_allgather(minor_comm, v_list_range[0], handle.get_stream()); + local_key_list_range_lasts = + host_scalar_allgather(minor_comm, v_list_range[1], handle.get_stream()); auto tmp_flags = host_scalar_allgather( minor_comm, key_list_bitmap ? 
uint8_t{1} : uint8_t{0}, handle.get_stream()); use_bitmap_flags.resize(tmp_flags.size()); @@ -2146,6 +2154,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, assert(edge_partition_selected_ranks_or_flags[j].index() == 0); auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); resize_dataframe_buffer(values, copy_size, loop_stream); + rmm::device_scalar dummy(size_t{0}, loop_stream); // we already know the count copy_if_nosync( get_dataframe_buffer_begin(output_buffer), get_dataframe_buffer_end(output_buffer), @@ -2154,6 +2163,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, cuda::proclaim_return_type( [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), get_dataframe_buffer_begin(values), + raft::device_span(dummy.data(), size_t{1}), loop_stream); } } else { @@ -2161,10 +2171,12 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, assert(edge_partition_selected_ranks_or_flags[j].index() == 1); auto& selected_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); resize_dataframe_buffer(values, copy_size, loop_stream); + rmm::device_scalar dummy(size_t{0}, loop_stream); // we already know the count copy_if_nosync(get_dataframe_buffer_begin(output_buffer), get_dataframe_buffer_end(output_buffer), (*selected_flags).begin(), get_dataframe_buffer_begin(values), + raft::device_span(dummy.data(), size_t{1}), loop_stream); (*selected_flags).resize(0, loop_stream); (*selected_flags).shrink_to_fit(loop_stream); diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index 9c7c84e9719..f92aec680a9 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -188,6 +188,7 @@ void device_bcast_vertex_list( assert((comm.get_rank() != root) || (std::get<0>(v_list).size() == tmp_bitmap.size())); device_bcast( comm, std::get<0>(v_list).data(), tmp_bitmap.data(), tmp_bitmap.size(), root, stream_view); + rmm::device_scalar dummy(size_t{0}, stream_view); // we already know the count detail::copy_if_nosync( thrust::make_counting_iterator(vertex_range_first), thrust::make_counting_iterator(vertex_range_last), @@ -200,6 +201,7 @@ void device_bcast_vertex_list( packed_bool_empty_mask()); })), output_v_first, + raft::device_span(dummy.data(), size_t{1}), stream_view); } else { device_bcast(comm, std::get<1>(v_list), output_v_first, v_list_size, root, stream_view); From c689e35484504e3a3662391dc347cc2d131c4b18 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 8 Sep 2024 23:04:49 -0700 Subject: [PATCH 066/126] temporary paramter setting for benchmarking --- cpp/include/cugraph/graph_view.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index a2ff3166fa4..f535e35c785 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -243,7 +243,7 @@ namespace detail { // use (key, value) pairs to store source/destination properties if (unique edge // sources/destinations) over (V / major_comm_size|minor_comm_size) is smaller than the threshold // value -double constexpr edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold = 0.1; +double constexpr edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold = 0.0; // FIXME: just for benchmarking // FIXME: threshold values require tuning // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller From 
f7b061bca9fb7382941a6fb1cbe2d266dc939b08 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 8 Sep 2024 23:05:12 -0700 Subject: [PATCH 067/126] bug fix --- cpp/src/prims/detail/extract_transform_v_frontier_e.cuh | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 1caaa0c2181..06a241681d2 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -821,7 +821,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { edge_partition_key_buffers.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto loop_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); @@ -886,7 +886,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::vector> output_buffer_idx_scalars{}; output_buffer_idx_scalars.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto loop_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); @@ -936,7 +936,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, #endif for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto edge_partition = edge_partition_device_view_t( From 20e1c7427688565a6b33c2140e6f805d99eebe94 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 8 Sep 2024 23:05:34 -0700 Subject: [PATCH 068/126] add sum_nosync for multi stream execution --- cpp/src/prims/detail/multi_stream_utils.cuh | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh index 7a370d5e49f..78b75fc083d 100644 --- a/cpp/src/prims/detail/multi_stream_utils.cuh +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -124,6 +124,34 @@ void count_nosync(InputIterator input_first, d_tmp_storage.data(), tmp_storage_bytes, input_first, count.data(), input_size, stream_view); } +template +void sum_nosync(InputIterator input_first, + InputIterator input_last, + raft::device_span::value_type> sum /* size = 1 */, + rmm::cuda_stream_view stream_view) +{ + CUGRAPH_EXPECTS( + static_cast(thrust::distance(input_first, input_last)) <= + static_cast(std::numeric_limits::max()), + "cugraph::detail::sum_nosync relies on cub::DeviceReduce::Sum which uses int for input size, " + "but thrust::distance(input_first, input_last) exceeds std::numeric_limits::max()."); + + size_t tmp_storage_bytes{0}; + size_t input_size = static_cast(thrust::distance(input_first, input_last)); + + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + input_first, + sum.data(), + input_size, + stream_view); + + auto d_tmp_storage = rmm::device_uvector(tmp_storage_bytes, stream_view); + + cub::DeviceReduce::Sum( + d_tmp_storage.data(), tmp_storage_bytes, input_first, sum.data(), input_size, stream_view); } } // namespace detail } // namespace cugraph From 9fa4fb47383ed82059d653d8a459f51844cdb099 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 9 Sep 2024 00:13:09 -0700 Subject: [PATCH 069/126] pre-filter keys --- 
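Note: this change drops frontier keys that fall in an edge partition's hypersparse region but have no local edges before any per-key work runs, so the downstream kernels and the owner-selection step only see keys that can actually produce output. A minimal sketch of the idea, under simplified assumptions (int32_t vertex IDs, one bit per vertex offset in the hypersparse range, and illustrative helper/parameter names that are not part of this patch; the change itself builds the bitmap from dcs_nzd_vertices() and filters with copy_if_nosync() so the work can stay on per-partition streams):

  #include <raft/core/device_span.hpp>
  #include <rmm/cuda_stream_view.hpp>
  #include <rmm/device_uvector.hpp>
  #include <rmm/exec_policy.hpp>
  #include <thrust/copy.h>
  #include <thrust/distance.h>
  #include <cstdint>

  // keep only the hypersparse-region keys whose "has local edges" bit is set
  rmm::device_uvector<int32_t> filter_zero_degree_keys(
    raft::device_span<int32_t const> keys,         // frontier keys in the hypersparse range
    raft::device_span<uint32_t const> nzd_bitmap,  // bit i set iff (major_hypersparse_first + i) has local edges
    int32_t major_hypersparse_first,
    rmm::cuda_stream_view stream)
  {
    rmm::device_uvector<int32_t> kept(keys.size(), stream);
    auto kept_last = thrust::copy_if(
      rmm::exec_policy(stream), keys.begin(), keys.end(), kept.begin(),
      [nzd_bitmap, major_hypersparse_first] __device__(int32_t key) {
        auto offset = static_cast<uint32_t>(key - major_hypersparse_first);
        return (nzd_bitmap[offset / 32] & (uint32_t{1} << (offset % 32))) != 0;
      });
    kept.resize(thrust::distance(kept.begin(), kept_last), stream);
    return kept;
  }

The actual change additionally records the offsets of the surviving keys (the hypersparse key offset vectors) so that the later owner-selection step can map the compacted values back to positions in the full per-rank key list.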
.../prims/detail/per_v_transform_reduce_e.cuh | 611 ++++++++++++++---- 1 file changed, 468 insertions(+), 143 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index bc15c39a4dc..b31496bc960 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -961,25 +961,37 @@ __host__ __device__ int priority_to_rank( // otherwise, it is sufficient to just return bool flags indiciating whether this rank's values are // selected or not. template -std::variant /* root */, std::optional>> -compute_selected_ranks(raft::comms::comms_t const& comm, - ValueIterator value_first, - ValueIterator value_last, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - typename thrust::iterator_traits::value_type init, - bool ignore_local_values, - rmm::cuda_stream_view stream_view) +std::variant /* root */, std::optional>> +compute_selected_ranks( + raft::comms::comms_t const& comm, + ValueIterator value_first, + ValueIterator value_last, + std::optional> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t range_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + typename thrust::iterator_traits::value_type init, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) { auto const comm_rank = comm.get_rank(); auto const comm_size = comm.get_size(); + assert(hypersparse_key_offsets.has_value() || + static_cast(thrust::distance(value_first, value_last)) == + range_size); // we should have for the entire "range_size" if + // hypersparse_key_offsets.has_value() is false + auto contiguous_size = static_cast(thrust::distance(value_first, value_last)) - + (hypersparse_key_offsets ? 
(*hypersparse_key_offsets).size() : size_t{0}); + // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are // more than one, the GPU with (comm_rank == root) has the highest priority, the GPUs in the same // DGX node should be the next) - rmm::device_uvector priorities(thrust::distance(value_first, value_last), - stream_view); + rmm::device_uvector priorities(range_size, stream_view); + if (ignore_local_values) { thrust::fill(rmm::exec_policy_nosync(stream_view), priorities.begin(), @@ -989,7 +1001,7 @@ compute_selected_ranks(raft::comms::comms_t const& comm, thrust::tabulate( rmm::exec_policy_nosync(stream_view), priorities.begin(), - priorities.end(), + priorities.begin() + contiguous_size, [value_first, root, subgroup_size, init, comm_rank, comm_size] __device__(auto offset) { auto val = *(value_first + offset); return (val != init) @@ -997,6 +1009,27 @@ compute_selected_ranks(raft::comms::comms_t const& comm, comm_rank, root, subgroup_size, comm_size, static_cast(offset)) : std::numeric_limits::max(); // lowest priority }); + if (hypersparse_key_offsets) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + priorities.begin() + contiguous_size, + priorities.end(), + std::numeric_limits::max()); + auto priority_first = thrust::make_transform_iterator( + (*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(auto offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + (*hypersparse_key_offsets).size(), + (*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } } device_allreduce(comm, priorities.data(), @@ -1004,7 +1037,6 @@ compute_selected_ranks(raft::comms::comms_t const& comm, priorities.size(), raft::comms::op_t::MIN, stream_view); - if (comm_rank == root) { rmm::device_uvector selected_ranks(priorities.size(), stream_view); auto offset_priority_pair_first = @@ -1024,24 +1056,65 @@ compute_selected_ranks(raft::comms::comms_t const& comm, }); return selected_ranks; } else { - std::optional> keep_flags{std::nullopt}; + std::optional> keep_flags{std::nullopt}; if (!ignore_local_values) { - keep_flags = rmm::device_uvector(priorities.size(), stream_view); + keep_flags = rmm::device_uvector( + packed_bool_size(thrust::distance(value_first, value_last)), stream_view); auto offset_priority_pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); - thrust::transform(rmm::exec_policy_nosync(stream_view), - offset_priority_pair_first, - offset_priority_pair_first + priorities.size(), - (*keep_flags).begin(), - [root, subgroup_size, comm_rank, comm_size] __device__(auto pair) { - auto offset = thrust::get<0>(pair); - auto priority = thrust::get<1>(pair); - auto rank = (priority == std::numeric_limits::max()) - ? 
comm_size - : priority_to_rank( - priority, root, subgroup_size, comm_size, offset); - return (rank == comm_rank); - }); + thrust::fill(rmm::exec_policy_nosync(stream_view), + (*keep_flags).begin(), + (*keep_flags).end(), + packed_bool_empty_mask()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + offset_priority_pair_first, + offset_priority_pair_first + contiguous_size, + [keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<0>(pair); + auto priority = thrust::get<1>(pair); + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if (hypersparse_key_offsets) { + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + (*hypersparse_key_offsets).begin()), + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + (*hypersparse_key_offsets).begin()) + + (*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, offset); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } } return keep_flags; } @@ -1275,7 +1348,7 @@ void per_v_transform_reduce_e_edge_partition( } } -#define PER_V_PERFORMANCE_MEASUREMENT 1 +#define PER_V_PERFORMANCE_MEASUREMENT 0 template ; + static_assert(update_major || !use_input_key); constexpr bool filter_input_key = - GraphViewType::is_multi_gpu && update_major && use_input_key && + GraphViewType::is_multi_gpu && use_input_key && std::is_same_v>; // if GraphViewType::is_multi_gpu && update_major && // std::is_same_v>, for any @@ -1317,7 +1391,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // if we find any valid local edge (FIXME: this is // applicable even when use_input_key is false). 
- static_assert(update_major || !use_input_key); static_assert( ReduceOp::pure_function && ((reduce_op::has_compatible_raft_comms_op_v && @@ -1477,6 +1550,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, static_cast(minor_comm_rank)); + // FIXME: we may filter zero local degree vertices first per_v_transform_reduce_e_edge_partition( handle, edge_partition, @@ -1582,20 +1656,24 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - rmm::device_uvector tmps(2, handle.get_stream()); - thrust::tabulate( - handle.get_thrust_policy(), - tmps.begin(), - tmps.end(), - [sorted_unique_key_first, - v_list_size = static_cast(thrust::distance( - sorted_unique_key_first, sorted_unique_nzd_key_last))] __device__(size_t i) { - return (i == 0) ? *sorted_unique_key_first - : (*(sorted_unique_key_first + (v_list_size - 1)) + 1); - }); - raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + v_list_range = {vertex_t{0}, vertex_t{0}}; if (minor_comm_size > 1) { + auto v_list_size = + static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + if (v_list_size > 0) { + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_key_first, v_list_size] __device__(size_t i) { + return (i == 0) ? *sorted_unique_key_first + : (*(sorted_unique_key_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + handle.sync_stream(); + } + key_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, sorted_unique_nzd_key_last, v_list_range[0], @@ -1791,7 +1869,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_key_buffers.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto loop_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); @@ -1827,6 +1905,13 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, static_cast(partition_idx), loop_stream); } + if constexpr (filter_input_key) { + auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); + if (!process_local_edges) { + resize_dataframe_buffer(edge_partition_key_buffer, 0, loop_stream); + shrink_to_fit_dataframe_buffer(edge_partition_key_buffer, loop_stream); + } + } } edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); } @@ -1839,6 +1924,161 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto subtime2 = std::chrono::steady_clock::now(); #endif + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in the + // hypersparse region + if constexpr (filter_input_key) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + if (graph_view.use_dcs() && (minor_comm_size > 1)) { + // FIXME: we can pre-compute this & store in graph_t + std::vector> edge_partition_hypersparse_bitmap_vectors{}; + edge_partition_hypersparse_bitmap_vectors.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); + rmm::device_uvector bitmap( + process_local_edges ? packed_bool_size((*segment_offsets)[4] - (*segment_offsets)[3]) + : size_t{0}, + loop_stream); + if (process_local_edges) { + thrust::fill(rmm::exec_policy_nosync(loop_stream), + bitmap.begin(), + bitmap.end(), + packed_bool_empty_mask()); + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + *(edge_partition.dcs_nzd_vertices()), + *(edge_partition.dcs_nzd_vertices()) + *(edge_partition.dcs_nzd_vertex_count()), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(auto major) { + auto major_offset = major - major_hypersparse_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(major_offset)]); + word.fetch_or(packed_bool_mask(major_offset), cuda::std::memory_order_relaxed); + }); + } + edge_partition_hypersparse_bitmap_vectors.push_back(std::move(bitmap)); + } + + edge_partition_hypersparse_key_offset_vectors = std::vector>{}; + (*edge_partition_hypersparse_key_offset_vectors).reserve(loop_count); + + std::vector> edge_partition_tmp_key_buffers{}; + edge_partition_tmp_key_buffers.reserve(loop_count); + std::vector> edge_partition_tmp_count_scalars{}; + edge_partition_tmp_count_scalars.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); + + auto keys = allocate_dataframe_buffer( + process_local_edges ? (key_segment_offsets[4] - key_segment_offsets[3]) : size_t{0}, + loop_stream); + rmm::device_uvector offsets( + process_local_edges ? (key_segment_offsets[4] - key_segment_offsets[3]) : size_t{0}, + loop_stream); + rmm::device_scalar count(size_t{0}, loop_stream); + if (process_local_edges) { + auto input_first = + thrust::make_zip_iterator(edge_partition_key_first + key_segment_offsets[3], + thrust::make_counting_iterator(key_segment_offsets[3])); + auto flag_first = thrust::make_transform_iterator( + edge_partition_key_first + key_segment_offsets[3], + cuda::proclaim_return_type( + [bitmap = + raft::device_span(edge_partition_hypersparse_bitmap_vectors[j].data(), + edge_partition_hypersparse_bitmap_vectors[j].size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(key_t key) { + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = major - major_hypersparse_first; + return ((bitmap[packed_bool_offset(major_offset)] & + packed_bool_mask(major_offset)) != packed_bool_empty_mask()); + })); + copy_if_nosync( + input_first, + input_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), offsets.begin()), + raft::device_span(count.data(), size_t{1}), + loop_stream); + } + edge_partition_tmp_key_buffers.push_back(std::move(keys)); + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + edge_partition_tmp_count_scalars.push_back(std::move(count)); + } + + std::vector h_counts(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + h_counts[j] = edge_partition_tmp_count_scalars[j].value(loop_stream); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); + if (process_local_edges) { + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + resize_dataframe_buffer(edge_partition_tmp_key_buffers[j], h_counts[j], loop_stream); + shrink_to_fit_dataframe_buffer( + edge_partition_tmp_key_buffers[j], + loop_stream); // FIXME: we can skip this to cut execution time + resize_dataframe_buffer( + (*edge_partition_hypersparse_key_offset_vectors)[j], h_counts[j], loop_stream); + shrink_to_fit_dataframe_buffer( + (*edge_partition_hypersparse_key_offset_vectors)[j], + loop_stream); // FIXME: we can skip this to cut execution time + auto keys = + allocate_dataframe_buffer(key_segment_offsets[3] + h_counts[j], loop_stream); + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]) + key_segment_offsets[3], + get_dataframe_buffer_begin(keys)); + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(edge_partition_tmp_key_buffers[j]), + get_dataframe_buffer_end(edge_partition_tmp_key_buffers[j]), + get_dataframe_buffer_begin(keys) + key_segment_offsets[3]); + edge_partition_key_buffers[j] = std::move(keys); + } + } + } + } + std::conditional_t>, std::byte /* dummy */> @@ -1847,7 +2087,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, major_output_buffers.reserve(loop_count); } for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto loop_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); @@ -1855,12 +2095,13 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu && update_major) { size_t buffer_size{0}; if constexpr (use_input_key) { - buffer_size = local_key_list_sizes[partition_idx]; + buffer_size = size_dataframe_buffer(edge_partition_key_buffers[j]); } else { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(partition_idx)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); buffer_size = segment_offsets ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ @@ -1878,7 +2119,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #endif for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto edge_partition = edge_partition_device_view_t( @@ -1933,9 +2174,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (use_input_key) { if (key_segment_offset_vectors) { key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + (*key_segment_offsets).back() = static_cast( + thrust::distance(edge_partition_key_first, edge_partition_key_last)); + *((*key_segment_offsets).rbegin() + 1) = (*key_segment_offsets).back(); + } + } } } else { - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); if (segment_offsets) { key_segment_offsets = std::vector((*segment_offsets).size()); std::transform((*segment_offsets).begin(), @@ -2020,36 +2269,53 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); + if constexpr (use_input_key) { + if (minor_comm_size > 1) { + edge_partition_key_buffers.clear(); + edge_partition_key_buffers.shrink_to_fit(); + } + } + if constexpr (std::is_same_v>) { std::vector< - std::variant, std::optional>>> + std::variant, std::optional>>> edge_partition_selected_ranks_or_flags{}; edge_partition_selected_ranks_or_flags.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto loop_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - if constexpr (use_input_key) { - if (minor_comm_size > 1) { - resize_dataframe_buffer(edge_partition_key_buffers[j], 0, loop_stream); - shrink_to_fit_dataframe_buffer(edge_partition_key_buffers[j], loop_stream); - } - } - bool process_local_edges = true; if constexpr (filter_input_key) { if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } } auto const& output_buffer = major_output_buffers[j]; + std::optional> hypersparse_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + hypersparse_key_offsets = raft::device_span( + (*edge_partition_hypersparse_key_offset_vectors)[j].data(), + (*edge_partition_hypersparse_key_offset_vectors)[j].size()); + } + } + + size_t range_size{0}; + if constexpr (filter_input_key) { + range_size = local_key_list_sizes[partition_idx]; + } else { + range_size = size_dataframe_buffer(output_buffer); + } if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t auto selected_ranks_or_flags = compute_selected_ranks( minor_comm, get_dataframe_buffer_begin(output_buffer), get_dataframe_buffer_end(output_buffer), + hypersparse_key_offsets, + range_size, static_cast(partition_idx), subgroup_size, init, @@ -2065,6 +2331,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, minor_comm, get_dataframe_buffer_begin(output_buffer), get_dataframe_buffer_end(output_buffer), + hypersparse_key_offsets, + range_size, static_cast(partition_idx), subgroup_size, init, @@ -2072,25 +2340,33 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, loop_stream); edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); } + + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + resize_dataframe_buffer( + (*edge_partition_hypersparse_key_offset_vectors)[j], 0, loop_stream); + shrink_to_fit_dataframe_buffer((*edge_partition_hypersparse_key_offset_vectors)[j], + loop_stream); + } + } } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime7 = std::chrono::steady_clock::now(); #endif - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } auto subtime8 = std::chrono::steady_clock::now(); auto subtime9 = std::chrono::steady_clock::now(); auto subtime10 = std::chrono::steady_clock::now(); #endif - std::vector copy_sizes(loop_count, 0); + std::vector copy_sizes(loop_count); { - rmm::device_uvector d_copy_sizes(loop_count, handle.get_stream()); - thrust::fill( - handle.get_thrust_policy(), d_copy_sizes.begin(), d_copy_sizes.end(), size_t{0}); - if (stream_pool_indices) { handle.sync_stream(); } + std::vector> edge_partition_copy_sizes{}; + edge_partition_copy_sizes.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto loop_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); @@ -2102,36 +2378,46 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } + rmm::device_scalar copy_size(size_t{0}, loop_stream); if (minor_comm_rank == static_cast(partition_idx)) { if (process_local_edges) { assert(edge_partition_selected_ranks_or_flags[j].index() == 0); auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); count_nosync(selected_ranks.begin(), selected_ranks.end(), - raft::device_span(d_copy_sizes.data() + j, size_t{1}), + raft::device_span(copy_size.data(), size_t{1}), minor_comm_rank, loop_stream); } } else { assert(edge_partition_selected_ranks_or_flags[j].index() == 1); - auto& selected_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); - if (selected_flags) { - count_nosync((*selected_flags).begin(), - (*selected_flags).end(), - raft::device_span(d_copy_sizes.data() + j, size_t{1}), - true, - loop_stream); + auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); + if (keep_flags) { + auto count_first = thrust::make_transform_iterator( + (*keep_flags).begin(), + cuda::proclaim_return_type( + [] __device__(uint32_t word) { return static_cast(__popc(word)); })); + sum_nosync(count_first, + count_first + (*keep_flags).size(), + raft::device_span(copy_size.data(), size_t{1}), + loop_stream); } } + + edge_partition_copy_sizes.push_back(std::move(copy_size)); } + #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete subtime9 = std::chrono::steady_clock::now(); #endif - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - raft::update_host( - copy_sizes.data(), d_copy_sizes.data(), d_copy_sizes.size(), handle.get_stream()); - handle.sync_stream(); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + copy_sizes[j] = edge_partition_copy_sizes[j].value(loop_stream); + } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } subtime10 = std::chrono::steady_clock::now(); #endif } @@ -2139,47 +2425,60 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::vector> edge_partition_values{}; edge_partition_values.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto loop_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); auto& output_buffer = major_output_buffers[j]; - - auto copy_size = copy_sizes[j]; + auto copy_size = copy_sizes[j]; auto values = allocate_dataframe_buffer(0, loop_stream); if (minor_comm_rank == static_cast(partition_idx)) { if (copy_size > 0) { - assert(edge_partition_selected_ranks_or_flags[j].index() == 0); - auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if constexpr (filter_input_key) { + assert(false); // should not be reached + } else { + assert(edge_partition_selected_ranks_or_flags[j].index() == 0); + auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + resize_dataframe_buffer(values, copy_size, loop_stream); + rmm::device_scalar dummy(size_t{0}, + loop_stream); // we already know the count + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_end(output_buffer), + thrust::make_transform_iterator( + selected_ranks.begin(), + cuda::proclaim_return_type( + [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + get_dataframe_buffer_begin(values), + raft::device_span(dummy.data(), size_t{1}), + loop_stream); + } + } + } else { + if (copy_size > 0) { + assert(edge_partition_selected_ranks_or_flags[j].index() == 1); + auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); resize_dataframe_buffer(values, copy_size, loop_stream); - rmm::device_scalar dummy(size_t{0}, loop_stream); // we already know the count + rmm::device_scalar dummy(size_t{0}, + loop_stream); // we already know the count copy_if_nosync( get_dataframe_buffer_begin(output_buffer), get_dataframe_buffer_end(output_buffer), thrust::make_transform_iterator( - selected_ranks.begin(), + thrust::make_counting_iterator(size_t{0}), cuda::proclaim_return_type( - [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + [keep_flags = raft::device_span( + (*keep_flags).data(), (*keep_flags).size())] __device__(size_t offset) { + auto word = keep_flags[packed_bool_offset(offset)]; + return ((word & packed_bool_mask(offset)) != packed_bool_empty_mask()); + })), get_dataframe_buffer_begin(values), raft::device_span(dummy.data(), size_t{1}), loop_stream); - } - } else { - if (copy_size > 0) { - assert(edge_partition_selected_ranks_or_flags[j].index() == 1); - auto& selected_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); - resize_dataframe_buffer(values, copy_size, loop_stream); - rmm::device_scalar dummy(size_t{0}, loop_stream); // we already know the count - copy_if_nosync(get_dataframe_buffer_begin(output_buffer), - get_dataframe_buffer_end(output_buffer), - (*selected_flags).begin(), - get_dataframe_buffer_begin(values), - raft::device_span(dummy.data(), size_t{1}), - loop_stream); - (*selected_flags).resize(0, loop_stream); - (*selected_flags).shrink_to_fit(loop_stream); + (*keep_flags).resize(0, loop_stream); + (*keep_flags).shrink_to_fit(loop_stream); } } @@ -2196,55 +2495,75 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto subtime12 = std::chrono::steady_clock::now(); #endif - std::vector rx_sizes{}; - std::vector rx_displs{}; - auto rx_values = allocate_dataframe_buffer(0, handle.get_stream()); - if (stream_pool_indices) { handle.sync_stream(); } - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - auto& values = edge_partition_values[j]; - // FIXME: host_scalar_gather implicitly synchronizes to copy the results to host - auto tmp_rx_sizes = host_scalar_gather(minor_comm, - size_dataframe_buffer(values), - static_cast(partition_idx), - loop_stream); - - if (minor_comm_rank == static_cast(partition_idx)) { - rx_sizes = std::move(tmp_rx_sizes); - rx_displs.resize(rx_sizes.size()); - std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), size_t{0}); - rx_values = - allocate_dataframe_buffer(rx_displs.back() + rx_sizes.back(), loop_stream); + std::optional> rx_sizes{}; + std::optional> rx_displs{}; + std::optional> rx_values{}; + { + std::vector h_value_buffer_sizes(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + h_value_buffer_sizes[j] = size_dataframe_buffer(edge_partition_values[j]); + } + rmm::device_uvector d_value_buffer_sizes(loop_count, handle.get_stream()); + raft::update_device(d_value_buffer_sizes.data(), + h_value_buffer_sizes.data(), + h_value_buffer_sizes.size(), + handle.get_stream()); + rmm::device_uvector d_aggregate_value_buffer_sizes(minor_comm_size * loop_count, + handle.get_stream()); + std::vector tmp_rx_sizes(minor_comm_size, loop_count); + std::vector tmp_rx_displs = std::vector(minor_comm_size); + std::exclusive_scan( + tmp_rx_sizes.begin(), tmp_rx_sizes.end(), tmp_rx_displs.begin(), size_t{0}); + device_allgatherv(minor_comm, + d_value_buffer_sizes.data(), + d_aggregate_value_buffer_sizes.data(), + tmp_rx_sizes, + tmp_rx_displs, + handle.get_stream()); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + std::vector h_aggregate_value_buffer_sizes( + d_aggregate_value_buffer_sizes.size()); + raft::update_host(h_aggregate_value_buffer_sizes.data(), + d_aggregate_value_buffer_sizes.data(), + d_aggregate_value_buffer_sizes.size(), + handle.get_stream()); + handle.sync_stream(); + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + rx_sizes = std::vector(minor_comm_size); + rx_displs = std::vector(minor_comm_size); + for (int k = 0; k < minor_comm_size; ++k) { + (*rx_sizes)[k] = h_aggregate_value_buffer_sizes[k * loop_count + j]; + } + std::exclusive_scan( + (*rx_sizes).begin(), (*rx_sizes).end(), (*rx_displs).begin(), size_t{0}); + rx_values = allocate_dataframe_buffer((*rx_displs).back() + (*rx_sizes).back(), + handle.get_stream()); } } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime13 = std::chrono::steady_clock::now(); #endif - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + handle.sync_stream(); #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime14 = std::chrono::steady_clock::now(); #endif for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto loop_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); auto& values = edge_partition_values[j]; - // FIXME: this if-else might be unnecessary if (minor_comm_rank == static_cast(partition_idx)) { device_gatherv(minor_comm, get_dataframe_buffer_begin(values), - get_dataframe_buffer_begin(rx_values), + get_dataframe_buffer_begin(*rx_values), values.size(), - rx_sizes, - rx_displs, + *rx_sizes, + *rx_displs, static_cast(partition_idx), loop_stream); } else { @@ -2264,32 +2583,38 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime15 = std::chrono::steady_clock::now(); #endif - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } auto subtime16 = std::chrono::steady_clock::now(); #endif - if (size_dataframe_buffer(rx_values) > 0) { - auto j = static_cast(minor_comm_rank % num_concurrent_loops); + if (rx_values && (size_dataframe_buffer(*rx_values) > 0)) { + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + // FIXME: we can use 8 bit integer for ranks (and 32 bit integers for rx_offsets) to cut + // sort time significantly auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); - rmm::device_uvector rx_offsets(selected_ranks.size(), handle.get_stream()); - thrust::sequence( - handle.get_thrust_policy(), rx_offsets.begin(), rx_offsets.end(), vertex_t{0}); - thrust::stable_sort_by_key(handle.get_thrust_policy(), + rmm::device_uvector rx_offsets(selected_ranks.size(), loop_stream); + thrust::sequence(rmm::exec_policy_nosync(loop_stream), + rx_offsets.begin(), + rx_offsets.end(), + vertex_t{0}); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(loop_stream), selected_ranks.begin(), selected_ranks.end(), rx_offsets.begin()); // selected_ranks[] == comm_size if no GPU in minor_comm has a non-init value - rx_offsets.resize(rx_displs.back() + rx_sizes.back(), handle.get_stream()); - - thrust::scatter(handle.get_thrust_policy(), - get_dataframe_buffer_begin(rx_values), - get_dataframe_buffer_end(rx_values), + rx_offsets.resize((*rx_displs).back() + (*rx_sizes).back(), loop_stream); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), rx_offsets.begin(), tmp_vertex_value_output_first); - handle.sync_stream(); } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime17 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; @@ -2319,7 +2644,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #endif } else { for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i * num_concurrent_loops + j; + auto partition_idx = i + j; auto loop_stream = stream_pool_indices ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); From b6a1fb06d9ff13ec2020e87a153e28a768b03d81 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 9 Sep 2024 00:13:45 -0700 Subject: [PATCH 070/126] multi-stream execution --- cpp/src/prims/fill_edge_src_dst_property.cuh | 232 ++++++++++-------- .../prims/update_edge_src_dst_property.cuh | 3 +- 2 files changed, 132 insertions(+), 103 deletions(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 46a12555eca..3197ac8e963 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -316,7 +316,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { auto& comm = handle.get_comms(); - auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); @@ -362,128 +362,158 @@ void fill_edge_minor_property(raft::handle_t const& handle, auto local_v_list_range_lasts = host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); + auto num_concurrent_bcasts = + (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * + sizeof(vertex_t)) / + std::min( + (std::reduce(local_v_list_sizes.begin(), local_v_list_sizes.end()) / major_comm_size) * + sizeof(vertex_t), + size_t{1}); + num_concurrent_bcasts = std::min(num_concurrent_bcasts, handle.get_stream_pool_size()); + num_concurrent_bcasts = + std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); + + std::optional> stream_pool_indices{std::nullopt}; + if (num_concurrent_bcasts > 1) { + stream_pool_indices = std::vector(num_concurrent_bcasts); + std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); + } + std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); } else { key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); } + if (stream_pool_indices) { handle.sync_stream(); } auto edge_partition_keys = edge_minor_property_output.keys(); - for (int i = 0; i < major_comm_size; ++i) { - if (is_packed_bool() && - !edge_partition_keys && use_bitmap_flags[i]) { - rmm::device_uvector rx_bitmap( - packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]), - handle.get_stream()); - device_bcast( - major_comm, - (i == major_comm_rank) ? 
(*v_list_bitmap).data() : static_cast(nullptr), - rx_bitmap.data(), - rx_bitmap.size(), - i, - handle.get_stream()); - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_bitmap.size()), - [input, - output_value_first = - edge_partition_value_first + - packed_bool_offset(local_v_list_range_firsts[i] - minor_range_first), - rx_bitmap = raft::device_span(rx_bitmap.data(), - rx_bitmap.size())] __device__(size_t i) { - if ((i == 0) || (i == (rx_bitmap.size() - 1))) { // first or last - cuda::atomic_ref word(*(output_value_first + i)); - if (input) { - word.fetch_or(rx_bitmap[i], cuda::std::memory_order_relaxed); - } else { - word.fetch_and(~rx_bitmap[i], cuda::std::memory_order_relaxed); - } - } else { - if (input) { - *(output_value_first + i) |= rx_bitmap[i]; - } else { - *(output_value_first + i) &= ~rx_bitmap[i]; - } - } - }); - } else { - rmm::device_uvector rx_vertices(local_v_list_sizes[i], handle.get_stream()); - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - std::variant, decltype(sorted_unique_vertex_first)> - v_list{}; - if (use_bitmap_flags[i]) { - v_list = - (i == major_comm_rank) - ? raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) - : raft::device_span(static_cast(nullptr), size_t{0}); - } else { - v_list = sorted_unique_vertex_first; - } - device_bcast_vertex_list(major_comm, - v_list, - rx_vertices.begin(), - local_v_list_range_firsts[i], - local_v_list_range_lasts[i], - local_v_list_sizes[i], - i, - handle.get_stream()); - - if (edge_partition_keys) { + for (size_t i = 0; i < static_cast(major_comm_size); i += num_concurrent_bcasts) { + auto loop_count = std::min(num_concurrent_bcasts, static_cast(major_comm_size) - i); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); + + if (is_packed_bool() && + !edge_partition_keys && use_bitmap_flags[partition_idx]) { + rmm::device_uvector rx_bitmap( + packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + loop_stream); + device_bcast(major_comm, + (static_cast(partition_idx) == major_comm_rank) ? 
(*v_list_bitmap).data() + : static_cast(nullptr), + rx_bitmap.data(), + rx_bitmap.size(), + partition_idx, + loop_stream); thrust::for_each( - handle.get_thrust_policy(), + rmm::exec_policy_nosync(loop_stream), thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(local_v_list_sizes[i]), - [rx_vertex_first = rx_vertices.begin(), - input, - subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[i], - subrange_key_last = (*edge_partition_keys).begin() + (*key_offsets)[i + 1], - edge_partition_value_first = edge_partition_value_first, - subrange_start_offset = (*key_offsets)[i]] __device__(auto i) { - auto minor = *(rx_vertex_first + i); - auto it = - thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); - if ((it != subrange_key_last) && (*it == minor)) { - auto subrange_offset = thrust::distance(subrange_key_first, it); - if constexpr (contains_packed_bool_element) { - fill_scalar_or_thrust_tuple( - edge_partition_value_first, subrange_start_offset + subrange_offset, input); + thrust::make_counting_iterator(rx_bitmap.size()), + [input, + output_value_first = + edge_partition_value_first + + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first), + rx_bitmap = raft::device_span(rx_bitmap.data(), + rx_bitmap.size())] __device__(size_t i) { + if ((i == 0) || (i == (rx_bitmap.size() - 1))) { // first or last + cuda::atomic_ref word( + *(output_value_first + i)); + if (input) { + word.fetch_or(rx_bitmap[i], cuda::std::memory_order_relaxed); } else { - *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + word.fetch_and(~rx_bitmap[i], cuda::std::memory_order_relaxed); + } + } else { + if (input) { + *(output_value_first + i) |= rx_bitmap[i]; + } else { + *(output_value_first + i) &= ~rx_bitmap[i]; } } }); } else { - if constexpr (contains_packed_bool_element) { + rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], + loop_stream); + // FIXME: these broadcast operations can be placed between ncclGroupStart() and + // ncclGroupEnd() + std::variant, decltype(sorted_unique_vertex_first)> + v_list{}; + if (use_bitmap_flags[partition_idx]) { + v_list = (static_cast(partition_idx) == major_comm_rank) ? 
raft::device_span( + (*v_list_bitmap).data(), (*v_list_bitmap).size()) + : raft::device_span( + static_cast(nullptr), size_t{0}); + } else { + v_list = sorted_unique_vertex_first; + } + device_bcast_vertex_list(major_comm, + v_list, + rx_vertices.begin(), + local_v_list_range_firsts[partition_idx], + local_v_list_range_lasts[partition_idx], + local_v_list_sizes[partition_idx], + partition_idx, + loop_stream); + + if (edge_partition_keys) { thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(local_v_list_sizes[i])), - [minor_range_first, - rx_vertex_first = rx_vertices.begin(), + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [rx_vertex_first = rx_vertices.begin(), input, - output_value_first = edge_partition_value_first] __device__(auto i) { - auto rx_vertex = *(rx_vertex_first + i); - auto minor_offset = rx_vertex - minor_range_first; - fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[partition_idx], + subrange_key_last = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], + edge_partition_value_first = edge_partition_value_first, + subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { + auto minor = *(rx_vertex_first + i); + auto it = + thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); + if ((it != subrange_key_last) && (*it == minor)) { + auto subrange_offset = thrust::distance(subrange_key_first, it); + if constexpr (contains_packed_bool_element) { + fill_scalar_or_thrust_tuple( + edge_partition_value_first, subrange_start_offset + subrange_offset, input); + } else { + *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + } + } }); } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), - cuda::proclaim_return_type( - [minor_range_first] __device__(auto v) { return v - minor_range_first; })); - auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(handle.get_thrust_policy(), - val_first, - val_first + local_v_list_sizes[i], - map_first, - edge_partition_value_first); + if constexpr (contains_packed_bool_element) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator( + static_cast(local_v_list_sizes[partition_idx])), + [minor_range_first, + rx_vertex_first = rx_vertices.begin(), + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + auto rx_vertex = *(rx_vertex_first + i); + auto minor_offset = rx_vertex - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), + cuda::proclaim_return_type( + [minor_range_first] __device__(auto v) { return v - minor_range_first; })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } } } } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } } else { assert(graph_view.local_vertex_partition_range_size() == diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh 
b/cpp/src/prims/update_edge_src_dst_property.cuh index 2408dcb3d68..f95928520ab 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -495,8 +495,7 @@ void update_edge_minor_property(raft::handle_t const& handle, (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * sizeof(vertex_t)) / std::max(bcast_size, size_t{1}); - num_concurrent_bcasts = std::max(num_concurrent_bcasts, size_t{1}); - num_concurrent_bcasts = std::min(num_concurrent_bcasts, static_cast(major_comm_size)); + num_concurrent_bcasts = std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); auto num_rounds = (static_cast(major_comm_size) + num_concurrent_bcasts - size_t{1}) / num_concurrent_bcasts; From 3f71304587071021a871bdf59c006a1a5960d7b7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 9 Sep 2024 00:14:08 -0700 Subject: [PATCH 071/126] more performance logs --- cpp/src/traversal/bfs_impl.cuh | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 21f84d3cfbe..1edb5e296b4 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -287,10 +287,10 @@ void bfs(raft::handle_t const& handle, // 4. BFS iteration vertex_t depth{0}; bool top_down = true; - auto cur_aggregate_vertex_frontier_size = + auto cur_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_cur).aggregate_size()); while (true) { - vertex_t next_aggregate_vertex_frontier_size{}; + vertex_t next_aggregate_frontier_size{}; if (top_down) { #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -338,13 +338,13 @@ void bfs(raft::handle_t const& handle, auto topdown2 = std::chrono::steady_clock::now(); #endif - next_aggregate_vertex_frontier_size = + next_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_next).aggregate_size()); #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown3 = std::chrono::steady_clock::now(); #endif - if (next_aggregate_vertex_frontier_size == 0) { + if (next_aggregate_frontier_size == 0) { #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete std::chrono::duration dur0 = topdown1 - topdown0; std::chrono::duration dur1 = topdown2 - topdown1; @@ -423,7 +423,7 @@ void bfs(raft::handle_t const& handle, handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) : m_u; if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && - (next_aggregate_vertex_frontier_size >= cur_aggregate_vertex_frontier_size)) { + (next_aggregate_frontier_size >= cur_aggregate_frontier_size)) { top_down = false; } } @@ -455,7 +455,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur4 = topdown5 - topdown4; std::chrono::duration dur5 = topdown6 - topdown5; std::chrono::duration dur = topdown6 - topdown0; - std::cout << "topdown (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() + std::cout << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << "," << dur5.count() << ") s." 
<< std::endl; #endif @@ -533,7 +533,7 @@ void bfs(raft::handle_t const& handle, auto bottomup1 = std::chrono::steady_clock::now(); #endif - next_aggregate_vertex_frontier_size = + next_aggregate_frontier_size = GraphViewType::is_multi_gpu ? host_scalar_allreduce(handle.get_comms(), static_cast(new_frontier_vertex_buffer.size()), @@ -544,7 +544,7 @@ void bfs(raft::handle_t const& handle, RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto bottomup2 = std::chrono::steady_clock::now(); #endif - if (next_aggregate_vertex_frontier_size == 0) { + if (next_aggregate_frontier_size == 0) { #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; @@ -567,7 +567,7 @@ void bfs(raft::handle_t const& handle, #endif // FIXME: better move this right after host_scalar_allreduce??? - auto aggregate_nzd_unvisted_vertices = + auto aggregate_nzd_unvisited_vertices = GraphViewType::is_multi_gpu ? host_scalar_allreduce(handle.get_comms(), static_cast((*nzd_unvisited_vertices).size()), @@ -575,9 +575,9 @@ void bfs(raft::handle_t const& handle, handle.get_stream()) : static_cast((*nzd_unvisited_vertices).size()); - if ((next_aggregate_vertex_frontier_size * direction_optimizing_beta < - aggregate_nzd_unvisted_vertices) && - (next_aggregate_vertex_frontier_size < cur_aggregate_vertex_frontier_size)) { + if ((next_aggregate_frontier_size * direction_optimizing_beta < + aggregate_nzd_unvisited_vertices) && + (next_aggregate_frontier_size < cur_aggregate_frontier_size)) { top_down = true; } #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -605,12 +605,10 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur3 = bottomup4 - bottomup3; std::chrono::duration dur4 = bottomup5 - bottomup4; std::chrono::duration dur = bottomup5 - bottomup0; - std::cout << "bottomup (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") s." - << std::endl; + std::cout << depth << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") s." 
<< std::endl; #endif } - cur_aggregate_vertex_frontier_size = next_aggregate_vertex_frontier_size; + cur_aggregate_frontier_size = next_aggregate_frontier_size; depth++; if (depth >= depth_limit) { break; } From 35776991636b8cdf162960ceda090f845b668af9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 9 Sep 2024 17:59:36 -0700 Subject: [PATCH 072/126] update logging --- .../prims/detail/per_v_transform_reduce_e.cuh | 9 ++-- cpp/src/prims/fill_edge_src_dst_property.cuh | 1 + .../create_graph_from_edgelist_impl.cuh | 45 +++++++++++++++- cpp/src/structure/renumber_edgelist_impl.cuh | 52 +++++++++++++++++++ cpp/src/traversal/bfs_impl.cuh | 37 ++++++++----- 5 files changed, 127 insertions(+), 17 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index b31496bc960..eea8b3ccdec 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1348,7 +1348,7 @@ void per_v_transform_reduce_e_edge_partition( } } -#define PER_V_PERFORMANCE_MEASUREMENT 0 +#define PER_V_PERFORMANCE_MEASUREMENT 1 template subdur14 = subtime15 - subtime14; std::chrono::duration subdur15 = subtime16 - subtime15; std::chrono::duration subdur16 = subtime17 - subtime16; - std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << "," + std::cout << comm_rank << ":sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," @@ -2760,8 +2761,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\t\tdetail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << ")" << std::endl; + std::cout << "\t\t" << comm_rank << ":detail::per_v (prep, ep, comm) took (" << dur0.count() + << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif } diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 3197ac8e963..b30397f5d8c 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -372,6 +372,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, num_concurrent_bcasts = std::min(num_concurrent_bcasts, handle.get_stream_pool_size()); num_concurrent_bcasts = std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); + std::cout << comm.get_rank() << ":" << " v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl; std::optional> stream_pool_indices{std::nullopt}; if (num_concurrent_bcasts > 1) { diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 9796ddd60a1..b2ac05b3fa0 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -335,8 +335,13 @@ create_graph_from_partitioned_edgelist( auto const minor_comm_size = minor_comm.get_size(); // 1. 
renumber +#if 1 + auto const comm_rank = handle.get_comms().get_rank(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_partitioned 0" << std::endl; +#endif - std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); + std::vector < edge_t> edgelist_edge_counts(minor_comm_size, edge_t{0}); for (size_t i = 0; i < edgelist_edge_counts.size(); ++i) { edgelist_edge_counts[i] = static_cast(edge_partition_edgelist_srcs[i].size()); } @@ -362,6 +367,10 @@ create_graph_from_partitioned_edgelist( num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2); // 2. sort and compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_partitioned 1" << std::endl; +#endif auto total_global_mem = handle.get_device_properties().totalGlobalMem; size_t element_size = sizeof(vertex_t) * 2; @@ -567,6 +576,10 @@ create_graph_from_partitioned_edgelist( } // 3. segmented sort neighbors +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_partitioned 3" << std::endl; +#endif for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { if (edge_partition_weights) { @@ -653,6 +666,10 @@ create_graph_from_partitioned_edgelist( } // 4. create a graph and an edge_property_t object. +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_partitioned 4" << std::endl; +#endif std::optional, weight_t>> edge_weights{std::nullopt}; @@ -933,6 +950,11 @@ create_graph_from_edgelist_impl( bool renumber, bool do_expensive_check) { +#if 1 + auto const comm_rank = handle.get_comms().get_rank(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 0" << std::endl; +#endif auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_size = major_comm.get_size(); auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -1024,6 +1046,10 @@ create_graph_from_edgelist_impl( // 1. groupby each edge chunks to their target local adjacency matrix partition (and further // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex // IDs). +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 1" << std::endl; +#endif std::vector>> edgelist_partitioned_srcs( edgelist_srcs.size()); @@ -1154,6 +1180,10 @@ create_graph_from_edgelist_impl( if (edgelist_edge_types) { (*edgelist_edge_types).clear(); } // 2. 
split the grouped edge chunks to local partitions +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 2" << std::endl; +#endif auto edgelist_intra_partition_segment_offsets = std::vector>(minor_comm_size); @@ -1201,7 +1231,13 @@ create_graph_from_edgelist_impl( intra_partition_segment_sizes.end(), intra_partition_segment_offsets.begin() + 1); +#if 1 + std::cout << comm_rank << ": i=" << i << " edge_count=" << edge_count << std::endl; +#endif rmm::device_uvector tmp_srcs(edge_count, handle.get_stream()); +#if 1 + std::cout << comm_rank << ": i=" << i << " tmp_srcs allocated" << std::endl; +#endif for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) { auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j]; @@ -1218,6 +1254,9 @@ create_graph_from_edgelist_impl( edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); rmm::device_uvector tmp_dsts(edge_count, handle.get_stream()); +#if 1 + std::cout << comm_rank << ": i=" << i << " tmp_dsts allocated" << std::endl; +#endif for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j]; @@ -1289,6 +1328,10 @@ create_graph_from_edgelist_impl( edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets); } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 3" << std::endl; +#endif return create_graph_from_partitioned_edgelist, std::vector, vertex_t> compu // 1. if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to // construct local_vertices) +#if 1 + auto comm_rank = multi_gpu ? handle.get_comms().get_rank() : int{0}; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 0" << std::endl; +#endif rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); if (!local_vertices) { constexpr size_t num_bins{ @@ -521,6 +526,10 @@ std::tuple, std::vector, vertex_t> compu thrust::sort( handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end()); } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 1" << std::endl; +#endif // 2. find an unused vertex ID @@ -531,6 +540,10 @@ std::tuple, std::vector, vertex_t> compu CUGRAPH_EXPECTS(locally_unused_vertex_id.has_value(), "Invalid input arguments: there is no unused value in the entire range of " "vertex_t, increase vertex_t to 64 bit."); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 2" << std::endl; +#endif // 3. compute global degrees for the sorted local vertices @@ -706,6 +719,10 @@ std::tuple, std::vector, vertex_t> compu offset += this_chunk_size; } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 3" << std::endl; +#endif // 5. sort local vertices by degree (descending) @@ -773,6 +790,10 @@ std::tuple, std::vector, vertex_t> compu d_segment_offsets.size(), handle.get_stream()); handle.sync_stream(); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 4" << std::endl; +#endif return std::make_tuple( std::move(sorted_local_vertices), h_segment_offsets, *locally_unused_vertex_id); @@ -1065,12 +1086,20 @@ renumber_edgelist( // 1. 
compute renumber map +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":renumber_edgelist 0" << std::endl; +#endif auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] = detail::compute_renumber_map(handle, std::move(local_vertices), edgelist_const_majors, edgelist_const_minors, edgelist_edge_counts); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":renumber_edgelist 1" << std::endl; +#endif // 2. initialize partition_t object, number_of_vertices, and number_of_edges @@ -1106,6 +1135,10 @@ renumber_edgelist( // 3. renumber edges +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":renumber_edgelist 2" << std::endl; +#endif { vertex_t max_edge_partition_major_range_size{0}; for (size_t i = 0; i < edgelist_majors.size(); ++i) { @@ -1138,11 +1171,23 @@ renumber_edgelist( } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank + << ":renumber_edgelist 3 partition.local_edge_partition_minor_range_size()=" + << partition.local_edge_partition_minor_range_size() + << " number_of_edges=" << number_of_edges << " comm_size=" << comm_size + << " edgelist_intra_partition_segment_offsets.has_value()=" + << edgelist_intra_partition_segment_offsets.has_value() << std::endl; +#endif if ((static_cast(partition.local_edge_partition_minor_range_size() * 2.5 /* tuning parameter */) >= static_cast(number_of_edges / comm_size)) && edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) // part than the O(E/P) part +#if 1 + std::cout << comm_rank << "path A" << std::endl; +#endif vertex_t max_segment_size{0}; for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = @@ -1182,6 +1227,9 @@ renumber_edgelist( } } } else { +#if 1 + std::cout << comm_rank << ":path B" << std::endl; +#endif rmm::device_uvector renumber_map_minor_labels( partition.local_edge_partition_minor_range_size(), handle.get_stream()); std::vector recvcounts(major_comm_size); @@ -1216,6 +1264,10 @@ renumber_edgelist( } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":renumber_edgelist 4" << std::endl; +#endif auto edge_partition_segment_offsets = detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 1edb5e296b4..6afa5505af5 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -120,6 +120,7 @@ void bfs(raft::handle_t const& handle, "GraphViewType should support the push model."); #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + auto const comm_rank = GraphViewType::is_multi_gpu ? handle.get_comms().get_rank() : int{0}; RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto prep0 = std::chrono::steady_clock::now(); #endif @@ -275,13 +276,15 @@ void bfs(raft::handle_t const& handle, true); #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto prep4 = std::chrono::steady_clock::now(); + auto prep4 = std::chrono::steady_clock::now(); std::chrono::duration dur0 = prep1 - prep0; std::chrono::duration dur1 = prep2 - prep1; std::chrono::duration dur2 = prep3 - prep2; std::chrono::duration dur3 = prep4 - prep3; - std::chrono::duration dur = prep4 - prep0; - std::cout << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." 
<< std::endl; + std::chrono::duration dur = prep4 - prep0; + std::cout << comm_rank << ":prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() + << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." + << std::endl; #endif // 4. BFS iteration @@ -350,8 +353,9 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur1 = topdown2 - topdown1; std::chrono::duration dur2 = topdown3 - topdown2; std::chrono::duration dur = topdown3 - topdown0; - std::cout << "topdown (prim,vf,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << ") s." << std::endl; + std::cout << comm_rank << ":depth=" << depth << " topdown (prim,vf,host) took " + << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," + << dur2.count() << ") s." << std::endl; #endif break; } @@ -368,7 +372,8 @@ void bfs(raft::handle_t const& handle, #endif if (direction_optimizing) { - // FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed concurrently. + // FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed + // concurrently. // FIXME: also the above fill_edge_dst_property can be executed concurrently. auto m_f = thrust::transform_reduce( handle.get_thrust_policy(), @@ -455,9 +460,11 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur4 = topdown5 - topdown4; std::chrono::duration dur5 = topdown6 - topdown5; std::chrono::duration dur = topdown6 - topdown0; - std::cout << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() - << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << "," - << dur5.count() << ") s." << std::endl; + std::cout << comm_rank << ":depth=" << depth + << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size + << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() + << "," << dur5.count() << ") s." << std::endl; #endif } else { // bottom up #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -549,8 +556,9 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; std::chrono::duration dur = bottomup2 - bottomup0; - std::cout << "bottomup (prim+,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() - << ") s." << std::endl; + std::cout << comm_rank << ":depth=" << depth << " bottomup (prim+,host) took " + << dur.count() << " (" << dur0.count() << "," << dur1.count() << ") s." + << std::endl; #endif break; } @@ -605,7 +613,12 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur3 = bottomup4 - bottomup3; std::chrono::duration dur4 = bottomup5 - bottomup4; std::chrono::duration dur = bottomup5 - bottomup0; - std::cout << depth << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") s." 
<< std::endl; + std::cout << comm_rank << ":depth=" << depth + << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size + << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices + << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() + << ") s." << std::endl; #endif } cur_aggregate_frontier_size = next_aggregate_frontier_size; From 42c4d0b48d4792ddaf92eac11311eb2650a1cf52 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 10 Sep 2024 16:45:54 -0700 Subject: [PATCH 073/126] use global comm to shuffle in compute_renumber_map (to avoid P2P buffer initilaization in subcomm) --- cpp/src/structure/renumber_edgelist_impl.cuh | 399 +++++++++---------- 1 file changed, 197 insertions(+), 202 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 27bf9c4fcdf..e4958e65d94 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -262,166 +262,113 @@ std::tuple, std::vector, vertex_t> compu 1; // shouldn't be 0 (in that case this hash function will coincide with the hash function // used to map vertices to GPUs, and we may not see the expected randomization) - rmm::device_uvector sorted_unique_majors(0, handle.get_stream()); - { - auto edge_count_vectors = num_bins > 1 - ? std::make_optional>>( - edgelist_majors.size(), std::vector(num_bins)) - : std::nullopt; - if (edge_count_vectors) { - for (size_t i = 0; i < edgelist_majors.size(); ++i) { - rmm::device_uvector d_edge_counts(num_bins, handle.get_stream()); - thrust::fill( - handle.get_thrust_policy(), d_edge_counts.begin(), d_edge_counts.end(), edge_t{0}); - thrust::for_each(handle.get_thrust_policy(), - edgelist_majors[i], - edgelist_majors[i] + edgelist_edge_counts[i], - [counts = raft::device_span( - d_edge_counts.data(), d_edge_counts.size())] __device__(auto v) { - cuco::detail::MurmurHash3_32 hash_func{hash_seed}; - cuda::atomic_ref atomic_counter( - counts[hash_func(v) % num_bins]); - atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); - }); - raft::update_host((*edge_count_vectors)[i].data(), - d_edge_counts.data(), - d_edge_counts.size(), - handle.get_stream()); - } - handle.sync_stream(); + auto edge_major_count_vectors = num_bins > 1 + ? std::make_optional>>( + edgelist_majors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_major_count_vectors) { + for (size_t i = 0; i < edgelist_majors.size(); ++i) { + rmm::device_uvector d_edge_major_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_major_counts.begin(), + d_edge_major_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_major_counts.data(), + d_edge_major_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_major_count_vectors)[i].data(), + d_edge_major_counts.data(), + d_edge_major_counts.size(), + handle.get_stream()); + } + } + + auto edge_minor_count_vectors = num_bins > 1 + ? 
std::make_optional>>( + edgelist_minors.size(), std::vector(num_bins)) + : std::nullopt; + if (edge_minor_count_vectors) { + for (size_t i = 0; i < edgelist_minors.size(); ++i) { + rmm::device_uvector d_edge_minor_counts(num_bins, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + d_edge_minor_counts.begin(), + d_edge_minor_counts.end(), + edge_t{0}); + thrust::for_each( + handle.get_thrust_policy(), + edgelist_minors[i], + edgelist_minors[i] + edgelist_edge_counts[i], + [counts = raft::device_span(d_edge_minor_counts.data(), + d_edge_minor_counts.size())] __device__(auto v) { + cuco::detail::MurmurHash3_32 hash_func{hash_seed}; + cuda::atomic_ref atomic_counter( + counts[hash_func(v) % num_bins]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); + raft::update_host((*edge_minor_count_vectors)[i].data(), + d_edge_minor_counts.data(), + d_edge_minor_counts.size(), + handle.get_stream()); } + } - for (size_t i = 0; i < num_bins; ++i) { - std::vector> - edge_partition_sorted_unique_majors{}; // for bin "i" - edge_partition_sorted_unique_majors.reserve(edgelist_majors.size()); + handle.sync_stream(); + + for (size_t i = 0; i < num_bins; ++i) { + std::vector> edge_partition_tmp_majors{}; // for bin "i" + { + edge_partition_tmp_majors.reserve(edgelist_majors.size()); for (size_t j = 0; j < edgelist_majors.size(); ++j) { - rmm::device_uvector majors(0, handle.get_stream()); + rmm::device_uvector tmp_majors(0, handle.get_stream()); if (num_bins > 1) { - majors.resize((*edge_count_vectors)[j][i], handle.get_stream()); + tmp_majors.resize((*edge_major_count_vectors)[j][i], handle.get_stream()); thrust::copy_if(handle.get_thrust_policy(), edgelist_majors[j], edgelist_majors[j] + edgelist_edge_counts[j], - majors.begin(), + tmp_majors.begin(), [i] __device__(auto v) { cuco::detail::MurmurHash3_32 hash_func{hash_seed}; return (static_cast(hash_func(v) % num_bins) == i); }); } else { - majors.resize(edgelist_edge_counts[j], handle.get_stream()); + tmp_majors.resize(edgelist_edge_counts[j], handle.get_stream()); thrust::copy(handle.get_thrust_policy(), edgelist_majors[j], edgelist_majors[j] + edgelist_edge_counts[j], - majors.begin()); + tmp_majors.begin()); } - thrust::sort(handle.get_thrust_policy(), majors.begin(), majors.end()); - majors.resize(thrust::distance( - majors.begin(), - thrust::unique(handle.get_thrust_policy(), majors.begin(), majors.end())), - handle.get_stream()); - majors.shrink_to_fit(handle.get_stream()); - - edge_partition_sorted_unique_majors.push_back(std::move(majors)); - } - - rmm::device_uvector tmp_majors(0, handle.get_stream()); - if constexpr (multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - assert(static_cast(minor_comm_size) == - edge_partition_sorted_unique_majors.size()); - - if (minor_comm_size > 1) { - std::vector tx_counts(minor_comm_size); - for (size_t j = 0; j < edge_partition_sorted_unique_majors.size(); ++j) { - tx_counts[j] = edge_partition_sorted_unique_majors[j].size(); - } - tmp_majors.resize(std::reduce(tx_counts.begin(), tx_counts.end()), handle.get_stream()); - size_t offset{0}; - for (size_t j = 0; j < edge_partition_sorted_unique_majors.size(); ++j) { - thrust::copy(handle.get_thrust_policy(), - edge_partition_sorted_unique_majors[j].begin(), - edge_partition_sorted_unique_majors[j].end(), - tmp_majors.begin() + offset); - offset += tx_counts[j]; - } - edge_partition_sorted_unique_majors.clear(); - - 
std::tie(tmp_majors, std::ignore) = - shuffle_values(minor_comm, tmp_majors.begin(), tx_counts, handle.get_stream()); - - thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); - tmp_majors.resize( - thrust::distance( - tmp_majors.begin(), - thrust::unique(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end())), - handle.get_stream()); - tmp_majors.shrink_to_fit(handle.get_stream()); - } else { - assert(edge_partition_sorted_unique_majors.size() == 1); - tmp_majors = std::move(edge_partition_sorted_unique_majors[0]); - } - } else { - assert(edge_partition_sorted_unique_majors.size() == 1); - tmp_majors = std::move(edge_partition_sorted_unique_majors[0]); - } - - if (i == 0) { - sorted_unique_majors = std::move(tmp_majors); - } else { - rmm::device_uvector merged_majors( - sorted_unique_majors.size() + tmp_majors.size(), handle.get_stream()); - thrust::merge(handle.get_thrust_policy(), - sorted_unique_majors.begin(), - sorted_unique_majors.end(), - tmp_majors.begin(), - tmp_majors.end(), - merged_majors.begin()); // merging two unique sets from different hash - // bins, so the merged set can't have duplicates - sorted_unique_majors = std::move(merged_majors); - } - } - } + thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); + tmp_majors.resize( + thrust::distance( + tmp_majors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end())), + handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); - rmm::device_uvector sorted_unique_minors(0, handle.get_stream()); - { - auto edge_count_vectors = num_bins > 1 - ? std::make_optional>>( - edgelist_minors.size(), std::vector(num_bins)) - : std::nullopt; - if (edge_count_vectors) { - for (size_t i = 0; i < edgelist_minors.size(); ++i) { - rmm::device_uvector d_edge_counts(num_bins, handle.get_stream()); - thrust::fill( - handle.get_thrust_policy(), d_edge_counts.begin(), d_edge_counts.end(), edge_t{0}); - thrust::for_each(handle.get_thrust_policy(), - edgelist_minors[i], - edgelist_minors[i] + edgelist_edge_counts[i], - [counts = raft::device_span( - d_edge_counts.data(), d_edge_counts.size())] __device__(auto v) { - cuco::detail::MurmurHash3_32 hash_func{hash_seed}; - cuda::atomic_ref atomic_counter( - counts[hash_func(v) % num_bins]); - atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); - }); - raft::update_host((*edge_count_vectors)[i].data(), - d_edge_counts.data(), - d_edge_counts.size(), - handle.get_stream()); + edge_partition_tmp_majors.push_back(std::move(tmp_majors)); } - handle.sync_stream(); } - for (size_t i = 0; i < num_bins; ++i) { + rmm::device_uvector tmp_minors(0, handle.get_stream()); + { edge_t bin_size{0}; - if (edge_count_vectors) { + if (edge_minor_count_vectors) { for (size_t j = 0; j < edgelist_minors.size(); ++j) { - bin_size += (*edge_count_vectors)[j][i]; + bin_size += (*edge_minor_count_vectors)[j][i]; } } else { bin_size = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); } - rmm::device_uvector tmp_minors(bin_size, handle.get_stream()); + tmp_minors.resize(bin_size, handle.get_stream()); + edge_t offset{0}; for (size_t j = 0; j < edgelist_minors.size(); ++j) { if (num_bins > 1) { @@ -433,7 +380,7 @@ std::tuple, std::vector, vertex_t> compu cuco::detail::MurmurHash3_32 hash_func{hash_seed}; return (static_cast(hash_func(v) % num_bins) == i); }); - offset += (*edge_count_vectors)[j][i]; + offset += (*edge_minor_count_vectors)[j][i]; } else { 
thrust::copy(handle.get_thrust_policy(), edgelist_minors[j], @@ -449,78 +396,126 @@ std::tuple, std::vector, vertex_t> compu thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), handle.get_stream()); tmp_minors.shrink_to_fit(handle.get_stream()); + } - if constexpr (multi_gpu) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - compute_gpu_id_from_ext_vertex_t gpu_id_func{ - comm_size, major_comm_size, minor_comm_size}; - std::tie(tmp_minors, std::ignore) = groupby_gpu_id_and_shuffle_values( - major_comm, - tmp_minors.begin(), - tmp_minors.end(), - [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { - auto comm_rank = gpu_id_func(v); - return partition_manager::compute_major_comm_rank_from_global_comm_rank( - major_comm_size, minor_comm_size, comm_rank); - }, - handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); - tmp_minors.resize( - thrust::distance( - tmp_minors.begin(), - thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), - handle.get_stream()); - tmp_minors.shrink_to_fit(handle.get_stream()); + rmm::device_uvector tmp_vertices(0, handle.get_stream()); + if (multi_gpu && (handle.get_comms().get_size() > 1)) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + compute_gpu_id_from_ext_vertex_t gpu_id_func{ + comm_size, major_comm_size, minor_comm_size}; + auto d_minor_counts = groupby_and_count( + tmp_minors.begin(), + tmp_minors.end(), + [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { + return partition_manager::compute_major_comm_rank_from_global_comm_rank( + major_comm_size, minor_comm_size, gpu_id_func(v)); + }, + major_comm_size, + std::numeric_limits::max(), + handle.get_stream()); + std::vector h_minor_counts(d_minor_counts.size()); + raft::update_host( + h_minor_counts.data(), d_minor_counts.data(), d_minor_counts.size(), handle.get_stream()); + handle.sync_stream(); + std::vector h_minor_displacements(h_minor_counts.size()); + std::exclusive_scan( + h_minor_counts.begin(), h_minor_counts.end(), h_minor_displacements.begin(), size_t{0}); + + std::vector tx_counts(comm_size, 0); + for (size_t j = 0; j < edge_partition_tmp_majors.size(); ++j) { + auto idx = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, major_comm_rank, j); + tx_counts[idx] = edge_partition_tmp_majors[j].size(); } - - if (i == 0) { - sorted_unique_minors = std::move(tmp_minors); - } else { - rmm::device_uvector merged_minors( - sorted_unique_minors.size() + tmp_minors.size(), handle.get_stream()); - thrust::merge(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end(), - tmp_minors.begin(), - tmp_minors.end(), - merged_minors.begin()); - 
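
// A sketch of the single-shuffle packing used in the updated code path below (tx_counts /
// tx_displacements): per-destination-rank major and minor contributions are combined into one
// tx_counts vector, displacements come from an exclusive scan, and both groups are copied into
// one contiguous send buffer so a single shuffle_values() over comm suffices instead of separate
// minor_comm and major_comm shuffles. This is a simplified host-side illustration; the rank
// indexing and all names here are assumptions, and it assumes at least one rank.
//
// #include <algorithm>
// #include <cstddef>
// #include <numeric>
// #include <vector>
//
// std::vector<int> pack_for_single_shuffle(std::vector<std::vector<int>> const& majors_per_rank,
//                                          std::vector<std::vector<int>> const& minors_per_rank,
//                                          std::vector<size_t>& tx_counts /* out */)
// {
//   auto comm_size = majors_per_rank.size();  // assumed >= 1
//   tx_counts.assign(comm_size, 0);
//   for (size_t r = 0; r < comm_size; ++r) {
//     tx_counts[r] = majors_per_rank[r].size() + minors_per_rank[r].size();
//   }
//   std::vector<size_t> tx_displacements(comm_size);
//   std::exclusive_scan(tx_counts.begin(), tx_counts.end(), tx_displacements.begin(), size_t{0});
//
//   std::vector<int> send_buffer(tx_displacements.back() + tx_counts.back());
//   for (size_t r = 0; r < comm_size; ++r) {
//     auto out = send_buffer.begin() + tx_displacements[r];
//     out = std::copy(majors_per_rank[r].begin(), majors_per_rank[r].end(), out);
//     std::copy(minors_per_rank[r].begin(), minors_per_rank[r].end(), out);
//   }
//   return send_buffer;  // one buffer, one all-to-all, instead of two separate shuffles
// }
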
sorted_unique_minors = std::move(merged_minors); - tmp_minors.resize(0, handle.get_stream()); - tmp_minors.shrink_to_fit(handle.get_stream()); - sorted_unique_minors.resize(thrust::distance(sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_unique_minors.begin(), - sorted_unique_minors.end())), - handle.get_stream()); - sorted_unique_minors.shrink_to_fit(handle.get_stream()); + for (size_t j = 0; j < h_minor_counts.size(); ++j) { + auto idx = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, j, minor_comm_rank); + tx_counts[idx] += h_minor_counts[j]; } + std::vector tx_displacements(comm_size); + std::exclusive_scan( + tx_counts.begin(), tx_counts.end(), tx_displacements.begin(), size_t{0}); + tmp_vertices.resize(tx_displacements.back() + tx_counts.back(), handle.get_stream()); + for (size_t j = 0; j < edge_partition_tmp_majors.size(); ++j) { + auto idx = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, major_comm_rank, j); + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_majors[j].begin(), + edge_partition_tmp_majors[j].end(), + tmp_vertices.begin() + tx_displacements[idx]); + edge_partition_tmp_majors[j].resize(0, handle.get_stream()); + edge_partition_tmp_majors[j].shrink_to_fit(handle.get_stream()); + } + for (size_t j = 0; j < h_minor_counts.size(); ++j) { + auto idx = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, j, minor_comm_rank); + thrust::copy( + handle.get_thrust_policy(), + tmp_minors.begin() + h_minor_displacements[j], + tmp_minors.begin() + (h_minor_displacements[j] + h_minor_counts[j]), + tmp_vertices.begin() + tx_displacements[idx] + (tx_counts[idx] - h_minor_counts[j])); + } + tmp_minors.resize(0, handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + + // single shuffle_values() on comm instead of one shuffle_values() on minor_comm & one + // shuffle_values() on majro_comm (to cut NCCL P2P buffer size) + std::tie(tmp_vertices, std::ignore) = + shuffle_values(comm, tmp_vertices.begin(), tx_counts, handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end()); + tmp_vertices.resize( + thrust::distance( + tmp_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end())), + handle.get_stream()); + tmp_vertices.shrink_to_fit(handle.get_stream()); + } else { + assert(edge_partition_sorted_unique_majors.size() == 1); + auto& tmp_majors = edge_partition_tmp_majors[0]; + rmm::device_uvector merged_vertices(tmp_majors.size() + tmp_minors.size(), + handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + tmp_majors.begin(), + tmp_majors.end(), + tmp_minors.begin(), + tmp_minors.end(), + merged_vertices.begin()); + tmp_majors.resize(0, handle.get_stream()); + tmp_majors.shrink_to_fit(handle.get_stream()); + tmp_minors.resize(0, handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + merged_vertices.resize(thrust::distance(merged_vertices.begin(), + thrust::unique(handle.get_thrust_policy(), + merged_vertices.begin(), + merged_vertices.end())), + handle.get_stream()); + merged_vertices.shrink_to_fit(handle.get_stream()); + tmp_vertices = std::move(merged_vertices); } - } - sorted_local_vertices.resize(sorted_unique_majors.size() + sorted_unique_minors.size(), - handle.get_stream()); - thrust::merge(handle.get_thrust_policy(), - 
sorted_unique_majors.begin(), - sorted_unique_majors.end(), - sorted_unique_minors.begin(), - sorted_unique_minors.end(), - sorted_local_vertices.begin()); - sorted_unique_majors.resize(0, handle.get_stream()); - sorted_unique_majors.shrink_to_fit(handle.get_stream()); - sorted_unique_minors.resize(0, handle.get_stream()); - sorted_unique_minors.shrink_to_fit(handle.get_stream()); - sorted_local_vertices.resize(thrust::distance(sorted_local_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), - sorted_local_vertices.begin(), - sorted_local_vertices.end())), - handle.get_stream()); - sorted_local_vertices.shrink_to_fit(handle.get_stream()); + if (sorted_local_vertices.size() == 0) { + sorted_local_vertices = std::move(tmp_vertices); + } else { + rmm::device_uvector merged_vertices( + sorted_local_vertices.size() + tmp_vertices.size(), handle.get_stream()); + thrust::merge(handle.get_thrust_policy(), + sorted_local_vertices.begin(), + sorted_local_vertices.end(), + tmp_vertices.begin(), + tmp_vertices.end(), + merged_vertices.begin()); // merging two unique sets from different hash + // bins, so the merged set can't have duplicates + sorted_local_vertices = std::move(merged_vertices); + } + } } else { sorted_local_vertices = std::move(*local_vertices); thrust::sort( From 255766847fd76de3616e4417bc79919050aa72bb Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 10 Sep 2024 23:54:06 -0700 Subject: [PATCH 074/126] reduce small memory allocations --- .../create_graph_from_edgelist_impl.cuh | 293 ++++++++++-------- 1 file changed, 162 insertions(+), 131 deletions(-) diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index b2ac05b3fa0..4b6019efcff 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -325,7 +325,7 @@ create_graph_from_partitioned_edgelist( std::optional>>&& edge_partition_edgelist_weights, std::optional>>&& edge_partition_edgelist_edge_ids, std::optional>>&& edge_partition_edgelist_edge_types, - std::vector> const& edgelist_intra_partition_segment_offsets, + std::vector> const& edgelist_intra_partition_segment_offset_vectors, graph_properties_t graph_properties, bool renumber) { @@ -341,7 +341,7 @@ create_graph_from_partitioned_edgelist( std::cout << comm_rank << ":create_graph_from_partitioned 0" << std::endl; #endif - std::vector < edge_t> edgelist_edge_counts(minor_comm_size, edge_t{0}); + std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); for (size_t i = 0; i < edgelist_edge_counts.size(); ++i) { edgelist_edge_counts[i] = static_cast(edge_partition_edgelist_srcs[i].size()); } @@ -352,14 +352,14 @@ create_graph_from_partitioned_edgelist( src_ptrs[i] = edge_partition_edgelist_srcs[i].begin(); dst_ptrs[i] = edge_partition_edgelist_dsts[i].begin(); } - auto [renumber_map_labels, meta] = - cugraph::renumber_edgelist(handle, - std::move(local_vertices), - src_ptrs, - dst_ptrs, - edgelist_edge_counts, - edgelist_intra_partition_segment_offsets, - store_transposed); + auto [renumber_map_labels, meta] = cugraph::renumber_edgelist( + handle, + std::move(local_vertices), + src_ptrs, + dst_ptrs, + edgelist_edge_counts, + edgelist_intra_partition_segment_offset_vectors, + store_transposed); auto num_segments_per_vertex_partition = static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); @@ -807,7 +807,7 @@ create_graph_from_edgelist_impl( handle.sync_stream(); std::vector 
edgelist_edge_counts(minor_comm_size, edge_t{0}); - auto edgelist_intra_partition_segment_offsets = std::vector>( + auto edgelist_intra_partition_segment_offset_vectors = std::vector>( minor_comm_size, std::vector(major_comm_size + 1, edge_t{0})); for (int i = 0; i < minor_comm_size; ++i) { edgelist_edge_counts[i] = std::accumulate(h_edge_counts.begin() + major_comm_size * i, @@ -815,7 +815,7 @@ create_graph_from_edgelist_impl( edge_t{0}); std::partial_sum(h_edge_counts.begin() + major_comm_size * i, h_edge_counts.begin() + major_comm_size * (i + 1), - edgelist_intra_partition_segment_offsets[i].begin() + 1); + edgelist_intra_partition_segment_offset_vectors[i].begin() + 1); } std::vector edgelist_displacements(minor_comm_size, edge_t{0}); std::partial_sum(edgelist_edge_counts.begin(), @@ -915,7 +915,7 @@ create_graph_from_edgelist_impl( std::move(edge_partition_edgelist_weights), std::move(edge_partition_edgelist_edge_ids), std::move(edge_partition_edgelist_edge_types), - edgelist_intra_partition_segment_offsets, + edgelist_intra_partition_segment_offset_vectors, graph_properties, renumber); } @@ -1051,26 +1051,25 @@ create_graph_from_edgelist_impl( std::cout << comm_rank << ":create_graph_from_edgelist_impl 1" << std::endl; #endif - std::vector>> edgelist_partitioned_srcs( - edgelist_srcs.size()); - std::vector>> edgelist_partitioned_dsts( - edgelist_srcs.size()); + auto num_chunks = edgelist_srcs.size(); + + std::vector> edgelist_edge_offset_vectors(num_chunks); + std::vector>> edgelist_partitioned_srcs(num_chunks); + std::vector>> edgelist_partitioned_dsts(num_chunks); auto edgelist_partitioned_weights = - edgelist_weights ? std::make_optional>>>( - edgelist_srcs.size()) - : std::nullopt; + edgelist_weights + ? std::make_optional>>>(num_chunks) + : std::nullopt; auto edgelist_partitioned_edge_ids = edgelist_edge_ids - ? std::make_optional>>>( - edgelist_srcs.size()) + ? std::make_optional>>>(num_chunks) : std::nullopt; auto edgelist_partitioned_edge_types = edgelist_edge_types - ? std::make_optional>>>( - edgelist_srcs.size()) + ? 
std::make_optional>>>(num_chunks) : std::nullopt; - for (size_t i = 0; i < edgelist_srcs.size(); ++i) { // iterate over input edge chunks + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks std::optional> this_chunk_weights{std::nullopt}; if (edgelist_weights) { this_chunk_weights = std::move((*edgelist_weights)[i]); } std::optional> this_chunk_edge_ids{std::nullopt}; @@ -1093,17 +1092,19 @@ create_graph_from_edgelist_impl( d_this_chunk_edge_counts.size(), handle.get_stream()); handle.sync_stream(); - std::vector h_this_chunk_edge_displacements(h_this_chunk_edge_counts.size()); - std::exclusive_scan(h_this_chunk_edge_counts.begin(), + std::vector h_this_chunk_edge_offsets( + h_this_chunk_edge_counts.size() + 1, + 0); // size = minor_comm_size (# local edge partitions) * major_comm_size (# segments in the + // local minor range) + std::inclusive_scan(h_this_chunk_edge_counts.begin(), h_this_chunk_edge_counts.end(), - h_this_chunk_edge_displacements.begin(), - size_t{0}); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_srcs(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_srcs[i].begin() + h_this_chunk_edge_displacements[j]; + h_this_chunk_edge_offsets.begin() + 1); + + for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { + rmm::device_uvector tmp_srcs(h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - + h_this_chunk_edge_offsets[j * major_comm_size], + handle.get_stream()); + auto input_first = edgelist_srcs[i].begin() + h_this_chunk_edge_offsets[j * major_comm_size]; thrust::copy( handle.get_thrust_policy(), input_first, input_first + tmp_srcs.size(), tmp_srcs.begin()); edgelist_partitioned_srcs[i].push_back(std::move(tmp_srcs)); @@ -1111,11 +1112,11 @@ create_graph_from_edgelist_impl( edgelist_srcs[i].resize(0, handle.get_stream()); edgelist_srcs[i].shrink_to_fit(handle.get_stream()); - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_dsts(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = edgelist_dsts[i].begin() + h_this_chunk_edge_displacements[j]; + for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { + rmm::device_uvector tmp_dsts(h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - + h_this_chunk_edge_offsets[j * major_comm_size], + handle.get_stream()); + auto input_first = edgelist_dsts[i].begin() + h_this_chunk_edge_offsets[j * major_comm_size]; thrust::copy( handle.get_thrust_policy(), input_first, input_first + tmp_dsts.size(), tmp_dsts.begin()); edgelist_partitioned_dsts[i].push_back(std::move(tmp_dsts)); @@ -1124,11 +1125,13 @@ create_graph_from_edgelist_impl( edgelist_dsts[i].shrink_to_fit(handle.get_stream()); if (this_chunk_weights) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_weights(h_this_chunk_edge_counts[j], handle.get_stream()); - auto input_first = (*this_chunk_weights).begin() + h_this_chunk_edge_displacements[j]; + for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { + rmm::device_uvector tmp_weights( + h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - + h_this_chunk_edge_offsets[j * major_comm_size], + handle.get_stream()); + auto input_first = + 
(*this_chunk_weights).begin() + h_this_chunk_edge_offsets[j * major_comm_size]; thrust::copy(handle.get_thrust_policy(), input_first, input_first + tmp_weights.size(), @@ -1140,12 +1143,13 @@ create_graph_from_edgelist_impl( } if (this_chunk_edge_ids) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_ids(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_ids).begin() + h_this_chunk_edge_displacements[j]; + for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { + rmm::device_uvector tmp_edge_ids( + h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - + h_this_chunk_edge_offsets[j * major_comm_size], + handle.get_stream()); + auto input_first = + (*this_chunk_edge_ids).begin() + h_this_chunk_edge_offsets[j * major_comm_size]; thrust::copy(handle.get_thrust_policy(), input_first, input_first + tmp_edge_ids.size(), @@ -1157,12 +1161,13 @@ create_graph_from_edgelist_impl( } if (this_chunk_edge_types) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */ * - major_comm_size /* # segments in the local minor range */; - ++j) { - rmm::device_uvector tmp_edge_types(h_this_chunk_edge_counts[j], - handle.get_stream()); - auto input_first = (*this_chunk_edge_types).begin() + h_this_chunk_edge_displacements[j]; + for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { + rmm::device_uvector tmp_edge_types( + h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - + h_this_chunk_edge_offsets[j * major_comm_size], + handle.get_stream()); + auto input_first = + (*this_chunk_edge_types).begin() + h_this_chunk_edge_offsets[j * major_comm_size]; thrust::copy(handle.get_thrust_policy(), input_first, input_first + tmp_edge_types.size(), @@ -1172,6 +1177,8 @@ create_graph_from_edgelist_impl( (*this_chunk_edge_types).resize(0, handle.get_stream()); (*this_chunk_edge_types).shrink_to_fit(handle.get_stream()); } + + edgelist_edge_offset_vectors.push_back(std::move(h_this_chunk_edge_offsets)); } edgelist_srcs.clear(); edgelist_dsts.clear(); @@ -1185,7 +1192,8 @@ create_graph_from_edgelist_impl( std::cout << comm_rank << ":create_graph_from_edgelist_impl 2" << std::endl; #endif - auto edgelist_intra_partition_segment_offsets = std::vector>(minor_comm_size); + auto edgelist_intra_partition_segment_offset_vectors = + std::vector>(minor_comm_size); std::vector> edge_partition_edgelist_srcs{}; edge_partition_edgelist_srcs.reserve(minor_comm_size); @@ -1213,44 +1221,47 @@ create_graph_from_edgelist_impl( for (int i = 0; i < minor_comm_size; ++i) { // iterate over local edge partitions edge_t edge_count{0}; std::vector intra_partition_segment_sizes(major_comm_size, 0); - std::vector intra_segment_copy_output_displacements(major_comm_size * - edgelist_partitioned_srcs.size()); + std::vector intra_segment_copy_output_displacements(major_comm_size * num_chunks); for (int j = 0; j < major_comm_size /* # segments in the local minor range */; ++j) { edge_t displacement{0}; - for (size_t k = 0; k < edgelist_partitioned_srcs.size() /* # input edge chunks */; ++k) { - auto segment_size = edgelist_partitioned_srcs[k][i * major_comm_size + j].size(); + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]); edge_count += segment_size; intra_partition_segment_sizes[j] += segment_size; - 
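
// A sketch of the offset bookkeeping being built in this loop: given per-chunk sizes for each
// minor-range segment, it derives (1) per-segment offsets within the consolidated partition
// buffer and (2) the displacement at which each chunk's piece of a segment is copied. Plain host
// containers are used and all names are illustrative; it assumes every chunk reports the same
// number of segments.
//
// #include <cstddef>
// #include <vector>
//
// struct copy_layout_t {
//   std::vector<size_t> segment_offsets;              // size = num_segments + 1
//   std::vector<size_t> intra_segment_displacements;  // size = num_segments * num_chunks
// };
//
// copy_layout_t compute_copy_layout(std::vector<std::vector<size_t>> const& chunk_segment_sizes)
// {
//   auto num_chunks   = chunk_segment_sizes.size();
//   auto num_segments = num_chunks > 0 ? chunk_segment_sizes[0].size() : size_t{0};
//
//   copy_layout_t layout{};
//   layout.segment_offsets.assign(num_segments + 1, 0);
//   layout.intra_segment_displacements.assign(num_segments * num_chunks, 0);
//
//   for (size_t j = 0; j < num_segments; ++j) {
//     size_t displacement{0};
//     for (size_t k = 0; k < num_chunks; ++k) {
//       layout.intra_segment_displacements[j * num_chunks + k] = displacement;
//       displacement += chunk_segment_sizes[k][j];
//     }
//     layout.segment_offsets[j + 1] = layout.segment_offsets[j] + displacement;
//     // destination of chunk k's piece of segment j:
//     //   segment_offsets[j] + intra_segment_displacements[j * num_chunks + k]
//   }
//   return layout;
// }
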
intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k] = - displacement; + intra_segment_copy_output_displacements[j * num_chunks + k] = displacement; displacement += segment_size; } } - std::vector intra_partition_segment_offsets(major_comm_size + 1, 0); + std::vector intra_partition_segment_offset_vectors(major_comm_size + 1, 0); std::inclusive_scan(intra_partition_segment_sizes.begin(), intra_partition_segment_sizes.end(), - intra_partition_segment_offsets.begin() + 1); - + intra_partition_segment_offset_vectors.begin() + 1); #if 1 std::cout << comm_rank << ": i=" << i << " edge_count=" << edge_count << std::endl; #endif + rmm::device_uvector tmp_srcs(edge_count, handle.get_stream()); #if 1 std::cout << comm_rank << ": i=" << i << " tmp_srcs allocated" << std::endl; #endif for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_srcs.size(); ++k) { - auto& input_buffer = edgelist_partitioned_srcs[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_srcs.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_srcs.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); + for (size_t k = 0; k < num_chunks; ++k) { + auto input_first = edgelist_partitioned_srcs[k][i].begin() + + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; + auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + segment_size, + tmp_srcs.begin() + intra_partition_segment_offset_vectors[j] + + intra_segment_copy_output_displacements[j * num_chunks + k]); } } + for (size_t k = 0; k < num_chunks; ++k) { + edgelist_partitioned_srcs[k][i].resize(0, handle.get_stream()); + edgelist_partitioned_srcs[k][i].shrink_to_fit(handle.get_stream()); + } edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); rmm::device_uvector tmp_dsts(edge_count, handle.get_stream()); @@ -1258,75 +1269,92 @@ create_graph_from_edgelist_impl( std::cout << comm_rank << ": i=" << i << " tmp_dsts allocated" << std::endl; #endif for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = edgelist_partitioned_dsts[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_dsts.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); + for (size_t k = 0; k < num_chunks; ++k) { + auto input_first = edgelist_partitioned_dsts[k][i].begin() + + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; + auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + segment_size, + tmp_dsts.begin() + intra_partition_segment_offset_vectors[j] + + intra_segment_copy_output_displacements[j * num_chunks + k]); } } + for (size_t k = 0; k < num_chunks; ++k) { + edgelist_partitioned_dsts[k][i].resize(0, handle.get_stream()); + 
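
// A sketch of the early-release idiom used throughout this patch: resize(0) on an
// rmm::device_uvector only shrinks the logical size (capacity is unchanged), so shrink_to_fit()
// follows immediately to return the chunk's device allocation before the next large buffer is
// created, lowering peak memory use. The helper name below is illustrative, not part of the patch.
//
// #include <rmm/cuda_stream_view.hpp>
// #include <rmm/device_uvector.hpp>
//
// template <typename T>
// void release_device_buffer(rmm::device_uvector<T>& buffer, rmm::cuda_stream_view stream)
// {
//   buffer.resize(0, stream);      // logical size -> 0, capacity unchanged
//   buffer.shrink_to_fit(stream);  // reallocate to fit size 0, i.e. free the old allocation
// }
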
edgelist_partitioned_dsts[k][i].shrink_to_fit(handle.get_stream()); + } edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); if (edge_partition_edgelist_weights) { rmm::device_uvector tmp_weights(edge_count, handle.get_stream()); for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_weights)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_weights.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); + for (size_t k = 0; k < num_chunks; ++k) { + auto input_first = (*edgelist_partitioned_weights)[k][i].begin() + + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; + auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + segment_size, + tmp_weights.begin() + intra_partition_segment_offset_vectors[j] + + intra_segment_copy_output_displacements[j * num_chunks + k]); } } + for (size_t k = 0; k < num_chunks; ++k) { + (*edgelist_partitioned_weights)[k][i].resize(0, handle.get_stream()); + (*edgelist_partitioned_weights)[k][i].shrink_to_fit(handle.get_stream()); + } (*edge_partition_edgelist_weights).push_back(std::move(tmp_weights)); } if (edge_partition_edgelist_edge_ids) { rmm::device_uvector tmp_edge_ids(edge_count, handle.get_stream()); for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_ids)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_ids.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); + for (size_t k = 0; k < num_chunks; ++k) { + auto input_first = (*edgelist_partitioned_edge_ids)[k][i].begin() + + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; + auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + segment_size, + tmp_edge_ids.begin() + intra_partition_segment_offset_vectors[j] + + intra_segment_copy_output_displacements[j * num_chunks + k]); } } + for (size_t k = 0; k < num_chunks; ++k) { + (*edgelist_partitioned_edge_ids)[k][i].resize(0, handle.get_stream()); + (*edgelist_partitioned_edge_ids)[k][i].shrink_to_fit(handle.get_stream()); + } (*edge_partition_edgelist_edge_ids).push_back(std::move(tmp_edge_ids)); } if (edge_partition_edgelist_edge_types) { rmm::device_uvector tmp_edge_types(edge_count, handle.get_stream()); for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < edgelist_partitioned_dsts.size(); ++k) { - auto& input_buffer = (*edgelist_partitioned_edge_types)[k][i * major_comm_size + j]; - thrust::copy( - handle.get_thrust_policy(), - input_buffer.begin(), - input_buffer.end(), - tmp_edge_types.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * 
edgelist_partitioned_dsts.size() + k]); - input_buffer.resize(0, handle.get_stream()); - input_buffer.shrink_to_fit(handle.get_stream()); + for (size_t k = 0; k < num_chunks; ++k) { + auto input_first = (*edgelist_partitioned_edge_types)[k][i].begin() + + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; + auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + segment_size, + tmp_edge_types.begin() + intra_partition_segment_offset_vectors[j] + + intra_segment_copy_output_displacements[j * num_chunks + k]); } } + for (size_t k = 0; k < num_chunks; ++k) { + (*edgelist_partitioned_edge_types)[k][i].resize(0, handle.get_stream()); + (*edgelist_partitioned_edge_types)[k][i].shrink_to_fit(handle.get_stream()); + } (*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); } - edgelist_intra_partition_segment_offsets[i] = std::move(intra_partition_segment_offsets); + edgelist_intra_partition_segment_offset_vectors[i] = + std::move(intra_partition_segment_offset_vectors); } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -1347,7 +1375,7 @@ create_graph_from_edgelist_impl( std::move(edge_partition_edgelist_weights), std::move(edge_partition_edgelist_edge_ids), std::move(edge_partition_edgelist_edge_types), - edgelist_intra_partition_segment_offsets, + edgelist_intra_partition_segment_offset_vectors, graph_properties, renumber); } @@ -1410,7 +1438,8 @@ create_graph_from_edgelist_impl( handle, raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " + "Invalid input arguments: graph_properties.is_symmetric is true but the input edge " + "list is " "not symmetric."); } @@ -1420,7 +1449,8 @@ create_graph_from_edgelist_impl( handle, raft::device_span(edgelist_srcs.data(), edgelist_srcs.size()), raft::device_span(edgelist_dsts.data(), edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but the input edge " + "list " "has parallel edges."); } } @@ -1802,15 +1832,15 @@ create_graph_from_edgelist_impl( renumber); if (graph_properties.is_symmetric) { - CUGRAPH_EXPECTS( - (check_symmetric( - handle, - raft::device_span(aggregate_edgelist_srcs.data(), - aggregate_edgelist_srcs.size()), - raft::device_span(aggregate_edgelist_dsts.data(), - aggregate_edgelist_dsts.size()))), - "Invalid input arguments: graph_properties.is_symmetric is true but the input edge list is " - "not symmetric."); + CUGRAPH_EXPECTS((check_symmetric( + handle, + raft::device_span(aggregate_edgelist_srcs.data(), + aggregate_edgelist_srcs.size()), + raft::device_span(aggregate_edgelist_dsts.data(), + aggregate_edgelist_dsts.size()))), + "Invalid input arguments: graph_properties.is_symmetric is true but the " + "input edge list is " + "not symmetric."); } if (!graph_properties.is_multigraph) { @@ -1820,7 +1850,8 @@ create_graph_from_edgelist_impl( aggregate_edgelist_srcs.size()), raft::device_span(aggregate_edgelist_dsts.data(), aggregate_edgelist_dsts.size())), - "Invalid input arguments: graph_properties.is_multigraph is false but the input edge list " + "Invalid input arguments: graph_properties.is_multigraph is false but " + "the input edge list " "has 
parallel edges."); } } From 0381f22c74ba7de451b71240bf40fcf21a3750bf Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 11 Sep 2024 14:22:22 -0700 Subject: [PATCH 075/126] bug fix --- .../create_graph_from_edgelist_impl.cuh | 99 +++++++++++++------ 1 file changed, 67 insertions(+), 32 deletions(-) diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 4b6019efcff..2b8e25a1894 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -1043,16 +1043,47 @@ create_graph_from_edgelist_impl( } } - // 1. groupby each edge chunks to their target local adjacency matrix partition (and further + auto num_chunks = edgelist_srcs.size(); + + // 1. check whether we can temporarily store a 64 bit vertex ID in 48 bit. + + bool clip_high_order_zero_bits = false; + + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if constexpr (sizeof(vertex_t) == 8) { // 64 bit vertex ID + static_assert(std::is_signed_v); // __clzll takes a signed integer + size_t min_clz{sizeof(vertex_t) * 8}; + for (size_t i = 0; i < num_chunks; ++i) { + min_clz = thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_srcs[i].begin(), + edgelist_srcs[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + min_clz = thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_dsts[i].begin(), + edgelist_dsts[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + } + if (min_clz >= 16) { clip_high_order_zero_bits = true; } + std::cout << "min_clz=" << min_clz << std::endl; + } + + // 2. groupby each edge chunks to their target local adjacency matrix partition (and further // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex // IDs). #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 1" << std::endl; + std::cout << comm_rank << ":create_graph_from_edgelist_impl 1 clip_high_order_zero_bits=" + << clip_high_order_zero_bits << std::endl; #endif - auto num_chunks = edgelist_srcs.size(); - std::vector> edgelist_edge_offset_vectors(num_chunks); std::vector>> edgelist_partitioned_srcs(num_chunks); std::vector>> edgelist_partitioned_dsts(num_chunks); @@ -1178,7 +1209,7 @@ create_graph_from_edgelist_impl( (*this_chunk_edge_types).shrink_to_fit(handle.get_stream()); } - edgelist_edge_offset_vectors.push_back(std::move(h_this_chunk_edge_offsets)); + edgelist_edge_offset_vectors[i] = std::move(h_this_chunk_edge_offsets); } edgelist_srcs.clear(); edgelist_dsts.clear(); @@ -1186,7 +1217,7 @@ create_graph_from_edgelist_impl( if (edgelist_edge_ids) { (*edgelist_edge_ids).clear(); } if (edgelist_edge_types) { (*edgelist_edge_types).clear(); } - // 2. split the grouped edge chunks to local partitions + // 3. 
split the grouped edge chunks to local partitions #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank << ":create_graph_from_edgelist_impl 2" << std::endl; @@ -1225,18 +1256,18 @@ create_graph_from_edgelist_impl( for (int j = 0; j < major_comm_size /* # segments in the local minor range */; ++j) { edge_t displacement{0}; for (size_t k = 0; k < num_chunks; ++k) { - auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; edge_count += segment_size; intra_partition_segment_sizes[j] += segment_size; intra_segment_copy_output_displacements[j * num_chunks + k] = displacement; displacement += segment_size; } } - std::vector intra_partition_segment_offset_vectors(major_comm_size + 1, 0); + std::vector intra_partition_segment_offsets(major_comm_size + 1, 0); std::inclusive_scan(intra_partition_segment_sizes.begin(), intra_partition_segment_sizes.end(), - intra_partition_segment_offset_vectors.begin() + 1); + intra_partition_segment_offsets.begin() + 1); #if 1 std::cout << comm_rank << ": i=" << i << " edge_count=" << edge_count << std::endl; #endif @@ -1248,13 +1279,14 @@ create_graph_from_edgelist_impl( for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < num_chunks; ++k) { auto input_first = edgelist_partitioned_srcs[k][i].begin() + - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - + edgelist_edge_offset_vectors[k][i * major_comm_size]); + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; thrust::copy(handle.get_thrust_policy(), input_first, input_first + segment_size, - tmp_srcs.begin() + intra_partition_segment_offset_vectors[j] + + tmp_srcs.begin() + intra_partition_segment_offsets[j] + intra_segment_copy_output_displacements[j * num_chunks + k]); } } @@ -1271,13 +1303,14 @@ create_graph_from_edgelist_impl( for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < num_chunks; ++k) { auto input_first = edgelist_partitioned_dsts[k][i].begin() + - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - + edgelist_edge_offset_vectors[k][i * major_comm_size]); + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; thrust::copy(handle.get_thrust_policy(), input_first, input_first + segment_size, - tmp_dsts.begin() + intra_partition_segment_offset_vectors[j] + + tmp_dsts.begin() + intra_partition_segment_offsets[j] + intra_segment_copy_output_displacements[j * num_chunks + k]); } } @@ -1292,13 +1325,14 @@ create_graph_from_edgelist_impl( for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < num_chunks; ++k) { auto input_first = (*edgelist_partitioned_weights)[k][i].begin() + - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - 
- edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - + edgelist_edge_offset_vectors[k][i * major_comm_size]); + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; thrust::copy(handle.get_thrust_policy(), input_first, input_first + segment_size, - tmp_weights.begin() + intra_partition_segment_offset_vectors[j] + + tmp_weights.begin() + intra_partition_segment_offsets[j] + intra_segment_copy_output_displacements[j * num_chunks + k]); } } @@ -1314,13 +1348,14 @@ create_graph_from_edgelist_impl( for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < num_chunks; ++k) { auto input_first = (*edgelist_partitioned_edge_ids)[k][i].begin() + - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - + edgelist_edge_offset_vectors[k][i * major_comm_size]); + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; thrust::copy(handle.get_thrust_policy(), input_first, input_first + segment_size, - tmp_edge_ids.begin() + intra_partition_segment_offset_vectors[j] + + tmp_edge_ids.begin() + intra_partition_segment_offsets[j] + intra_segment_copy_output_displacements[j * num_chunks + k]); } } @@ -1336,13 +1371,14 @@ create_graph_from_edgelist_impl( for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < num_chunks; ++k) { auto input_first = (*edgelist_partitioned_edge_types)[k][i].begin() + - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - auto segment_size = (edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]); + (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - + edgelist_edge_offset_vectors[k][i * major_comm_size]); + auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - + edgelist_edge_offset_vectors[k][i * major_comm_size + j]; thrust::copy(handle.get_thrust_policy(), input_first, input_first + segment_size, - tmp_edge_types.begin() + intra_partition_segment_offset_vectors[j] + + tmp_edge_types.begin() + intra_partition_segment_offsets[j] + intra_segment_copy_output_displacements[j * num_chunks + k]); } } @@ -1353,8 +1389,7 @@ create_graph_from_edgelist_impl( (*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); } - edgelist_intra_partition_segment_offset_vectors[i] = - std::move(intra_partition_segment_offset_vectors); + edgelist_intra_partition_segment_offset_vectors[i] = std::move(intra_partition_segment_offsets); } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); From eb822da70a50fffb650f1c453b46ce397918443e Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 11 Sep 2024 22:02:01 -0700 Subject: [PATCH 076/126] temporarily store vertex IDs in 48 bit to cut peak memory usage --- .../create_graph_from_edgelist_impl.cuh | 176 ++++++++++++++---- 1 file changed, 138 insertions(+), 38 deletions(-) diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 2b8e25a1894..b83001c377a 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -44,6 +44,7 @@ 
#include #include +#include namespace cugraph { @@ -1047,7 +1048,7 @@ create_graph_from_edgelist_impl( // 1. check whether we can temporarily store a 64 bit vertex ID in 48 bit. - bool clip_high_order_zero_bits = false; + bool store_v_in_48bit = false; // if set to true, temporarily store vertex IDs in 48 bit static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); if constexpr (sizeof(vertex_t) == 8) { // 64 bit vertex ID @@ -1071,7 +1072,9 @@ create_graph_from_edgelist_impl( min_clz, thrust::minimum{}); } - if (min_clz >= 16) { clip_high_order_zero_bits = true; } + if (min_clz >= 16) { + store_v_in_48bit = true; + } std::cout << "min_clz=" << min_clz << std::endl; } @@ -1080,13 +1083,18 @@ create_graph_from_edgelist_impl( // IDs). #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 1 clip_high_order_zero_bits=" - << clip_high_order_zero_bits << std::endl; + std::cout << comm_rank + << ":create_graph_from_edgelist_impl 1 store_v_in_48bit=" << store_v_in_48bit + << std::endl; #endif std::vector> edgelist_edge_offset_vectors(num_chunks); - std::vector>> edgelist_partitioned_srcs(num_chunks); - std::vector>> edgelist_partitioned_dsts(num_chunks); + std::vector< + std::vector, rmm::device_uvector>>> + edgelist_partitioned_srcs(num_chunks); + std::vector< + std::vector, rmm::device_uvector>>> + edgelist_partitioned_dsts(num_chunks); auto edgelist_partitioned_weights = edgelist_weights ? std::make_optional>>>(num_chunks) @@ -1132,24 +1140,66 @@ create_graph_from_edgelist_impl( h_this_chunk_edge_offsets.begin() + 1); for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { - rmm::device_uvector tmp_srcs(h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - - h_this_chunk_edge_offsets[j * major_comm_size], - handle.get_stream()); - auto input_first = edgelist_srcs[i].begin() + h_this_chunk_edge_offsets[j * major_comm_size]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_srcs.size(), tmp_srcs.begin()); + auto partition_size = h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - + h_this_chunk_edge_offsets[j * major_comm_size]; + std::variant, rmm::device_uvector> tmp_srcs = + rmm::device_uvector(0, handle.get_stream()); + if (store_v_in_48bit) { + tmp_srcs = rmm::device_uvector(partition_size * 3, handle.get_stream()); + auto input_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [src_first = edgelist_srcs[i].begin() + + h_this_chunk_edge_offsets[j * major_comm_size]] __device__(size_t i) { + auto v = static_cast(*(src_first + (i / 3))); + return static_cast((v >> (16 * (i % 3))) & uint64_t{0xffff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + partition_size * 3, + std::get<1>(tmp_srcs).begin()); + } else { + std::get<0>(tmp_srcs).resize(partition_size, handle.get_stream()); + auto input_first = + edgelist_srcs[i].begin() + h_this_chunk_edge_offsets[j * major_comm_size]; + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + partition_size, + std::get<0>(tmp_srcs).begin()); + } edgelist_partitioned_srcs[i].push_back(std::move(tmp_srcs)); } edgelist_srcs[i].resize(0, handle.get_stream()); edgelist_srcs[i].shrink_to_fit(handle.get_stream()); for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { - rmm::device_uvector tmp_dsts(h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - - h_this_chunk_edge_offsets[j * major_comm_size], - 
handle.get_stream()); - auto input_first = edgelist_dsts[i].begin() + h_this_chunk_edge_offsets[j * major_comm_size]; - thrust::copy( - handle.get_thrust_policy(), input_first, input_first + tmp_dsts.size(), tmp_dsts.begin()); + auto partition_size = h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - + h_this_chunk_edge_offsets[j * major_comm_size]; + std::variant, rmm::device_uvector> tmp_dsts = + rmm::device_uvector(0, handle.get_stream()); + if (store_v_in_48bit) { + tmp_dsts = rmm::device_uvector(partition_size * 3, handle.get_stream()); + auto input_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [dst_first = edgelist_dsts[i].begin() + + h_this_chunk_edge_offsets[j * major_comm_size]] __device__(size_t i) { + auto v = static_cast(*(dst_first + (i / 3))); + return static_cast((v >> (16 * (i % 3))) & uint64_t{0xffff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + partition_size * 3, + std::get<1>(tmp_dsts).begin()); + } else { + std::get<0>(tmp_dsts).resize(partition_size, handle.get_stream()); + auto input_first = + edgelist_dsts[i].begin() + h_this_chunk_edge_offsets[j * major_comm_size]; + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + partition_size, + std::get<0>(tmp_dsts).begin()); + } edgelist_partitioned_dsts[i].push_back(std::move(tmp_dsts)); } edgelist_dsts[i].resize(0, handle.get_stream()); @@ -1278,21 +1328,46 @@ create_graph_from_edgelist_impl( #endif for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < num_chunks; ++k) { - auto input_first = edgelist_partitioned_srcs[k][i].begin() + - (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - - edgelist_edge_offset_vectors[k][i * major_comm_size]); + auto segment_offset = edgelist_edge_offset_vectors[k][i * major_comm_size + j] - + edgelist_edge_offset_vectors[k][i * major_comm_size]; auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + segment_size, - tmp_srcs.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * num_chunks + k]); + if (store_v_in_48bit) { + auto input_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [uint16_first = std::get<1>(edgelist_partitioned_srcs[k][i]).begin() + + segment_offset * 3] __device__(size_t i) { + auto v0 = *(uint16_first + i * 3 + 0); + auto v1 = *(uint16_first + i * 3 + 1); + auto v2 = *(uint16_first + i * 3 + 2); + return static_cast(static_cast(v0) | + (static_cast(v1) << 16) | + (static_cast(v2) << 32)); + })); + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + segment_size, + tmp_srcs.begin() + intra_partition_segment_offsets[j] + + intra_segment_copy_output_displacements[j * num_chunks + k]); + } else { + auto input_first = std::get<0>(edgelist_partitioned_srcs[k][i]).begin() + segment_offset; + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + segment_size, + tmp_srcs.begin() + intra_partition_segment_offsets[j] + + intra_segment_copy_output_displacements[j * num_chunks + k]); + } } } for (size_t k = 0; k < num_chunks; ++k) { - edgelist_partitioned_srcs[k][i].resize(0, handle.get_stream()); - edgelist_partitioned_srcs[k][i].shrink_to_fit(handle.get_stream()); + if (store_v_in_48bit) { + 
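
// A sketch of the 48-bit staging trick applied in this hunk: when every vertex ID has at least
// 16 leading zero bits (the min_clz >= 16 check with __clzll earlier), a 64-bit ID round-trips
// losslessly through three 16-bit words, shrinking the temporary src/dst buffers by 25%. The
// function names below are illustrative stand-ins for the packing/unpacking lambdas above.
//
// #include <array>
// #include <cstdint>
//
// std::array<uint16_t, 3> pack48(int64_t v)
// {
//   auto u = static_cast<uint64_t>(v);
//   return {static_cast<uint16_t>(u & 0xffff),
//           static_cast<uint16_t>((u >> 16) & 0xffff),
//           static_cast<uint16_t>((u >> 32) & 0xffff)};
// }
//
// int64_t unpack48(std::array<uint16_t, 3> const& w)
// {
//   return static_cast<int64_t>(static_cast<uint64_t>(w[0]) |
//                               (static_cast<uint64_t>(w[1]) << 16) |
//                               (static_cast<uint64_t>(w[2]) << 32));
// }
//
// // valid only when the top 16 bits of v are zero, mirroring the min_clz >= 16 check:
// // unpack48(pack48(v)) == v
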
std::get<1>(edgelist_partitioned_srcs[k][i]).resize(0, handle.get_stream()); + std::get<1>(edgelist_partitioned_srcs[k][i]).shrink_to_fit(handle.get_stream()); + } else { + std::get<0>(edgelist_partitioned_srcs[k][i]).resize(0, handle.get_stream()); + std::get<0>(edgelist_partitioned_srcs[k][i]).shrink_to_fit(handle.get_stream()); + } } edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); @@ -1302,21 +1377,46 @@ create_graph_from_edgelist_impl( #endif for (int j = 0; j < major_comm_size; ++j) { for (size_t k = 0; k < num_chunks; ++k) { - auto input_first = edgelist_partitioned_dsts[k][i].begin() + - (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - - edgelist_edge_offset_vectors[k][i * major_comm_size]); + auto segment_offset = edgelist_edge_offset_vectors[k][i * major_comm_size + j] - + edgelist_edge_offset_vectors[k][i * major_comm_size]; auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + segment_size, - tmp_dsts.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * num_chunks + k]); + if (store_v_in_48bit) { + auto input_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [uint16_first = std::get<1>(edgelist_partitioned_dsts[k][i]).begin() + + segment_offset * 3] __device__(size_t i) { + auto v0 = *(uint16_first + i * 3 + 0); + auto v1 = *(uint16_first + i * 3 + 1); + auto v2 = *(uint16_first + i * 3 + 2); + return static_cast(static_cast(v0) | + (static_cast(v1) << 16) | + (static_cast(v2) << 32)); + })); + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + segment_size, + tmp_dsts.begin() + intra_partition_segment_offsets[j] + + intra_segment_copy_output_displacements[j * num_chunks + k]); + } else { + auto input_first = std::get<0>(edgelist_partitioned_dsts[k][i]).begin() + segment_offset; + thrust::copy(handle.get_thrust_policy(), + input_first, + input_first + segment_size, + tmp_dsts.begin() + intra_partition_segment_offsets[j] + + intra_segment_copy_output_displacements[j * num_chunks + k]); + } } } for (size_t k = 0; k < num_chunks; ++k) { - edgelist_partitioned_dsts[k][i].resize(0, handle.get_stream()); - edgelist_partitioned_dsts[k][i].shrink_to_fit(handle.get_stream()); + if (store_v_in_48bit) { + std::get<1>(edgelist_partitioned_dsts[k][i]).resize(0, handle.get_stream()); + std::get<1>(edgelist_partitioned_dsts[k][i]).shrink_to_fit(handle.get_stream()); + } else { + std::get<0>(edgelist_partitioned_dsts[k][i]).resize(0, handle.get_stream()); + std::get<0>(edgelist_partitioned_dsts[k][i]).shrink_to_fit(handle.get_stream()); + } } edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); From a067f086a868b2ad582fd0c4a92fc9add24420dc Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 13 Sep 2024 15:31:25 -0700 Subject: [PATCH 077/126] update v_list bitmap bcast --- .../detail/extract_transform_v_frontier_e.cuh | 123 ++++++++++-------- .../prims/detail/per_v_transform_reduce_e.cuh | 95 +++++++------- cpp/src/prims/fill_edge_src_dst_property.cuh | 66 ++++++---- .../prims/update_edge_src_dst_property.cuh | 42 +++--- cpp/src/prims/vertex_frontier.cuh | 44 +++---- 5 files changed, 203 insertions(+), 167 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 06a241681d2..d7828375543 
100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -616,31 +616,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } - // update frontier bitmap (used to reduce broadcast bandwidth size) - - std:: - conditional_t>, std::byte /* dummy */> - frontier_bitmap{}; - if constexpr (try_bitmap) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - auto const minor_comm_rank = minor_comm.get_rank(); - auto segment_offsets = - graph_view.local_edge_partition_segment_offsets(static_cast(minor_comm_rank)); - size_t bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) - : graph_view.local_vertex_partition_range_size(); - - frontier_bitmap = - compute_vertex_list_bitmap_info(frontier_key_first, - frontier_key_last, - graph_view.local_vertex_partition_range_first(), - graph_view.local_vertex_partition_range_first() + bool_size, - handle.get_stream()); - } - } - - // compute local max_pushes + // 2. compute local max_pushes size_t local_max_pushes{}; { @@ -663,10 +639,13 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, frontier_major_first, frontier_major_last, handle.get_stream()); } - // communication over minor_comm + // 3. communication over minor_comm std::vector local_frontier_sizes{}; - std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_firsts{}; + std::conditional_t, std::byte /* dummy */> + local_frontier_range_lasts{}; std::optional>> key_segment_offset_vectors{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -676,12 +655,25 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, static_cast(thrust::distance(frontier_key_first, frontier_key_last)), handle.get_stream()); if constexpr (try_bitmap) { - auto tmp_flags = host_scalar_allgather( - minor_comm, frontier_bitmap ? uint8_t{1} : uint8_t{0}, handle.get_stream()); - use_bitmap_flags.resize(tmp_flags.size()); - std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { - return flag == uint8_t{1}; - }); + std::array v_list_range = {vertex_t{0}, vertex_t{0}}; + auto v_list_size = + static_cast(thrust::distance(frontier_key_first, frontier_key_last)); + if (v_list_size > 0) { + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [frontier_key_first, v_list_size] __device__(size_t i) { + return (i == 0) ? 
*frontier_key_first + : (*(frontier_key_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + handle.sync_stream(); + } + local_frontier_range_firsts = + host_scalar_allgather(minor_comm, v_list_range[0], handle.get_stream()); + local_frontier_range_lasts = + host_scalar_allgather(minor_comm, v_list_range[1], handle.get_stream()); } if (key_segment_offsets) { rmm::device_uvector d_key_segment_offsets((*key_segment_offsets).size(), @@ -723,6 +715,38 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } + // update frontier bitmap (used to reduce broadcast bandwidth size) + + std:: + conditional_t>, std::byte /* dummy */> + frontier_bitmap{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_frontier_sizes[i]); + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + + constexpr double threshold_ratio = + 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + if (avg_fill_ratio > threshold_ratio) { + frontier_bitmap = + compute_vertex_list_bitmap_info(frontier_key_first, + frontier_key_last, + local_frontier_range_firsts[minor_comm_rank], + local_frontier_range_lasts[minor_comm_rank], + handle.get_stream()); + } + } + } + // set-up stream ppol std::optional> stream_pool_indices{std::nullopt}; @@ -836,11 +860,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(partition_idx)); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); if constexpr (try_bitmap) { std::variant, decltype(frontier_key_first)> v_list{}; - if (use_bitmap_flags[partition_idx]) { + if (frontier_bitmap) { v_list = (static_cast(partition_idx) == minor_comm_rank) ? raft::device_span((*frontier_bitmap).data(), (*frontier_bitmap).size()) @@ -849,13 +872,11 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } else { v_list = frontier_key_first; } - auto bool_size = segment_offsets ? *((*segment_offsets).rbegin() + 1) - : edge_partition.major_range_size(); device_bcast_vertex_list(minor_comm, v_list, get_dataframe_buffer_begin(edge_partition_key_buffer), - edge_partition.major_range_first(), - edge_partition.major_range_first() + bool_size, + local_frontier_range_firsts[partition_idx], + local_frontier_range_lasts[partition_idx], local_frontier_sizes[partition_idx], static_cast(partition_idx), loop_stream); @@ -1131,20 +1152,21 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - auto tmp_buffer_size = tmp_buffer_sizes[j]; + auto tmp_buffer_size = tmp_buffer_sizes[j]; if (tmp_buffer_size > 0) { - auto& tmp_key_buffer = output_key_buffers[j]; - auto& tmp_value_buffer = output_value_buffers[j]; + auto& tmp_key_buffer = output_key_buffers[j]; + auto& tmp_value_buffer = output_value_buffers[j]; - resize_optional_dataframe_buffer(tmp_key_buffer, tmp_buffer_size, loop_stream); - shrink_to_fit_optional_dataframe_buffer(tmp_key_buffer, loop_stream); + resize_optional_dataframe_buffer( + tmp_key_buffer, tmp_buffer_size, loop_stream); + shrink_to_fit_optional_dataframe_buffer(tmp_key_buffer, loop_stream); - resize_optional_dataframe_buffer( - tmp_value_buffer, tmp_buffer_size, loop_stream); - shrink_to_fit_optional_dataframe_buffer(tmp_value_buffer, loop_stream); + resize_optional_dataframe_buffer( + tmp_value_buffer, tmp_buffer_size, loop_stream); + shrink_to_fit_optional_dataframe_buffer(tmp_value_buffer, loop_stream); - key_buffers.push_back(std::move(tmp_key_buffer)); - value_buffers.push_back(std::move(tmp_value_buffer)); + key_buffers.push_back(std::move(tmp_key_buffer)); + value_buffers.push_back(std::move(tmp_value_buffer)); } } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -1180,8 +1202,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto value_buffer = allocate_optional_dataframe_buffer(0, handle.get_stream()); if (key_buffers.size() == 0) { /* nothing to do */ - } - else if (key_buffers.size() == 1) { + } else if (key_buffers.size() == 1) { key_buffer = std::move(key_buffers[0]); value_buffer = std::move(value_buffers[0]); } else { diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index eea8b3ccdec..753fbc05157 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1647,43 +1647,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // 5. compute optional bitmap info - - std:: - conditional_t>, std::byte /* dummy */> - key_list_bitmap{}; - std::conditional_t, std::byte /* dummy */> v_list_range{}; - if constexpr (try_bitmap) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - v_list_range = {vertex_t{0}, vertex_t{0}}; - - if (minor_comm_size > 1) { - auto v_list_size = - static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); - if (v_list_size > 0) { - rmm::device_uvector tmps(2, handle.get_stream()); - thrust::tabulate(handle.get_thrust_policy(), - tmps.begin(), - tmps.end(), - [sorted_unique_key_first, v_list_size] __device__(size_t i) { - return (i == 0) ? *sorted_unique_key_first - : (*(sorted_unique_key_first + (v_list_size - 1)) + 1); - }); - raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); - handle.sync_stream(); - } - - key_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, - sorted_unique_nzd_key_last, - v_list_range[0], - v_list_range[1], - handle.get_stream()); - } - } - - // 6. collect local_key_list_sizes & use_bitmap_flags & key_segment_offsets + // 5. 
collect local_key_list_sizes & local_key_list_range_firsts & local_key_list_range_lasts & + // key_segment_offsets std::conditional_t, std::byte /* dummy */> local_key_list_sizes{}; @@ -1691,7 +1656,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, local_key_list_range_firsts{}; std::conditional_t, std::byte /* dummy */> local_key_list_range_lasts{}; - std::conditional_t, std::byte /* dummy */> use_bitmap_flags{}; std::conditional_t>>, std::byte /* dummy */> @@ -1705,16 +1669,25 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), handle.get_stream()); if constexpr (try_bitmap) { + std::array v_list_range = {vertex_t{0}, vertex_t{0}}; + auto v_list_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + if (v_list_size > 0) { + rmm::device_uvector tmps(2, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + tmps.begin(), + tmps.end(), + [sorted_unique_key_first, v_list_size] __device__(size_t i) { + return (i == 0) ? *sorted_unique_key_first + : (*(sorted_unique_key_first + (v_list_size - 1)) + 1); + }); + raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); + handle.sync_stream(); + } local_key_list_range_firsts = host_scalar_allgather(minor_comm, v_list_range[0], handle.get_stream()); local_key_list_range_lasts = host_scalar_allgather(minor_comm, v_list_range[1], handle.get_stream()); - auto tmp_flags = host_scalar_allgather( - minor_comm, key_list_bitmap ? uint8_t{1} : uint8_t{0}, handle.get_stream()); - use_bitmap_flags.resize(tmp_flags.size()); - std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { - return flag == uint8_t{1}; - }); } if (key_segment_offsets) { rmm::device_uvector d_key_segment_offsets((*key_segment_offsets).size(), @@ -1758,6 +1731,38 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } + // 6. compute optional bitmap info + + std:: + conditional_t>, std::byte /* dummy */> + key_list_bitmap{}; + if constexpr (try_bitmap) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + auto const minor_comm_rank = minor_comm.get_rank(); + double avg_fill_ratio{0.0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto num_keys = static_cast(local_key_list_sizes[i]); + auto range_size = local_key_list_range_lasts[i] - local_key_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(minor_comm_size); + + constexpr double threshold_ratio = + 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + if (avg_fill_ratio > threshold_ratio) { + key_list_bitmap = + compute_vertex_list_bitmap_info(sorted_unique_key_first, + sorted_unique_nzd_key_last, + local_key_list_range_firsts[minor_comm_rank], + local_key_list_range_lasts[minor_comm_rank], + handle.get_stream()); + } + } + } + // 7. set-up stream pool std::optional> stream_pool_indices{std::nullopt}; @@ -1881,7 +1886,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (try_bitmap) { std::variant, decltype(sorted_unique_key_first)> v_list{}; - if (use_bitmap_flags[partition_idx]) { + if (key_list_bitmap) { v_list = (static_cast(partition_idx) == minor_comm_rank) ? 
raft::device_span((*key_list_bitmap).data(), (*key_list_bitmap).size()) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index b30397f5d8c..74ecb9e256e 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -342,26 +342,34 @@ void fill_edge_minor_property(raft::handle_t const& handle, // !edge_partition_keys.has_value() && v_list_bitmap.has_value()) } - auto v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, - sorted_unique_vertex_last, - v_list_range[0], - v_list_range[1], - handle.get_stream()); - - std::vector use_bitmap_flags(major_comm_size, false); - { - auto tmp_flags = host_scalar_allgather( - major_comm, v_list_bitmap ? uint8_t{1} : uint8_t{0}, handle.get_stream()); - std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { - return flag == uint8_t{1}; - }); - } auto local_v_list_sizes = host_scalar_allgather(major_comm, v_list_size, handle.get_stream()); auto local_v_list_range_firsts = host_scalar_allgather(major_comm, v_list_range[0], handle.get_stream()); auto local_v_list_range_lasts = host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); + std::optional> v_list_bitmap{std::nullopt}; + if (major_comm_size > 1) { + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + + constexpr double threshold_ratio = + 0.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + if (avg_fill_ratio > threshold_ratio) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } + auto num_concurrent_bcasts = (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * sizeof(vertex_t)) / @@ -372,7 +380,10 @@ void fill_edge_minor_property(raft::handle_t const& handle, num_concurrent_bcasts = std::min(num_concurrent_bcasts, handle.get_stream_pool_size()); num_concurrent_bcasts = std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); - std::cout << comm.get_rank() << ":" << " v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl; + std::cout << comm.get_rank() << ":" + << " v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," + << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() + << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl; std::optional> stream_pool_indices{std::nullopt}; if (num_concurrent_bcasts > 1) { @@ -393,18 +404,21 @@ void fill_edge_minor_property(raft::handle_t const& handle, auto loop_count = std::min(num_concurrent_bcasts, static_cast(major_comm_size) - i); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); if (is_packed_bool() && - !edge_partition_keys && use_bitmap_flags[partition_idx]) { + !edge_partition_keys && v_list_bitmap) { rmm::device_uvector rx_bitmap( packed_bool_size(local_v_list_range_lasts[partition_idx] - local_v_list_range_firsts[partition_idx]), loop_stream); device_bcast(major_comm, - (static_cast(partition_idx) == major_comm_rank) ? (*v_list_bitmap).data() - : static_cast(nullptr), + (static_cast(partition_idx) == major_comm_rank) + ? (*v_list_bitmap).data() + : static_cast(nullptr), rx_bitmap.data(), rx_bitmap.size(), partition_idx, @@ -436,17 +450,17 @@ void fill_edge_minor_property(raft::handle_t const& handle, } }); } else { - rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], - loop_stream); + rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], loop_stream); // FIXME: these broadcast operations can be placed between ncclGroupStart() and // ncclGroupEnd() std::variant, decltype(sorted_unique_vertex_first)> v_list{}; - if (use_bitmap_flags[partition_idx]) { - v_list = (static_cast(partition_idx) == major_comm_rank) ? raft::device_span( - (*v_list_bitmap).data(), (*v_list_bitmap).size()) - : raft::device_span( - static_cast(nullptr), size_t{0}); + if (v_list_bitmap) { + v_list = (static_cast(partition_idx) == major_comm_rank) + ? raft::device_span((*v_list_bitmap).data(), + (*v_list_bitmap).size()) + : raft::device_span(static_cast(nullptr), + size_t{0}); } else { v_list = sorted_unique_vertex_first; } diff --git a/cpp/src/prims/update_edge_src_dst_property.cuh b/cpp/src/prims/update_edge_src_dst_property.cuh index f95928520ab..2f842f710ca 100644 --- a/cpp/src/prims/update_edge_src_dst_property.cuh +++ b/cpp/src/prims/update_edge_src_dst_property.cuh @@ -495,7 +495,8 @@ void update_edge_minor_property(raft::handle_t const& handle, (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * sizeof(vertex_t)) / std::max(bcast_size, size_t{1}); - num_concurrent_bcasts = std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); + num_concurrent_bcasts = + std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); auto num_rounds = (static_cast(major_comm_size) + num_concurrent_bcasts - size_t{1}) / num_concurrent_bcasts; @@ -731,27 +732,34 @@ void update_edge_minor_property(raft::handle_t const& handle, handle.sync_stream(); } - auto v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, - sorted_unique_vertex_last, - v_list_range[0], - v_list_range[1], - handle.get_stream()); - - std::vector use_bitmap_flags(major_comm_size, false); - { - auto tmp_flags = host_scalar_allgather( - major_comm, v_list_bitmap ? 
uint8_t{1} : uint8_t{0}, handle.get_stream()); - std::transform(tmp_flags.begin(), tmp_flags.end(), use_bitmap_flags.begin(), [](auto flag) { - return flag == uint8_t{1}; - }); - } - auto local_v_list_sizes = host_scalar_allgather(major_comm, v_list_size, handle.get_stream()); auto local_v_list_range_firsts = host_scalar_allgather(major_comm, v_list_range[0], handle.get_stream()); auto local_v_list_range_lasts = host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); + std::optional> v_list_bitmap{std::nullopt}; + if (major_comm_size > 1) { + double avg_fill_ratio{0.0}; + for (int i = 0; i < major_comm_size; ++i) { + auto num_keys = static_cast(local_v_list_sizes[i]); + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + avg_fill_ratio += + (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + } + avg_fill_ratio /= static_cast(major_comm_size); + + constexpr double threshold_ratio = + 0.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + if (avg_fill_ratio > threshold_ratio) { + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } + std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { key_offsets = graph_view.local_sorted_unique_edge_src_vertex_partition_offsets(); @@ -814,7 +822,7 @@ void update_edge_minor_property(raft::handle_t const& handle, // ncclGroupEnd() std::variant, decltype(sorted_unique_vertex_first)> v_list{}; - if (use_bitmap_flags[i]) { + if (v_list_bitmap) { v_list = (i == major_comm_rank) ? raft::device_span((*v_list_bitmap).data(), (*v_list_bitmap).size()) diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index f92aec680a9..f30ba5693e8 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -126,7 +126,7 @@ std::vector compute_key_segment_offsets(KeyIterator sorted_key_first, } template -std::optional> compute_vertex_list_bitmap_info( +rmm::device_uvector compute_vertex_list_bitmap_info( VertexIterator sorted_unique_vertex_first, VertexIterator sorted_unique_vertex_last, typename thrust::iterator_traits::value_type vertex_range_first, @@ -135,33 +135,21 @@ std::optional> compute_vertex_list_bitmap_info( { using vertex_t = typename thrust::iterator_traits::value_type; - constexpr double threshold_ratio = - 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); - - std::optional> bitmap{std::nullopt}; - - auto v_list_size = - static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); - auto bool_size = vertex_range_last - vertex_range_first; - - if (v_list_size > static_cast(bool_size * threshold_ratio)) { - bitmap = rmm::device_uvector(packed_bool_size(bool_size), stream_view); - thrust::fill(rmm::exec_policy_nosync(stream_view), - (*bitmap).begin(), - (*bitmap).end(), - packed_bool_empty_mask()); - thrust::for_each(rmm::exec_policy_nosync(stream_view), - sorted_unique_vertex_first, - sorted_unique_vertex_last, - [bitmap = raft::device_span((*bitmap).data(), (*bitmap).size()), - v_first = vertex_range_first] __device__(vertex_t v) { - auto v_offset = v - v_first; - cuda::atomic_ref word( - bitmap[packed_bool_offset(v_offset)]); - word.fetch_or(cugraph::packed_bool_mask(v_offset), - cuda::std::memory_order_relaxed); - }); - } + auto bitmap = rmm::device_uvector( + 
packed_bool_size(vertex_range_last - vertex_range_first), stream_view); + thrust::fill( + rmm::exec_policy_nosync(stream_view), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + thrust::for_each(rmm::exec_policy_nosync(stream_view), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + v_first = vertex_range_first] __device__(vertex_t v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(cugraph::packed_bool_mask(v_offset), + cuda::std::memory_order_relaxed); + }); return bitmap; } From 6c9118eab36ce5679ffb9dcb9e0547be512e50f3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 14 Sep 2024 20:16:32 -0700 Subject: [PATCH 078/126] undo a flag --- cpp/include/cugraph/partition_manager.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp index 6542714b824..e3bb699f00d 100644 --- a/cpp/include/cugraph/partition_manager.hpp +++ b/cpp/include/cugraph/partition_manager.hpp @@ -42,7 +42,7 @@ class partition_manager { // partitioning along the major axis (major sub-communicator is responsible for this) and along // the minor axis (minor sub-communicator is responsible for this). This variable controls whether // to map the major sub-communicator to the GPU row communicator or the GPU column communicator. - static constexpr bool map_major_comm_to_gpu_row_comm = false; // FIXME: this is for benchmarking, reset to true before merging + static constexpr bool map_major_comm_to_gpu_row_comm = true; #ifdef __CUDACC__ __host__ __device__ From 20721e66533a1c3635836a487beed4ad954b4fda Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 19 Sep 2024 15:53:49 -0700 Subject: [PATCH 079/126] peak memory usage --- .../cugraph/utilities/dataframe_buffer.hpp | 11 + cpp/src/prims/fill_edge_src_dst_property.cuh | 168 +++- cpp/src/prims/vertex_frontier.cuh | 27 + .../create_graph_from_edgelist_impl.cuh | 800 ++++++++++-------- cpp/src/structure/detail/structure_utils.cuh | 136 ++- cpp/src/structure/renumber_edgelist_impl.cuh | 252 +++--- 6 files changed, 856 insertions(+), 538 deletions(-) diff --git a/cpp/include/cugraph/utilities/dataframe_buffer.hpp b/cpp/include/cugraph/utilities/dataframe_buffer.hpp index 5d839d22fc5..6d47ec540da 100644 --- a/cpp/include/cugraph/utilities/dataframe_buffer.hpp +++ b/cpp/include/cugraph/utilities/dataframe_buffer.hpp @@ -90,6 +90,17 @@ struct dataframe_buffer_type { template using dataframe_buffer_type_t = typename dataframe_buffer_type::type; +template +std::optional> try_allocate_dataframe_buffer( + size_t buffer_size, rmm::cuda_stream_view stream_view) +{ + try { + return allocate_dataframe_buffer(buffer_size, stream_view); + } catch (std::exception const& e) { + return std::nullopt; + } +} + template struct dataframe_buffer_iterator_type { using type = typename rmm::device_uvector::iterator; diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 74ecb9e256e..553dfd521a6 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -298,6 +298,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, EdgeMinorPropertyOutputWrapper edge_minor_property_output, T input) { +RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t0 = std::chrono::steady_clock::now(); constexpr bool contains_packed_bool_element = 
cugraph::has_packed_bool_element(); @@ -360,7 +361,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, avg_fill_ratio /= static_cast(major_comm_size); constexpr double threshold_ratio = - 0.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); if (avg_fill_ratio > threshold_ratio) { v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, sorted_unique_vertex_last, @@ -397,11 +398,169 @@ void fill_edge_minor_property(raft::handle_t const& handle, } else { key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); } - if (stream_pool_indices) { handle.sync_stream(); } +RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t1 = std::chrono::steady_clock::now(); auto edge_partition_keys = edge_minor_property_output.keys(); for (size_t i = 0; i < static_cast(major_comm_size); i += num_concurrent_bcasts) { +RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub0 = std::chrono::steady_clock::now(); auto loop_count = std::min(num_concurrent_bcasts, static_cast(major_comm_size) - i); + + std::vector, rmm::device_uvector>> + edge_partition_key_buffers{}; + std::vector> edge_partition_dummy_counter_scalars{}; + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + std::variant, rmm::device_uvector> key_buffer = + rmm::device_uvector(0, handle.get_stream()); + if (v_list_bitmap) { + key_buffer = rmm::device_uvector( + packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); + } else { + std::get<0>(key_buffer).resize(local_v_list_sizes[partition_idx], handle.get_stream()); + } + edge_partition_key_buffers.push_back(std::move(key_buffer)); + edge_partition_dummy_counter_scalars.push_back( + rmm::device_scalar(size_t{0}, handle.get_stream())); + } +RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub1 = std::chrono::steady_clock::now(); + + device_group_start(major_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto& key_buffer = edge_partition_key_buffers[j]; + if (v_list_bitmap) { + device_bcast(major_comm, + (static_cast(partition_idx) == major_comm_rank) + ? (*v_list_bitmap).data() + : static_cast(nullptr), + std::get<1>(key_buffer).data(), + std::get<1>(key_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(major_comm, + sorted_unique_vertex_first, + std::get<0>(key_buffer).data(), + std::get<0>(key_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(major_comm); + if (stream_pool_indices) { handle.sync_stream(); } +RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub2 = std::chrono::steady_clock::now(); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { + auto const& rx_bitmap = std::get<1>(edge_partition_key_buffers[j]); + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(rx_bitmap.size()), + [input, + output_value_first = + edge_partition_value_first + + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first), + rx_bitmap = raft::device_span(rx_bitmap.data(), + rx_bitmap.size())] __device__(size_t i) { + if ((i == 0) || (i == (rx_bitmap.size() - 1))) { // first or last + cuda::atomic_ref word( + *(output_value_first + i)); + if (input) { + word.fetch_or(rx_bitmap[i], cuda::std::memory_order_relaxed); + } else { + word.fetch_and(~rx_bitmap[i], cuda::std::memory_order_relaxed); + } + } else { + if (input) { + *(output_value_first + i) |= rx_bitmap[i]; + } else { + *(output_value_first + i) &= ~rx_bitmap[i]; + } + } + }); + } else { + if (v_list_bitmap) { + auto const& rx_bitmap = std::get<1>(edge_partition_key_buffers[j]); + rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], loop_stream); + rmm::device_scalar dummy(size_t{0}, loop_stream); + retrieve_vertex_list_from_bitmap(raft::device_span(rx_bitmap.data(), rx_bitmap.size()), rx_vertices.begin(), raft::device_span(dummy.data(), size_t{1}), local_v_list_range_firsts[partition_idx], local_v_list_range_lasts[partition_idx], loop_stream); + edge_partition_key_buffers[j] = std::move(rx_vertices); + } + auto const& rx_vertices = std::get<0>(edge_partition_key_buffers[j]); + if (edge_partition_keys) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [rx_vertex_first = rx_vertices.begin(), + input, + subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[partition_idx], + subrange_key_last = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], + edge_partition_value_first = edge_partition_value_first, + subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { + auto minor = *(rx_vertex_first + i); + auto it = + thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); + if ((it != subrange_key_last) && (*it == minor)) { + auto subrange_offset = thrust::distance(subrange_key_first, it); + if constexpr (contains_packed_bool_element) { + fill_scalar_or_thrust_tuple( + edge_partition_value_first, subrange_start_offset + subrange_offset, input); + } else { + *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; + } + } + }); + } else { + if constexpr (contains_packed_bool_element) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator( + static_cast(local_v_list_sizes[partition_idx])), + [minor_range_first, + rx_vertex_first = rx_vertices.begin(), + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + auto rx_vertex = *(rx_vertex_first + i); + auto minor_offset = rx_vertex - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + auto map_first = thrust::make_transform_iterator( + rx_vertices.begin(), + cuda::proclaim_return_type( + [minor_range_first] __device__(auto v) { return v - minor_range_first; })); + auto val_first = 
thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } + } + } + } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub3 = std::chrono::steady_clock::now(); +std::chrono::duration subdur0 = sub1 - sub0; +std::chrono::duration subdur1 = sub2 - sub1; +std::chrono::duration subdur2 = sub3 - sub2; +std::cout << comm.get_rank() << ":sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << ")" << std::endl; +#if 0 for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; auto loop_stream = stream_pool_indices @@ -529,7 +688,12 @@ void fill_edge_minor_property(raft::handle_t const& handle, } } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#endif } +RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t2 = std::chrono::steady_clock::now(); +std::chrono::duration dur0 = t1 - t0; +std::chrono::duration dur1 = t2 - t1; +std::cout << comm.get_rank() << ":fill_edge_minor took (" << dur0.count() << "," << dur1.count() << ")" << std::endl; } else { assert(graph_view.local_vertex_partition_range_size() == graph_view.local_edge_partition_src_range_size()); diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index f30ba5693e8..80ab3e80852 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -196,6 +196,33 @@ void device_bcast_vertex_list( } } +template +void retrieve_vertex_list_from_bitmap( + raft::device_span bitmap, + OutputVertexIterator output_v_first, + raft::device_span count /* size = 1 */, + typename thrust::iterator_traits::value_type vertex_range_first, + typename thrust::iterator_traits::value_type vertex_range_last, + rmm::cuda_stream_view stream_view) +{ + using vertex_t = typename thrust::iterator_traits::value_type; + + assert((comm.get_rank() != root) || (bitmap.size() == packed_bool_size(vertex_range_last - vertex_ragne_first))); + detail::copy_if_nosync( + thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [bitmap] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & packed_bool_mask(v_offset)) != + packed_bool_empty_mask()); + })), + output_v_first, + count, + stream_view); +} + // key type is either vertex_t (tag_t == void) or thrust::tuple (tag_t != void) // if sorted_unique is true, stores unique key objects in the sorted (non-descending) order. // if false, there can be duplicates and the elements may not be sorted. 
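[Editorial sketch] The hunks above replace the earlier per-rank use_bitmap_flags allgather with a fill-ratio heuristic: each rank allgathers its key-list size and vertex range, every rank computes the average fill ratio (keys per vertex in the range), and only if that ratio exceeds threshold_ratio = 8.0 / (sizeof(vertex_t) * 8) is a packed-bool bitmap built and broadcast instead of the raw vertex list. Since a bitmap costs one bit per vertex in the range while the raw list costs sizeof(vertex_t) bytes per key, the two break even at a fill ratio of 1 / (8 * sizeof(vertex_t)); the 8.0 tuning factor appears to require the list to be roughly 8x denser than break-even before paying the receiver-side cost of rebuilding the vertex list from the bitmap. The following is a minimal, self-contained sketch of that decision under these assumptions; the helper name use_bitmap_broadcast is hypothetical and not a cuGraph API.

  #include <cstddef>
  #include <vector>

  // Decide whether broadcasting a packed-bool bitmap is cheaper than broadcasting
  // the raw (sorted, unique) vertex list, based on the average fill ratio across ranks.
  template <typename vertex_t>
  bool use_bitmap_broadcast(std::vector<size_t> const& local_list_sizes,
                            std::vector<vertex_t> const& local_range_firsts,
                            std::vector<vertex_t> const& local_range_lasts)
  {
    auto comm_size = local_list_sizes.size();
    double avg_fill_ratio{0.0};
    for (size_t i = 0; i < comm_size; ++i) {
      auto num_keys   = static_cast<double>(local_list_sizes[i]);
      auto range_size = local_range_lasts[i] - local_range_firsts[i];
      avg_fill_ratio +=
        (range_size > 0) ? (num_keys / static_cast<double>(range_size)) : double{0.0};
    }
    avg_fill_ratio /= static_cast<double>(comm_size);

    // Bitmap: 1 bit per vertex in [range_first, range_last); list: sizeof(vertex_t) bytes
    // per key.  Break-even fill ratio is 1 / (8 * sizeof(vertex_t)); the 8.0 factor is the
    // same tuning parameter used in the hunks above.
    constexpr double threshold_ratio = 8.0 / static_cast<double>(sizeof(vertex_t) * 8);
    return avg_fill_ratio > threshold_ratio;
  }

Note that fill_edge_minor_property temporarily uses a 0.0 threshold (always broadcast the bitmap when major_comm_size > 1) in this commit; a later commit in the series restores the 8.0 tuning factor there as well.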
diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index b83001c377a..1539abcf3c9 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -300,6 +300,121 @@ bool check_no_parallel_edge(raft::handle_t const& handle, (org_edge_first + edgelist_srcs.size()); } +template +std::vector> +split_edge_chunk_compressed_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_compressed_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors, + size_t element_size) +{ + auto num_chunks = edgelist_compressed_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_compressed_elements{}; + edge_partition_compressed_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < num_edge_partitions; ++i) { + edge_partition_compressed_elements.push_back(rmm::device_uvector( + edge_partition_edge_counts[i] * element_size, handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy( + handle.get_thrust_policy(), + edgelist_compressed_elements[k].begin() + segment_offset * element_size, + edgelist_compressed_elements[k].begin() + (segment_offset + segment_size) * element_size, + edge_partition_compressed_elements[i].begin() + output_offset * element_size); + } + } + } + edgelist_compressed_elements.clear(); + + return edge_partition_compressed_elements; +} + +template +std::vector> split_edge_chunk_elements_to_local_edge_partitions( + raft::handle_t const& handle, + std::vector>&& edgelist_elements, + std::vector> const& edgelist_edge_offset_vectors, + std::vector const& edge_partition_edge_counts, + std::vector> const& edge_partition_intra_partition_segment_offset_vectors, + std::vector> const& + edge_partition_intra_segment_copy_output_displacement_vectors) +{ + static_assert(std::is_arithmetic_v); // otherwise, unimplemented. 
+ auto num_chunks = edgelist_elements.size(); + auto num_edge_partitions = edge_partition_edge_counts.size(); + auto num_segments = edge_partition_intra_partition_segment_offset_vectors[0].size() - 1; + for (size_t i = 0; i < edge_partition_intra_partition_segment_offset_vectors.size(); ++i) { + assert(edge_partition_intra_partition_segment_offset_vectors[i].size() == (num_segments + 1)); + } + + std::vector> edge_partition_elements{}; + edge_partition_elements.reserve(num_edge_partitions); + for (size_t i = 0; i < num_edge_partitions; ++i) { + edge_partition_elements.push_back( + rmm::device_uvector(edge_partition_edge_counts[i], handle.get_stream())); + } + + for (size_t i = 0; i < num_edge_partitions; ++i) { + for (size_t j = 0; j < num_segments; ++j) { + for (size_t k = 0; k < num_chunks; ++k) { + auto segment_offset = edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto segment_size = edgelist_edge_offset_vectors[k][i * num_segments + j + 1] - + edgelist_edge_offset_vectors[k][i * num_segments + j]; + auto output_offset = + edge_partition_intra_partition_segment_offset_vectors[i][j] + + edge_partition_intra_segment_copy_output_displacement_vectors[i][j * num_chunks + k]; + thrust::copy(handle.get_thrust_policy(), + edgelist_elements[k].begin() + segment_offset, + edgelist_elements[k].begin() + (segment_offset + segment_size), + edge_partition_elements[i].begin() + output_offset); + } + } + } + edgelist_elements.clear(); + + return edge_partition_elements; +} + +template +void decompress_vertices(raft::handle_t const& handle, + raft::device_span compressed_vertices, + raft::device_span vertices, + size_t compressed_v_size) +{ + auto input_v_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [byte_first = compressed_vertices.begin(), compressed_v_size] __device__(size_t i) { + uint64_t v{0}; + for (size_t j = 0; j < compressed_v_size; ++j) { + auto b = *(byte_first + i * compressed_v_size + j); + v |= static_cast(b) << (8 * j); + } + return static_cast(v); + })); + thrust::copy( + handle.get_thrust_policy(), input_v_first, input_v_first + vertices.size(), vertices.begin()); +} + template (static_cast(total_global_mem / element_size) * mem_frugal_ratio); @@ -549,6 +663,7 @@ create_graph_from_partitioned_edgelist( mem_frugal_threshold, handle.get_stream()); } else { +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank << ":create_graph_from_partitioned 2-1 i=" << i << std::endl; std::forward_as_tuple(offsets, indices, dcs_nzd_vertices) = detail::sort_and_compress_edgelist( std::move(edge_partition_edgelist_srcs[i]), @@ -560,6 +675,7 @@ create_graph_from_partitioned_edgelist( minor_range_last, mem_frugal_threshold, handle.get_stream()); +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank << ":create_graph_from_partitioned 2-2 i=" << i << std::endl; } } } @@ -1046,68 +1162,71 @@ create_graph_from_edgelist_impl( auto num_chunks = edgelist_srcs.size(); - // 1. check whether we can temporarily store a 64 bit vertex ID in 48 bit. + // 1. 
set whether to temporarily compress vertex IDs or not in splitting edge chunks - bool store_v_in_48bit = false; // if set to true, temporarily store vertex IDs in 48 bit + size_t compressed_v_size = + sizeof(vertex_t); // if set to a value smaller than sizeof(vertex_t), temporarily store vertex + // IDs in compressed_v_size byte variables static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); if constexpr (sizeof(vertex_t) == 8) { // 64 bit vertex ID static_assert(std::is_signed_v); // __clzll takes a signed integer - size_t min_clz{sizeof(vertex_t) * 8}; - for (size_t i = 0; i < num_chunks; ++i) { - min_clz = thrust::transform_reduce(handle.get_thrust_policy(), - edgelist_srcs[i].begin(), - edgelist_srcs[i].end(), - cuda::proclaim_return_type([] __device__(auto v) { - return static_cast(__clzll(v)); - }), - min_clz, - thrust::minimum{}); - min_clz = thrust::transform_reduce(handle.get_thrust_policy(), - edgelist_dsts[i].begin(), - edgelist_dsts[i].end(), - cuda::proclaim_return_type([] __device__(auto v) { - return static_cast(__clzll(v)); - }), - min_clz, - thrust::minimum{}); + + auto total_global_mem = handle.get_device_properties().totalGlobalMem; + size_t element_size = sizeof(vertex_t) * 2; + if (edgelist_weights) { element_size += sizeof(weight_t); } + if (edgelist_edge_ids) { element_size += sizeof(edge_id_t); } + if (edgelist_edge_types) { element_size += sizeof(edge_type_t); } + edge_t num_edges{0}; + for (size_t i = 0; i < edgelist_srcs.size(); ++i) { + num_edges += edgelist_srcs[i].size(); } - if (min_clz >= 16) { - store_v_in_48bit = true; + bool compress{false}; + if (static_cast(num_edges) * element_size > + static_cast(total_global_mem * 0.5 /* tuning parameter */)) { + compress = true; } - std::cout << "min_clz=" << min_clz << std::endl; - } - // 2. groupby each edge chunks to their target local adjacency matrix partition (and further - // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex - // IDs). + if (compress) { + size_t min_clz{sizeof(vertex_t) * 8}; + for (size_t i = 0; i < num_chunks; ++i) { + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_srcs[i].begin(), + edgelist_srcs[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + min_clz = + thrust::transform_reduce(handle.get_thrust_policy(), + edgelist_dsts[i].begin(), + edgelist_dsts[i].end(), + cuda::proclaim_return_type([] __device__(auto v) { + return static_cast(__clzll(v)); + }), + min_clz, + thrust::minimum{}); + } + compressed_v_size = sizeof(vertex_t) - (min_clz / 8); + compressed_v_size = std::max( + compressed_v_size, size_t{5}); // FIXME: max(compressed_v_size, size_t{1}) is sufficient, + // but we need to check whether this works at scale 40 + } + } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank - << ":create_graph_from_edgelist_impl 1 store_v_in_48bit=" << store_v_in_48bit + << ":create_graph_from_edgelist_impl 1 compressed_v_size=" << compressed_v_size << std::endl; #endif - std::vector> edgelist_edge_offset_vectors(num_chunks); - std::vector< - std::vector, rmm::device_uvector>>> - edgelist_partitioned_srcs(num_chunks); - std::vector< - std::vector, rmm::device_uvector>>> - edgelist_partitioned_dsts(num_chunks); - auto edgelist_partitioned_weights = - edgelist_weights - ? std::make_optional>>>(num_chunks) - : std::nullopt; - auto edgelist_partitioned_edge_ids = - edgelist_edge_ids - ? 
std::make_optional>>>(num_chunks) - : std::nullopt; - auto edgelist_partitioned_edge_types = - edgelist_edge_types - ? std::make_optional>>>(num_chunks) - : std::nullopt; + // 2. groupby each edge chunks to their target local adjacency matrix partition (and further + // groupby within the local partition by applying the compute_gpu_id_from_vertex_t to minor vertex + // IDs). + std::vector> edgelist_edge_offset_vectors(num_chunks); for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks std::optional> this_chunk_weights{std::nullopt}; if (edgelist_weights) { this_chunk_weights = std::move((*edgelist_weights)[i]); } @@ -1124,6 +1243,9 @@ create_graph_from_edgelist_impl( this_chunk_edge_ids, this_chunk_edge_types, true); + if (this_chunk_weights) { (*edgelist_weights)[i] = std::move(*this_chunk_weights); } + if (this_chunk_edge_ids) { (*edgelist_edge_ids)[i] = std::move(*this_chunk_edge_ids); } + if (this_chunk_edge_types) { (*edgelist_edge_types)[i] = std::move(*this_chunk_edge_types); } std::vector h_this_chunk_edge_counts(d_this_chunk_edge_counts.size()); raft::update_host(h_this_chunk_edge_counts.data(), @@ -1131,175 +1253,82 @@ create_graph_from_edgelist_impl( d_this_chunk_edge_counts.size(), handle.get_stream()); handle.sync_stream(); - std::vector h_this_chunk_edge_offsets( + std::vector h_this_chunk_edge_offsets( h_this_chunk_edge_counts.size() + 1, 0); // size = minor_comm_size (# local edge partitions) * major_comm_size (# segments in the // local minor range) std::inclusive_scan(h_this_chunk_edge_counts.begin(), h_this_chunk_edge_counts.end(), h_this_chunk_edge_offsets.begin() + 1); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { - auto partition_size = h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - - h_this_chunk_edge_offsets[j * major_comm_size]; - std::variant, rmm::device_uvector> tmp_srcs = - rmm::device_uvector(0, handle.get_stream()); - if (store_v_in_48bit) { - tmp_srcs = rmm::device_uvector(partition_size * 3, handle.get_stream()); - auto input_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [src_first = edgelist_srcs[i].begin() + - h_this_chunk_edge_offsets[j * major_comm_size]] __device__(size_t i) { - auto v = static_cast(*(src_first + (i / 3))); - return static_cast((v >> (16 * (i % 3))) & uint64_t{0xffff}); - })); - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + partition_size * 3, - std::get<1>(tmp_srcs).begin()); - } else { - std::get<0>(tmp_srcs).resize(partition_size, handle.get_stream()); - auto input_first = - edgelist_srcs[i].begin() + h_this_chunk_edge_offsets[j * major_comm_size]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + partition_size, - std::get<0>(tmp_srcs).begin()); - } - edgelist_partitioned_srcs[i].push_back(std::move(tmp_srcs)); - } - edgelist_srcs[i].resize(0, handle.get_stream()); - edgelist_srcs[i].shrink_to_fit(handle.get_stream()); - - for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { - auto partition_size = h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - - h_this_chunk_edge_offsets[j * major_comm_size]; - std::variant, rmm::device_uvector> tmp_dsts = - rmm::device_uvector(0, handle.get_stream()); - if (store_v_in_48bit) { - tmp_dsts = rmm::device_uvector(partition_size * 3, handle.get_stream()); - auto input_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - 
[dst_first = edgelist_dsts[i].begin() + - h_this_chunk_edge_offsets[j * major_comm_size]] __device__(size_t i) { - auto v = static_cast(*(dst_first + (i / 3))); - return static_cast((v >> (16 * (i % 3))) & uint64_t{0xffff}); - })); - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + partition_size * 3, - std::get<1>(tmp_dsts).begin()); - } else { - std::get<0>(tmp_dsts).resize(partition_size, handle.get_stream()); - auto input_first = - edgelist_dsts[i].begin() + h_this_chunk_edge_offsets[j * major_comm_size]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + partition_size, - std::get<0>(tmp_dsts).begin()); - } - edgelist_partitioned_dsts[i].push_back(std::move(tmp_dsts)); - } - edgelist_dsts[i].resize(0, handle.get_stream()); - edgelist_dsts[i].shrink_to_fit(handle.get_stream()); - - if (this_chunk_weights) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { - rmm::device_uvector tmp_weights( - h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - - h_this_chunk_edge_offsets[j * major_comm_size], - handle.get_stream()); - auto input_first = - (*this_chunk_weights).begin() + h_this_chunk_edge_offsets[j * major_comm_size]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_weights.size(), - tmp_weights.begin()); - (*edgelist_partitioned_weights)[i].push_back(std::move(tmp_weights)); - } - (*this_chunk_weights).resize(0, handle.get_stream()); - (*this_chunk_weights).shrink_to_fit(handle.get_stream()); - } - - if (this_chunk_edge_ids) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { - rmm::device_uvector tmp_edge_ids( - h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - - h_this_chunk_edge_offsets[j * major_comm_size], - handle.get_stream()); - auto input_first = - (*this_chunk_edge_ids).begin() + h_this_chunk_edge_offsets[j * major_comm_size]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_ids.size(), - tmp_edge_ids.begin()); - (*edgelist_partitioned_edge_ids)[i].push_back(std::move(tmp_edge_ids)); - } - (*this_chunk_edge_ids).resize(0, handle.get_stream()); - (*this_chunk_edge_ids).shrink_to_fit(handle.get_stream()); - } - - if (this_chunk_edge_types) { - for (int j = 0; j < minor_comm_size /* # local edge partitions */; ++j) { - rmm::device_uvector tmp_edge_types( - h_this_chunk_edge_offsets[(j + 1) * major_comm_size] - - h_this_chunk_edge_offsets[j * major_comm_size], - handle.get_stream()); - auto input_first = - (*this_chunk_edge_types).begin() + h_this_chunk_edge_offsets[j * major_comm_size]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + tmp_edge_types.size(), - tmp_edge_types.begin()); - (*edgelist_partitioned_edge_types)[i].push_back(std::move(tmp_edge_types)); - } - (*this_chunk_edge_types).resize(0, handle.get_stream()); - (*this_chunk_edge_types).shrink_to_fit(handle.get_stream()); - } - edgelist_edge_offset_vectors[i] = std::move(h_this_chunk_edge_offsets); } - edgelist_srcs.clear(); - edgelist_dsts.clear(); - if (edgelist_weights) { (*edgelist_weights).clear(); } - if (edgelist_edge_ids) { (*edgelist_edge_ids).clear(); } - if (edgelist_edge_types) { (*edgelist_edge_types).clear(); } - - // 3. 
split the grouped edge chunks to local partitions #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank << ":create_graph_from_edgelist_impl 2" << std::endl; #endif - auto edgelist_intra_partition_segment_offset_vectors = - std::vector>(minor_comm_size); - - std::vector> edge_partition_edgelist_srcs{}; - edge_partition_edgelist_srcs.reserve(minor_comm_size); - std::vector> edge_partition_edgelist_dsts{}; - edge_partition_edgelist_dsts.reserve(minor_comm_size); - auto edge_partition_edgelist_weights = - edgelist_partitioned_weights ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_weights) { (*edge_partition_edgelist_weights).reserve(minor_comm_size); } - auto edge_partition_edgelist_edge_ids = - edgelist_partitioned_edge_ids - ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_ids) { - (*edge_partition_edgelist_edge_ids).reserve(minor_comm_size); - } - auto edge_partition_edgelist_edge_types = - edgelist_partitioned_edge_types - ? std::make_optional>>() - : std::nullopt; - if (edgelist_partitioned_edge_types) { - (*edge_partition_edgelist_edge_types).reserve(minor_comm_size); + // 3. compress edge chunk source/destination vertices to cut intermediate peak memory requirement + + std::optional>> edgelist_compressed_srcs{std::nullopt}; + std::optional>> edgelist_compressed_dsts{std::nullopt}; + if (compressed_v_size < sizeof(vertex_t)) { + edgelist_compressed_srcs = std::vector>{}; + edgelist_compressed_dsts = std::vector>{}; + (*edgelist_compressed_srcs).reserve(num_chunks); + (*edgelist_compressed_dsts).reserve(num_chunks); + for (size_t i = 0; i < num_chunks; ++i) { // iterate over input edge chunks + // compress source values + auto tmp_srcs = rmm::device_uvector(edgelist_srcs[i].size() * compressed_v_size, + handle.get_stream()); + auto input_src_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [src_first = edgelist_srcs[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(src_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_src_first, + input_src_first + edgelist_srcs[i].size() * compressed_v_size, + tmp_srcs.begin()); + edgelist_srcs[i].resize(0, handle.get_stream()); + edgelist_srcs[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_srcs).push_back(std::move(tmp_srcs)); + + // compress destination values + + auto tmp_dsts = rmm::device_uvector(edgelist_dsts[i].size() * compressed_v_size, + handle.get_stream()); + auto input_dst_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [dst_first = edgelist_dsts[i].begin(), compressed_v_size] __device__(size_t i) { + auto v = static_cast(*(dst_first + (i / compressed_v_size))); + return static_cast((v >> (8 * (i % compressed_v_size))) & uint64_t{0xff}); + })); + thrust::copy(handle.get_thrust_policy(), + input_dst_first, + input_dst_first + edgelist_dsts[i].size() * compressed_v_size, + tmp_dsts.begin()); + edgelist_dsts[i].resize(0, handle.get_stream()); + edgelist_dsts[i].shrink_to_fit(handle.get_stream()); + (*edgelist_compressed_dsts).push_back(std::move(tmp_dsts)); + } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 3" << std::endl; +#endif + + // 4. 
compute additional copy_offset vectors + // FIXME: we can store chunk data in multiple rmm::device_uvector objects to free memory earlier - for (int i = 0; i < minor_comm_size; ++i) { // iterate over local edge partitions + std::vector edge_partition_edge_counts(minor_comm_size); + std::vector> edge_partition_intra_partition_segment_offset_vectors( + minor_comm_size); + std::vector> edge_partition_intra_segment_copy_output_displacement_vectors( + minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { edge_t edge_count{0}; std::vector intra_partition_segment_sizes(major_comm_size, 0); std::vector intra_segment_copy_output_displacements(major_comm_size * num_chunks); @@ -1318,182 +1347,239 @@ create_graph_from_edgelist_impl( std::inclusive_scan(intra_partition_segment_sizes.begin(), intra_partition_segment_sizes.end(), intra_partition_segment_offsets.begin() + 1); + + edge_partition_edge_counts[i] = edge_count; + edge_partition_intra_partition_segment_offset_vectors[i] = + std::move(intra_partition_segment_offsets); + edge_partition_intra_segment_copy_output_displacement_vectors[i] = + std::move(intra_segment_copy_output_displacements); + } #if 1 - std::cout << comm_rank << ": i=" << i << " edge_count=" << edge_count << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 4" << std::endl; #endif - rmm::device_uvector tmp_srcs(edge_count, handle.get_stream()); + // 5. split the grouped edge chunks to local partitions + + std::vector> edge_partition_edgelist_srcs{}; + std::vector> edge_partition_edgelist_dsts{}; + std::optional>> edge_partition_edgelist_weights{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_ids{ + std::nullopt}; + std::optional>> edge_partition_edgelist_edge_types{ + std::nullopt}; + + std::optional>> + edge_partition_edgelist_compressed_srcs{}; + std::optional>> + edge_partition_edgelist_compressed_dsts{}; + + if (compressed_v_size < sizeof(vertex_t)) { + edge_partition_edgelist_compressed_srcs = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); #if 1 - std::cout << comm_rank << ": i=" << i << " tmp_srcs allocated" << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 4-1" << std::endl; #endif - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < num_chunks; ++k) { - auto segment_offset = edgelist_edge_offset_vectors[k][i * major_comm_size + j] - - edgelist_edge_offset_vectors[k][i * major_comm_size]; - auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - if (store_v_in_48bit) { - auto input_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [uint16_first = std::get<1>(edgelist_partitioned_srcs[k][i]).begin() + - segment_offset * 3] __device__(size_t i) { - auto v0 = *(uint16_first + i * 3 + 0); - auto v1 = *(uint16_first + i * 3 + 1); - auto v2 = *(uint16_first + i * 3 + 2); - return static_cast(static_cast(v0) | - (static_cast(v1) << 16) | - (static_cast(v2) << 32)); - })); - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + segment_size, - tmp_srcs.begin() + 
intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * num_chunks + k]); - } else { - auto input_first = std::get<0>(edgelist_partitioned_srcs[k][i]).begin() + segment_offset; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + segment_size, - tmp_srcs.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * num_chunks + k]); - } - } - } - for (size_t k = 0; k < num_chunks; ++k) { - if (store_v_in_48bit) { - std::get<1>(edgelist_partitioned_srcs[k][i]).resize(0, handle.get_stream()); - std::get<1>(edgelist_partitioned_srcs[k][i]).shrink_to_fit(handle.get_stream()); - } else { - std::get<0>(edgelist_partitioned_srcs[k][i]).resize(0, handle.get_stream()); - std::get<0>(edgelist_partitioned_srcs[k][i]).shrink_to_fit(handle.get_stream()); - } - } - edge_partition_edgelist_srcs.push_back(std::move(tmp_srcs)); - rmm::device_uvector tmp_dsts(edge_count, handle.get_stream()); + edge_partition_edgelist_compressed_dsts = + split_edge_chunk_compressed_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_compressed_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors, + compressed_v_size); + } else { + edge_partition_edgelist_srcs = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_srcs), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + + edge_partition_edgelist_dsts = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(edgelist_dsts), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + #if 1 - std::cout << comm_rank << ": i=" << i << " tmp_dsts allocated" << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 5" << std::endl; #endif - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < num_chunks; ++k) { - auto segment_offset = edgelist_edge_offset_vectors[k][i * major_comm_size + j] - - edgelist_edge_offset_vectors[k][i * major_comm_size]; - auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - if (store_v_in_48bit) { - auto input_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [uint16_first = std::get<1>(edgelist_partitioned_dsts[k][i]).begin() + - segment_offset * 3] __device__(size_t i) { - auto v0 = *(uint16_first + i * 3 + 0); - auto v1 = *(uint16_first + i * 3 + 1); - auto v2 = *(uint16_first + i * 3 + 2); - return static_cast(static_cast(v0) | - (static_cast(v1) << 16) | - (static_cast(v2) << 32)); - })); - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + segment_size, - tmp_dsts.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * num_chunks + k]); - } else { - auto input_first = std::get<0>(edgelist_partitioned_dsts[k][i]).begin() + segment_offset; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + segment_size, - tmp_dsts.begin() + intra_partition_segment_offsets[j] + - 
intra_segment_copy_output_displacements[j * num_chunks + k]); - } - } - } - for (size_t k = 0; k < num_chunks; ++k) { - if (store_v_in_48bit) { - std::get<1>(edgelist_partitioned_dsts[k][i]).resize(0, handle.get_stream()); - std::get<1>(edgelist_partitioned_dsts[k][i]).shrink_to_fit(handle.get_stream()); + + if (edgelist_weights) { + edge_partition_edgelist_weights = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_weights), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_ids) { + edge_partition_edgelist_edge_ids = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_ids), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } + if (edgelist_edge_types) { + edge_partition_edgelist_edge_types = + split_edge_chunk_elements_to_local_edge_partitions( + handle, + std::move(*edgelist_edge_types), + edgelist_edge_offset_vectors, + edge_partition_edge_counts, + edge_partition_intra_partition_segment_offset_vectors, + edge_partition_intra_segment_copy_output_displacement_vectors); + } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":create_graph_from_edgelist_impl 6" << std::endl; +#endif + + // 6. decompress edge chunk source/destination vertices to cut intermediate peak memory + // requirement + + if (compressed_v_size < sizeof(vertex_t)) { + assert(edge_partition_edgelist_compressed_srcs); + assert(edge_partition_edgelist_compressed_dsts); + edge_partition_edgelist_srcs.reserve(minor_comm_size); + edge_partition_edgelist_dsts.reserve(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + auto tmp_srcs = + try_allocate_dataframe_buffer(edge_partition_edge_counts[i], handle.get_stream()); + if (tmp_srcs) { + decompress_vertices( + handle, + raft::device_span((*edge_partition_edgelist_compressed_srcs)[i].data(), + (*edge_partition_edgelist_compressed_srcs)[i].size()), + raft::device_span((*tmp_srcs).data(), (*tmp_srcs).size()), + compressed_v_size); + // defer freeing (*edge_partition_edgelist_compressed_srcs)[i] to reduce memory + // fragmentation (pool allocator) + edge_partition_edgelist_srcs.push_back(std::move(*tmp_srcs)); } else { - std::get<0>(edgelist_partitioned_dsts[k][i]).resize(0, handle.get_stream()); - std::get<0>(edgelist_partitioned_dsts[k][i]).shrink_to_fit(handle.get_stream()); + break; } - } - edge_partition_edgelist_dsts.push_back(std::move(tmp_dsts)); - if (edge_partition_edgelist_weights) { - rmm::device_uvector tmp_weights(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < num_chunks; ++k) { - auto input_first = (*edgelist_partitioned_weights)[k][i].begin() + - (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - - edgelist_edge_offset_vectors[k][i * major_comm_size]); - auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + segment_size, - tmp_weights.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * num_chunks + k]); - } - } - for (size_t k = 0; k < num_chunks; ++k) { - 
(*edgelist_partitioned_weights)[k][i].resize(0, handle.get_stream()); - (*edgelist_partitioned_weights)[k][i].shrink_to_fit(handle.get_stream()); + auto tmp_dsts = + try_allocate_dataframe_buffer(edge_partition_edge_counts[i], handle.get_stream()); + if (tmp_dsts) { + decompress_vertices( + handle, + raft::device_span((*edge_partition_edgelist_compressed_dsts)[i].data(), + (*edge_partition_edgelist_compressed_dsts)[i].size()), + raft::device_span((*tmp_dsts).data(), (*tmp_dsts).size()), + compressed_v_size); + // defer freeing (*edge_partition_edgelist_compressed_dsts)[i] to reduce memory + // fragmentation (pool allocator) + edge_partition_edgelist_dsts.push_back(std::move(*tmp_dsts)); + } else { + break; } - (*edge_partition_edgelist_weights).push_back(std::move(tmp_weights)); } - if (edge_partition_edgelist_edge_ids) { - rmm::device_uvector tmp_edge_ids(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < num_chunks; ++k) { - auto input_first = (*edgelist_partitioned_edge_ids)[k][i].begin() + - (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - - edgelist_edge_offset_vectors[k][i * major_comm_size]); - auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + segment_size, - tmp_edge_ids.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * num_chunks + k]); - } + auto num_src_allocs = edge_partition_edgelist_srcs.size(); + auto num_dst_allocs = edge_partition_edgelist_dsts.size(); + if ((num_src_allocs < static_cast(minor_comm_size)) || + (num_dst_allocs < static_cast(minor_comm_size))) { + std::vector> h_edge_partition_edgelist_compressed_srcs( + static_cast(minor_comm_size) - num_src_allocs); + std::vector> h_edge_partition_edgelist_compressed_dsts( + static_cast(minor_comm_size) - num_dst_allocs); + for (size_t i = 0; i < static_cast(minor_comm_size) - num_src_allocs; ++i) { + h_edge_partition_edgelist_compressed_srcs[i].resize( + edge_partition_edge_counts[num_src_allocs + i] * compressed_v_size); + raft::update_host(h_edge_partition_edgelist_compressed_srcs[i].data(), + (*edge_partition_edgelist_compressed_srcs)[num_src_allocs + i].data(), + (*edge_partition_edgelist_compressed_srcs)[num_src_allocs + i].size(), + handle.get_stream()); } - for (size_t k = 0; k < num_chunks; ++k) { - (*edgelist_partitioned_edge_ids)[k][i].resize(0, handle.get_stream()); - (*edgelist_partitioned_edge_ids)[k][i].shrink_to_fit(handle.get_stream()); + for (size_t i = 0; i < static_cast(minor_comm_size) - num_dst_allocs; ++i) { + h_edge_partition_edgelist_compressed_dsts[i].resize( + edge_partition_edge_counts[num_dst_allocs + i] * compressed_v_size); + raft::update_host(h_edge_partition_edgelist_compressed_dsts[i].data(), + (*edge_partition_edgelist_compressed_dsts)[num_dst_allocs + i].data(), + (*edge_partition_edgelist_compressed_dsts)[num_dst_allocs + i].size(), + handle.get_stream()); } - (*edge_partition_edgelist_edge_ids).push_back(std::move(tmp_edge_ids)); - } - - if (edge_partition_edgelist_edge_types) { - rmm::device_uvector tmp_edge_types(edge_count, handle.get_stream()); - for (int j = 0; j < major_comm_size; ++j) { - for (size_t k = 0; k < num_chunks; ++k) { - auto input_first = (*edgelist_partitioned_edge_types)[k][i].begin() + - (edgelist_edge_offset_vectors[k][i * major_comm_size + j] - - edgelist_edge_offset_vectors[k][i * 
major_comm_size]); - auto segment_size = edgelist_edge_offset_vectors[k][i * major_comm_size + j + 1] - - edgelist_edge_offset_vectors[k][i * major_comm_size + j]; - thrust::copy(handle.get_thrust_policy(), - input_first, - input_first + segment_size, - tmp_edge_types.begin() + intra_partition_segment_offsets[j] + - intra_segment_copy_output_displacements[j * num_chunks + k]); + (*edge_partition_edgelist_compressed_srcs).clear(); + (*edge_partition_edgelist_compressed_dsts).clear(); + for (size_t i = 0; + i < static_cast(minor_comm_size) - std::min(num_src_allocs, num_dst_allocs); + ++i) { + if (i < static_cast(minor_comm_size) - num_src_allocs) { + edge_partition_edgelist_srcs.push_back(rmm::device_uvector( + edge_partition_edge_counts[num_src_allocs + i], handle.get_stream())); + } + if (i < static_cast(minor_comm_size) - num_dst_allocs) { + edge_partition_edgelist_dsts.push_back(rmm::device_uvector( + edge_partition_edge_counts[num_dst_allocs + i], handle.get_stream())); } } - for (size_t k = 0; k < num_chunks; ++k) { - (*edgelist_partitioned_edge_types)[k][i].resize(0, handle.get_stream()); - (*edgelist_partitioned_edge_types)[k][i].shrink_to_fit(handle.get_stream()); + for (size_t i = 0; + i < static_cast(minor_comm_size) - std::min(num_src_allocs, num_dst_allocs); + ++i) { + if (i < static_cast(minor_comm_size) - num_src_allocs) { + rmm::device_uvector tmp_bytes( + edge_partition_edge_counts[num_src_allocs + i] * compressed_v_size, + handle.get_stream()); + raft::update_device(tmp_bytes.data(), + h_edge_partition_edgelist_compressed_srcs[i].data(), + edge_partition_edge_counts[num_src_allocs + i] * compressed_v_size, + handle.get_stream()); + decompress_vertices( + handle, + raft::device_span(tmp_bytes.data(), tmp_bytes.size()), + raft::device_span(edge_partition_edgelist_srcs[num_src_allocs + i].data(), + edge_partition_edgelist_srcs[num_src_allocs + i].size()), + compressed_v_size); + } + if (i < static_cast(minor_comm_size) - num_dst_allocs) { + rmm::device_uvector tmp_bytes( + edge_partition_edge_counts[num_dst_allocs + i] * compressed_v_size, + handle.get_stream()); + raft::update_device(tmp_bytes.data(), + h_edge_partition_edgelist_compressed_dsts[i].data(), + edge_partition_edge_counts[num_dst_allocs + i] * compressed_v_size, + handle.get_stream()); + decompress_vertices( + handle, + raft::device_span(tmp_bytes.data(), tmp_bytes.size()), + raft::device_span(edge_partition_edgelist_dsts[num_dst_allocs + i].data(), + edge_partition_edgelist_dsts[num_dst_allocs + i].size()), + compressed_v_size); + } } - (*edge_partition_edgelist_edge_types).push_back(std::move(tmp_edge_types)); - } - edgelist_intra_partition_segment_offset_vectors[i] = std::move(intra_partition_segment_offsets); + handle.sync_stream(); + } } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 3" << std::endl; + std::cout << comm_rank << ":create_graph_from_edgelist_impl 7" << std::endl; #endif return create_graph_from_partitioned_edgelist compute_sparse_offsets( bool edgelist_major_sorted, rmm::cuda_stream_view stream_view) { - rmm::device_uvector offsets((major_range_last - major_range_first) + 1, stream_view); + rmm::device_uvector offsets(static_cast(major_range_last - major_range_first) + 1, + stream_view); if (edgelist_major_sorted) { offsets.set_element_to_zero_async(0, stream_view); thrust::upper_bound(rmm::exec_policy(stream_view), @@ -77,7 +78,9 @@ rmm::device_uvector compute_sparse_offsets( edgelist_major_first, edgelist_major_last, [offset_view, 
major_range_first] __device__(auto v) { - atomicAdd(&offset_view[v - major_range_first], edge_t{1}); + cuda::atomic_ref atomic_counter( + offset_view[v - major_range_first]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); }); thrust::exclusive_scan( @@ -246,30 +249,112 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, rmm::device_uvector offsets(0, stream_view); rmm::device_uvector indices(0, stream_view); - auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); if (edgelist_minors.size() > mem_frugal_threshold) { - offsets = compute_sparse_offsets(edgelist_majors.begin(), - edgelist_majors.end(), - major_range_first, - major_range_last, - false, - stream_view); + static_assert((sizeof(vertex_t) == 4) || (sizeof(vertex_t) == 8)); + if ((sizeof(vertex_t) == 8) && (static_cast(major_range_last - major_range_first) <= + static_cast(std::numeric_limits::max()))) { + rmm::device_uvector edgelist_major_offsets(edgelist_majors.size(), stream_view); + thrust::transform( + rmm::exec_policy_nosync(stream_view), + edgelist_majors.begin(), + edgelist_majors.end(), + edgelist_major_offsets.begin(), + cuda::proclaim_return_type([major_range_first] __device__(vertex_t major) { + return static_cast(major - major_range_first); + })); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); + + offsets = + compute_sparse_offsets(edgelist_major_offsets.begin(), + edgelist_major_offsets.end(), + uint32_t{0}, + static_cast(major_range_last - major_range_first), + false, + stream_view); + std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + offsets.begin(), + offsets.end(), + static_cast((edgelist_major_offsets.size() * (i + 1)) / 4)))); + } - auto pivot = major_range_first + static_cast(thrust::distance( - offsets.begin(), - thrust::lower_bound(rmm::exec_policy(stream_view), - offsets.begin(), - offsets.end(), - edgelist_minors.size() / 2))); - auto second_first = - detail::mem_frugal_partition(edge_first, - edge_first + edgelist_minors.size(), - thrust_tuple_get, 0>{}, - pivot, - stream_view); - thrust::sort(rmm::exec_policy(stream_view), edge_first, second_first); - thrust::sort(rmm::exec_policy(stream_view), second_first, edge_first + edgelist_minors.size()); + auto pair_first = + thrust::make_zip_iterator(edgelist_major_offsets.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(pair_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(pair_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + pair_first + edgelist_major_offsets.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), pair_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), + last_quarter_first, + pair_first + edgelist_major_offsets.size()); + } else { + offsets = compute_sparse_offsets(edgelist_majors.begin(), + edgelist_majors.end(), + major_range_first, + major_range_last, + false, + stream_view); + 
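        // [editorial sketch, not part of the original patch] The three pivots computed below are
        // the majors at which the running edge count crosses 1/4, 2/4 and 3/4 of the total, so
        // each of the four independently sorted partitions carries roughly a quarter of the
        // edges; this roughly quarters the sort's temporary memory at the cost of the extra
        // partitioning passes. A host-side analogue of the device-side lower_bound calls
        // (names and types here are illustrative only):
        //
        //   std::array<size_t, 3> quartile_pivots(std::vector<size_t> const& prefix_sums)
        //   {
        //     std::array<size_t, 3> pivots{};
        //     auto total = prefix_sums.empty() ? size_t{0} : prefix_sums.back();
        //     for (size_t q = 0; q < 3; ++q) {
        //       auto it =
        //         std::lower_bound(prefix_sums.begin(), prefix_sums.end(), (total * (q + 1)) / 4);
        //       pivots[q] = static_cast<size_t>(std::distance(prefix_sums.begin(), it));
        //     }
        //     return pivots;
        //   }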
std::array pivots{}; + for (size_t i = 0; i < 3; ++i) { + pivots[i] = + major_range_first + + static_cast(thrust::distance( + offsets.begin(), + thrust::lower_bound(rmm::exec_policy(stream_view), + offsets.begin(), + offsets.end(), + static_cast((edgelist_minors.size() * (i + 1)) / 4)))); + } + auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); + auto second_half_first = + detail::mem_frugal_partition(edge_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[1], + stream_view); + auto second_quarter_first = + detail::mem_frugal_partition(edge_first, + second_half_first, + thrust_tuple_get, 0>{}, + pivots[0], + stream_view); + auto last_quarter_first = + detail::mem_frugal_partition(second_half_first, + edge_first + edgelist_majors.size(), + thrust_tuple_get, 0>{}, + pivots[2], + stream_view); + thrust::sort(rmm::exec_policy(stream_view), edge_first, second_quarter_first); + thrust::sort(rmm::exec_policy(stream_view), second_quarter_first, second_half_first); + thrust::sort(rmm::exec_policy(stream_view), second_half_first, last_quarter_first); + thrust::sort( + rmm::exec_policy(stream_view), last_quarter_first, edge_first + edgelist_majors.size()); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); + } } else { + auto edge_first = thrust::make_zip_iterator(edgelist_majors.begin(), edgelist_minors.begin()); thrust::sort(rmm::exec_policy(stream_view), edge_first, edge_first + edgelist_minors.size()); offsets = compute_sparse_offsets(edgelist_majors.begin(), edgelist_majors.end(), @@ -277,12 +362,11 @@ sort_and_compress_edgelist(rmm::device_uvector&& edgelist_srcs, major_range_last, true, stream_view); + edgelist_majors.resize(0, stream_view); + edgelist_majors.shrink_to_fit(stream_view); } indices = std::move(edgelist_minors); - edgelist_majors.resize(0, stream_view); - edgelist_majors.shrink_to_fit(stream_view); - std::optional> dcs_nzd_vertices{std::nullopt}; if (major_hypersparse_first) { std::tie(offsets, dcs_nzd_vertices) = compress_hypersparse_offsets(std::move(offsets), diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index e4958e65d94..35b0df404d0 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -323,6 +323,10 @@ std::tuple, std::vector, vertex_t> compu handle.sync_stream(); for (size_t i = 0; i < num_bins; ++i) { +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 0-1 i=" << i << std::endl; +#endif std::vector> edge_partition_tmp_majors{}; // for bin "i" { edge_partition_tmp_majors.reserve(edgelist_majors.size()); @@ -357,37 +361,55 @@ std::tuple, std::vector, vertex_t> compu } } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 0-2 i=" << i << std::endl; +#endif rmm::device_uvector tmp_minors(0, handle.get_stream()); { - edge_t bin_size{0}; - if (edge_minor_count_vectors) { - for (size_t j = 0; j < edgelist_minors.size(); ++j) { - bin_size += (*edge_minor_count_vectors)[j][i]; - } - } else { - bin_size = std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - } - tmp_minors.resize(bin_size, handle.get_stream()); - - edge_t offset{0}; + std::vector> edge_partition_tmp_minors{}; // for bin "i" + edge_partition_tmp_minors.reserve(edgelist_minors.size()); for (size_t j = 0; j < edgelist_minors.size(); ++j) { + rmm::device_uvector tmp_minors(0, 
handle.get_stream()); if (num_bins > 1) { + tmp_minors.resize((*edge_minor_count_vectors)[j][i], handle.get_stream()); thrust::copy_if(handle.get_thrust_policy(), edgelist_minors[j], edgelist_minors[j] + edgelist_edge_counts[j], - tmp_minors.begin() + offset, + tmp_minors.begin(), [i] __device__(auto v) { cuco::detail::MurmurHash3_32 hash_func{hash_seed}; return (static_cast(hash_func(v) % num_bins) == i); }); - offset += (*edge_minor_count_vectors)[j][i]; } else { + tmp_minors.resize(edgelist_edge_counts[j], handle.get_stream()); thrust::copy(handle.get_thrust_policy(), edgelist_minors[j], edgelist_minors[j] + edgelist_edge_counts[j], - tmp_minors.begin() + offset); - offset += edgelist_edge_counts[j]; + tmp_minors.begin()); } + thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); + tmp_minors.resize( + thrust::distance( + tmp_minors.begin(), + thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), + handle.get_stream()); + tmp_minors.shrink_to_fit(handle.get_stream()); + + edge_partition_tmp_minors.push_back(std::move(tmp_minors)); + } + edge_t aggregate_size{0}; + for (size_t i = 0; i < edge_partition_tmp_minors.size(); ++i) { + aggregate_size += edge_partition_tmp_minors[i].size(); + } + tmp_minors.resize(aggregate_size, handle.get_stream()); + size_t output_offset{0}; + for (size_t i = 0; i < edge_partition_tmp_minors.size(); ++i) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_minors[i].begin(), + edge_partition_tmp_minors[i].end(), + tmp_minors.begin() + output_offset); + output_offset += edge_partition_tmp_minors[i].size(); } thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); tmp_minors.resize( @@ -398,6 +420,11 @@ std::tuple, std::vector, vertex_t> compu tmp_minors.shrink_to_fit(handle.get_stream()); } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 0-3 i=" << i << std::endl; +#endif + // FIXME: this can be a memory footprint bottleneck if major_comm_size is large. 
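        // [editorial sketch, not part of the original patch] Minor vertices are handled in
        // num_bins hash bins to bound peak memory: a vertex v falls in bin hash(v) % num_bins,
        // so each pass copies, sorts and deduplicates only about 1/num_bins of the endpoints
        // before the per-bin results are merged. The binning rule, shown host-side with a
        // standard hash in place of the device-side MurmurHash3 used above (names illustrative):
        //
        //   size_t bin_of(uint64_t v, size_t num_bins)
        //   {
        //     std::hash<uint64_t> hash_func{};
        //     return static_cast<size_t>(hash_func(v) % num_bins);
        //   }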
rmm::device_uvector tmp_vertices(0, handle.get_stream()); if (multi_gpu && (handle.get_comms().get_size() > 1)) { auto& comm = handle.get_comms(); @@ -467,7 +494,7 @@ std::tuple, std::vector, vertex_t> compu tmp_minors.shrink_to_fit(handle.get_stream()); // single shuffle_values() on comm instead of one shuffle_values() on minor_comm & one - // shuffle_values() on majro_comm (to cut NCCL P2P buffer size) + // shuffle_values() on major_comm (to cut NCCL P2P buffer size) std::tie(tmp_vertices, std::ignore) = shuffle_values(comm, tmp_vertices.begin(), tx_counts, handle.get_stream()); thrust::sort(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end()); @@ -501,6 +528,10 @@ std::tuple, std::vector, vertex_t> compu tmp_vertices = std::move(merged_vertices); } +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":compute_renumber_map 0-4 i=" << i << std::endl; +#endif if (sorted_local_vertices.size() == 0) { sorted_local_vertices = std::move(tmp_vertices); } else { @@ -544,12 +575,6 @@ std::tuple, std::vector, vertex_t> compu rmm::device_uvector sorted_local_vertex_degrees(0, handle.get_stream()); - auto constexpr num_chunks = size_t{ - 2}; // tuning parameter, this trade-offs # binary searches (up to num_chunks times more - // binary searches can be necessary if num_unique_majors << edgelist_edge_counts[i]) and - // temporary buffer requirement (cut by num_chunks times), currently set to 2 to avoid - // peak memory usage happening in this part (especially when minor_comm_size is small) - if constexpr (multi_gpu) { auto& comm = handle.get_comms(); auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -561,99 +586,37 @@ std::tuple, std::vector, vertex_t> compu auto edge_partition_major_range_sizes = host_scalar_allgather(minor_comm, sorted_local_vertices.size(), handle.get_stream()); - std::optional> stream_pool_indices{std::nullopt}; - - if ((minor_comm_size >= 2) && (handle.get_stream_pool_size() >= 2)) { - edge_t num_local_edges = - std::reduce(edgelist_edge_counts.begin(), edgelist_edge_counts.end()); - - auto vertex_edge_counts = host_scalar_allreduce( - comm, - thrust::make_tuple(static_cast(sorted_local_vertices.size()), num_local_edges), - raft::comms::op_t::SUM, - handle.get_stream()); - // memory footprint vs parallelism trade-off - // peak memory requirement per loop is approximately - // (V/P) * (sizeof(vertex_t) + sizeof(edge_t)) + - // (E / (comm_size * minor_comm_size)) / num_chunks * sizeof(vertex_t) * 2 + - // std::min(V/P, (E / (comm_size * minor_comm_size)) / num_chunks) * (sizeof(vertex_t) + - // sizeof(edge_t)) - // and limit temporary memory requirement to (E / comm_size) * sizeof(vertex_t) - auto avg_vertex_degree = thrust::get<0>(vertex_edge_counts) > 0 - ? static_cast(thrust::get<1>(vertex_edge_counts)) / - static_cast(thrust::get<0>(vertex_edge_counts)) - : double{0.0}; - auto num_streams = static_cast( - (avg_vertex_degree * sizeof(vertex_t)) / - (static_cast(sizeof(vertex_t) + sizeof(edge_t)) + - (((avg_vertex_degree / minor_comm_size) / num_chunks) * sizeof(vertex_t) * 2) + - (std::min(1.0, ((avg_vertex_degree / minor_comm_size) / num_chunks)) * - (sizeof(vertex_t) + sizeof(edge_t))))); - if (num_streams >= 2) { - stream_pool_indices = std::vector(num_streams); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - handle.sync_stream(); - } - } - for (int i = 0; i < minor_comm_size; ++i) { - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool(i % (*stream_pool_indices).size()) - : handle.get_stream(); - - rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], loop_stream); + rmm::device_uvector sorted_majors(edge_partition_major_range_sizes[i], + handle.get_stream()); device_bcast(minor_comm, sorted_local_vertices.data(), sorted_majors.data(), edge_partition_major_range_sizes[i], i, - loop_stream); + handle.get_stream()); - rmm::device_uvector sorted_major_degrees(sorted_majors.size(), loop_stream); - thrust::fill(rmm::exec_policy(loop_stream), + rmm::device_uvector sorted_major_degrees(sorted_majors.size(), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), sorted_major_degrees.begin(), sorted_major_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, loop_stream); - tmp_majors.reserve( - (static_cast(edgelist_edge_counts[i]) + (num_chunks - 1)) / num_chunks, - loop_stream); - size_t offset{0}; - for (size_t j = 0; j < num_chunks; ++j) { - size_t this_chunk_size = - std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[i]) - offset); - tmp_majors.resize(this_chunk_size, loop_stream); - thrust::copy(rmm::exec_policy(loop_stream), - edgelist_majors[i] + offset, - edgelist_majors[i] + offset + tmp_majors.size(), - tmp_majors.begin()); - thrust::sort(rmm::exec_policy(loop_stream), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(rmm::exec_policy(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, loop_stream); - rmm::device_uvector tmp_values(num_unique_majors, loop_stream); - thrust::reduce_by_key(rmm::exec_policy(loop_stream), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(rmm::exec_policy(loop_stream), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_increment_degree_t{ - sorted_majors.data(), - static_cast(sorted_majors.size()), - sorted_major_degrees.data()}); - offset += this_chunk_size; - } + thrust::for_each( + handle.get_thrust_policy(), + edgelist_majors[i], + edgelist_majors[i] + edgelist_edge_counts[i], + [sorted_majors = + raft::device_span(sorted_majors.data(), sorted_majors.size()), + sorted_major_degrees = raft::device_span( + sorted_major_degrees.data(), sorted_major_degrees.size())] __device__(auto major) { + auto it = + thrust::lower_bound(thrust::seq, sorted_majors.begin(), sorted_majors.end(), major); + assert((it != sorted_majors.end()) && (*it == major)); + cuda::atomic_ref atomic_counter( + sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); device_reduce(minor_comm, sorted_major_degrees.begin(), @@ -661,11 +624,9 @@ std::tuple, std::vector, vertex_t> compu edge_partition_major_range_sizes[i], raft::comms::op_t::SUM, i, - loop_stream); + handle.get_stream()); if (i == minor_comm_rank) { sorted_local_vertex_degrees = std::move(sorted_major_degrees); } } - - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } else { assert(edgelist_majors.size() == 1); @@ -675,44 +636,21 @@ std::tuple, std::vector, vertex_t> compu sorted_local_vertex_degrees.end(), edge_t{0}); - rmm::device_uvector tmp_majors(0, 
handle.get_stream()); - tmp_majors.reserve(static_cast(edgelist_edge_counts[0] + (num_chunks - 1)) / num_chunks, - handle.get_stream()); - size_t offset{0}; - for (size_t i = 0; i < num_chunks; ++i) { - size_t this_chunk_size = - std::min(tmp_majors.capacity(), static_cast(edgelist_edge_counts[0]) - offset); - tmp_majors.resize(this_chunk_size, handle.get_stream()); - thrust::copy(handle.get_thrust_policy(), - edgelist_majors[0] + offset, - edgelist_majors[0] + offset + tmp_majors.size(), - tmp_majors.begin()); - thrust::sort(handle.get_thrust_policy(), tmp_majors.begin(), tmp_majors.end()); - auto num_unique_majors = - thrust::count_if(handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(tmp_majors.size()), - is_first_in_run_t{tmp_majors.data()}); - rmm::device_uvector tmp_keys(num_unique_majors, handle.get_stream()); - rmm::device_uvector tmp_values(num_unique_majors, handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - tmp_majors.begin(), - tmp_majors.end(), - thrust::make_constant_iterator(edge_t{1}), - tmp_keys.begin(), - tmp_values.begin()); - - auto kv_pair_first = - thrust::make_zip_iterator(thrust::make_tuple(tmp_keys.begin(), tmp_values.begin())); - thrust::for_each(handle.get_thrust_policy(), - kv_pair_first, - kv_pair_first + tmp_keys.size(), - search_and_increment_degree_t{ - sorted_local_vertices.data(), - static_cast(sorted_local_vertices.size()), - sorted_local_vertex_degrees.data()}); - offset += this_chunk_size; - } + thrust::for_each(handle.get_thrust_policy(), + edgelist_majors[0], + edgelist_majors[0] + edgelist_edge_counts[0], + [sorted_majors = raft::device_span( + sorted_local_vertices.data(), sorted_local_vertices.size()), + sorted_major_degrees = raft::device_span( + sorted_local_vertex_degrees.data(), + sorted_local_vertex_degrees.size())] __device__(auto major) { + auto it = thrust::lower_bound( + thrust::seq, sorted_majors.begin(), sorted_majors.end(), major); + assert((it != sorted_majors.end()) && (*it == major)); + cuda::atomic_ref atomic_counter( + sorted_major_degrees[thrust::distance(sorted_majors.begin(), it)]); + atomic_counter.fetch_add(edge_t{1}, cuda::std::memory_order_relaxed); + }); } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -1175,11 +1113,16 @@ renumber_edgelist( << " edgelist_intra_partition_segment_offsets.has_value()=" << edgelist_intra_partition_segment_offsets.has_value() << std::endl; #endif - if ((static_cast(partition.local_edge_partition_minor_range_size() * - 2.5 /* tuning parameter */) >= - static_cast(number_of_edges / comm_size)) && - edgelist_intra_partition_segment_offsets) { // memory footprint dominated by the O(V/sqrt(P)) - // part than the O(E/P) part + double approx_mem_requirements = + static_cast(partition.local_edge_partition_minor_range_size()) * + (static_cast( + sizeof(vertex_t)) /* rmm::device_uvector renumber_map_minor_labels */ + + + static_cast(sizeof(vertex_t) * 2) * + 1.5 /* kv_store_t renumber_map, * 1.5 to consider load factor */); + if ((approx_mem_requirements > + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) && + edgelist_intra_partition_segment_offsets) { #if 1 std::cout << comm_rank << "path A" << std::endl; #endif @@ -1235,13 +1178,16 @@ renumber_edgelist( recvcounts[i] = partition.vertex_partition_range_size(minor_range_vertex_partition_id); } std::vector displacements(recvcounts.size(), 0); - std::partial_sum(recvcounts.begin(), recvcounts.end() - 1, displacements.begin() + 1); + 
std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); device_allgatherv(major_comm, - renumber_map_labels.begin(), - renumber_map_minor_labels.begin(), + renumber_map_labels.data(), + renumber_map_minor_labels.data(), recvcounts, displacements, handle.get_stream()); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cout << comm_rank << ":" + << " device_allgatherv finished." << std::endl; kv_store_t renumber_map( renumber_map_minor_labels.begin(), From 9d002c57965935048a3ece6bb1c8564c700a7e9a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 22 Sep 2024 23:59:41 -0700 Subject: [PATCH 080/126] use approximation in swithcing between topdown & bottomup --- cpp/src/traversal/bfs_impl.cuh | 260 +++++++++++++++++++++++++-------- 1 file changed, 203 insertions(+), 57 deletions(-) diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 6afa5505af5..e365b340235 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -161,14 +161,27 @@ void bfs(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { is_sorted = static_cast(host_scalar_allreduce(handle.get_comms(), static_cast(is_sorted), - raft::comms::op_t::SUM, + raft::comms::op_t::MIN, handle.get_stream())); } - CUGRAPH_EXPECTS( is_sorted, "Invalid input arguments: input sources should be sorted in the non-descending order."); + bool no_duplicates = (static_cast(thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(n_sources), + is_first_in_run_t{sources})) == n_sources); + if constexpr (GraphViewType::is_multi_gpu) { + no_duplicates = static_cast(host_scalar_allreduce(handle.get_comms(), + static_cast(no_duplicates), + raft::comms::op_t::MIN, + handle.get_stream())); + } + CUGRAPH_EXPECTS(no_duplicates, + "Invalid input arguments: input sources should not have duplicates."); + auto num_invalid_vertices = thrust::count_if(handle.get_thrust_policy(), sources, @@ -210,35 +223,106 @@ void bfs(raft::handle_t const& handle, auto prep1 = std::chrono::steady_clock::now(); #endif - constexpr edge_t direction_optimizing_alpha = 14; + constexpr double direction_optimizing_alpha = 14.0; constexpr vertex_t direction_optimizing_beta = 24; - std::optional> out_degrees{std::nullopt}; + std::optional> approx_out_degrees{std::nullopt}; std::optional> nzd_unvisited_vertices{std::nullopt}; if (direction_optimizing) { - // FIXME: if this becomes the main performance bottleneck, we may just approximate this. 
- out_degrees = graph_view.compute_out_degrees(handle); - nzd_unvisited_vertices = rmm::device_uvector( - graph_view.local_vertex_partition_range_size(), handle.get_stream()); - (*nzd_unvisited_vertices) - .resize(thrust::distance( - (*nzd_unvisited_vertices).begin(), - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last()), - (*nzd_unvisited_vertices).begin(), - [vertex_partition, - sources = raft::device_span(sources, n_sources), - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = - vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return (out_degrees[v_offset] > edge_t{0}) && - !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v); - })), - handle.get_stream()); - (*nzd_unvisited_vertices).shrink_to_fit(handle.get_stream()); + size_t partition_idx{0}; + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + partition_idx = static_cast(minor_comm_rank); + partition_size = static_cast(minor_comm_size); + } + + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { // exploit internal knowedge for exhaustive performance optimization for + // large-scale benchmarking (the else path is sufficient for small + // clusters with few tens of GPUs) + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_mask_view = graph_view.edge_mask_view(); + auto edge_partition_e_mask = + edge_mask_view + ? 
thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + auto high_and_mid_degree_segment_size = + (*segment_offsets)[2]; // compute local degrees for high & mid degree segments only, for + // low & hypersparse segments, use low_degree_threshold * + // partition_size * 0.5 & partition_size * + // hypersparse_threshold_ratio * 0.5 as approximate out degrees + if (edge_partition_e_mask) { + approx_out_degrees = edge_partition.compute_local_degrees_with_mask( + (*edge_partition_e_mask).value_first(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } else { + approx_out_degrees = edge_partition.compute_local_degrees( + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + high_and_mid_degree_segment_size, + handle.get_stream()); + } + thrust::transform(handle.get_thrust_policy(), + (*approx_out_degrees).begin(), + (*approx_out_degrees).end(), + (*approx_out_degrees).begin(), + multiplier_t{static_cast( + partition_size)}); // local_degrees => approximate global degrees + } else { + approx_out_degrees = graph_view.compute_out_degrees(handle); // exact + } + if (segment_offsets) { + auto num_visited_nzd_vertices = static_cast( + thrust::count_if(handle.get_thrust_policy(), + sources, + sources + n_sources, + [nzd_v_last = graph_view.local_vertex_partition_range_first() + + *((*segment_offsets).rbegin() + 1)] __device__(auto v) { + return (v < nzd_v_last) ? true : false; + })); + nzd_unvisited_vertices = rmm::device_uvector( + *((*segment_offsets).rbegin() + 1) - num_visited_nzd_vertices, handle.get_stream()); + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + + *((*segment_offsets).rbegin() + 1), + (*nzd_unvisited_vertices).begin(), + [vertex_partition, + sources = raft::device_span(sources, n_sources)] __device__(vertex_t v) { + return !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v); + }); + } else { + nzd_unvisited_vertices = rmm::device_uvector( + graph_view.local_vertex_partition_range_size(), handle.get_stream()); + auto valid_last = thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last()), + (*nzd_unvisited_vertices).begin(), + [vertex_partition, + sources = raft::device_span(sources, n_sources), + out_degrees /* exact */ = raft::device_span( + (*approx_out_degrees).data(), (*approx_out_degrees).size())] __device__(vertex_t v) { + auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); + return (out_degrees[v_offset] > edge_t{0}) && + !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v); + }); + (*nzd_unvisited_vertices) + .resize(thrust::distance((*nzd_unvisited_vertices).begin(), valid_last), + handle.get_stream()); + (*nzd_unvisited_vertices).shrink_to_fit(handle.get_stream()); + } } // 4. 
initialize BFS frontier @@ -372,23 +456,6 @@ void bfs(raft::handle_t const& handle, #endif if (direction_optimizing) { - // FIXME: computing m_f & updating nzd_unvisited_vertices & computing m_u can be executed - // concurrently. - // FIXME: also the above fill_edge_dst_property can be executed concurrently. - auto m_f = thrust::transform_reduce( - handle.get_thrust_policy(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); - { rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), handle.get_stream()); @@ -404,19 +471,92 @@ void bfs(raft::handle_t const& handle, nzd_unvisited_vertices = std::move(tmp_vertices); } - auto m_u = thrust::transform_reduce( - handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*out_degrees).data(), (*out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; - }), - edge_t{0}, - thrust::plus{}); + double m_f{0.0}; + double m_u{0.0}; + { + size_t partition_idx{0}; + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + partition_idx = static_cast(minor_comm_rank); + partition_size = static_cast(minor_comm_size); + } + + auto f_vertex_first = vertex_frontier.bucket(bucket_idx_next).begin(); + auto f_vertex_last = vertex_frontier.bucket(bucket_idx_next).end(); + auto u_vertex_first = (*nzd_unvisited_vertices).begin(); + auto u_vertex_last = (*nzd_unvisited_vertices).end(); + auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + // FIXME: this actually over-estimates for graphs with power-law degree distribution + auto approx_low_segment_degree = + static_cast(low_degree_threshold * partition_size) * 0.5; + auto approx_hypersparse_segment_degree = + static_cast(partition_size) * hypersparse_threshold_ratio * 0.5; + auto f_segment_offsets = compute_key_segment_offsets( + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + f_vertex_last = f_vertex_first + f_segment_offsets[2]; + m_f = static_cast((f_segment_offsets[3] - f_segment_offsets[2])) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_f += static_cast(f_segment_offsets[4] - f_segment_offsets[3]) * + approx_hypersparse_segment_degree; + } + + auto u_segment_offsets = compute_key_segment_offsets( + (*nzd_unvisited_vertices).begin(), + (*nzd_unvisited_vertices).end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + u_vertex_last = u_vertex_first + u_segment_offsets[2]; + m_u = static_cast((u_segment_offsets[3] - 
u_segment_offsets[2])) * + approx_low_segment_degree; + if (graph_view.use_dcs()) { + m_u += static_cast(u_segment_offsets[4] - u_segment_offsets[3]) * + approx_hypersparse_segment_degree; + } + } + + m_f += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + f_vertex_first, + f_vertex_last, + cuda::proclaim_return_type( + [vertex_partition, + out_degrees = raft::device_span( + (*approx_out_degrees).data(), + (*approx_out_degrees).size())] __device__(vertex_t v) { + auto v_offset = + vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); + return out_degrees[v_offset]; + }), + edge_t{0}, + thrust::plus{})); + + m_u += static_cast(thrust::transform_reduce( + handle.get_thrust_policy(), + u_vertex_first, + u_vertex_last, + cuda::proclaim_return_type( + [vertex_partition, + out_degrees = raft::device_span( + (*approx_out_degrees).data(), + (*approx_out_degrees).size())] __device__(vertex_t v) { + auto v_offset = + vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); + return out_degrees[v_offset]; + }), + edge_t{0}, + thrust::plus{})); + } + auto aggregate_m_f = GraphViewType::is_multi_gpu ? host_scalar_allreduce( @@ -427,6 +567,12 @@ void bfs(raft::handle_t const& handle, ? host_scalar_allreduce( handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) : m_u; + std::cout << comm_rank << ":m_f=" << m_f << " m_u=" << m_u + << " aggregate_m_f * direction_optimzing_alpha=" + << aggregate_m_f * direction_optimizing_alpha + << " aggregate_m_u=" << aggregate_m_u + << " cur_aggregate_frontier_size=" << cur_aggregate_frontier_size + << " next_aggregate_frontier_size=" << next_aggregate_frontier_size << std::endl; if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && (next_aggregate_frontier_size >= cur_aggregate_frontier_size)) { top_down = false; From 9e3574e10445f4dd3a95f7ebd3b2c64e2172a971 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 23 Sep 2024 15:58:19 -0700 Subject: [PATCH 081/126] update logging --- .../detail/extract_transform_v_frontier_e.cuh | 4 +- .../prims/detail/per_v_transform_reduce_e.cuh | 5 +- cpp/src/prims/fill_edge_src_dst_property.cuh | 7 +-- .../create_graph_from_edgelist_impl.cuh | 33 +++++------ cpp/src/structure/renumber_edgelist_impl.cuh | 57 ++++++++++++------- cpp/src/traversal/bfs_impl.cuh | 13 +++-- 6 files changed, 65 insertions(+), 54 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index d7828375543..7adca3e226c 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -1185,7 +1185,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration subdur7 = subtime8 - subtime7; std::chrono::duration subdur8 = subtime9 - subtime8; std::chrono::duration subdur9 = subtime10 - subtime9; - std::cout << "sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() + std::cout << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << ")" << std::endl; @@ -1248,7 +1248,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - 
time2; - std::cout << "\t\tdetail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() + std::cout << "\t\t" << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 753fbc05157..ed2a3fa35f4 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1376,7 +1376,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, VertexValueOutputIterator vertex_value_output_first) { #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto const comm_rank = handle.get_comms().get_rank(); RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time0 = std::chrono::steady_clock::now(); #endif @@ -2640,7 +2639,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration subdur14 = subtime15 - subtime14; std::chrono::duration subdur15 = subtime16 - subtime15; std::chrono::duration subdur16 = subtime17 - subtime16; - std::cout << comm_rank << ":sub took (" << subdur0.count() << "," << subdur1.count() << "," + std::cout << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," @@ -2766,7 +2765,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\t\t" << comm_rank << ":detail::per_v (prep, ep, comm) took (" << dur0.count() + std::cout << "\t\t" << "detail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif } diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 553dfd521a6..36e1b92896a 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -381,8 +381,7 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t0 = std::chrono::steady_clock::now num_concurrent_bcasts = std::min(num_concurrent_bcasts, handle.get_stream_pool_size()); num_concurrent_bcasts = std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); - std::cout << comm.get_rank() << ":" - << " v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," + std::cout << "v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl; @@ -559,7 +558,7 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub3 = std::chrono::steady_clock::n std::chrono::duration subdur0 = sub1 - sub0; std::chrono::duration subdur1 = sub2 - sub1; std::chrono::duration subdur2 = sub3 - sub2; -std::cout << comm.get_rank() << ":sub took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << ")" << std::endl; +std::cout << "sub (fill) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << ")" << std::endl; #if 0 for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; @@ -693,7 +692,7 @@ std::cout << comm.get_rank() << ":sub took (" << 
subdur0.count() << "," << subdu RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t2 = std::chrono::steady_clock::now(); std::chrono::duration dur0 = t1 - t0; std::chrono::duration dur1 = t2 - t1; -std::cout << comm.get_rank() << ":fill_edge_minor took (" << dur0.count() << "," << dur1.count() << ")" << std::endl; +std::cout << "fill_edge_minor took (" << dur0.count() << "," << dur1.count() << ")" << std::endl; } else { assert(graph_view.local_vertex_partition_range_size() == graph_view.local_edge_partition_src_range_size()); diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 1539abcf3c9..9ea5290c08d 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -452,9 +452,8 @@ create_graph_from_partitioned_edgelist( // 1. renumber #if 1 - auto const comm_rank = handle.get_comms().get_rank(); RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_partitioned 0" << std::endl; + std::cout << "create_graph_from_partitioned 0" << std::endl; #endif std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); @@ -485,7 +484,7 @@ create_graph_from_partitioned_edgelist( // 2. sort and compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_partitioned 1" << std::endl; + std::cout << "create_graph_from_partitioned 1" << std::endl; #endif auto total_global_mem = handle.get_device_properties().totalGlobalMem; @@ -663,7 +662,7 @@ create_graph_from_partitioned_edgelist( mem_frugal_threshold, handle.get_stream()); } else { -RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank << ":create_graph_from_partitioned 2-1 i=" << i << std::endl; +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "create_graph_from_partitioned 2-1 i=" << i << std::endl; std::forward_as_tuple(offsets, indices, dcs_nzd_vertices) = detail::sort_and_compress_edgelist( std::move(edge_partition_edgelist_srcs[i]), @@ -675,7 +674,7 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank << ":create_graph minor_range_last, mem_frugal_threshold, handle.get_stream()); -RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank << ":create_graph_from_partitioned 2-2 i=" << i << std::endl; +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "create_graph_from_partitioned 2-2 i=" << i << std::endl; } } } @@ -695,7 +694,7 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank << ":create_graph // 3. segmented sort neighbors #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_partitioned 3" << std::endl; + std::cout << "create_graph_from_partitioned 3" << std::endl; #endif for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { @@ -785,7 +784,7 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << comm_rank << ":create_graph // 4. create a graph and an edge_property_t object. 
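  // [editorial sketch, not part of the original patch] The "#if 1" blocks below and throughout
  // this series are temporary progress markers: cudaDeviceSynchronize() forces all previously
  // issued GPU work to finish before the checkpoint is printed, so the messages reflect real
  // execution order and can be paired with memory-usage probes. A small helper capturing the
  // same pattern (name hypothetical):
  //
  //   void log_checkpoint(char const* msg)
  //   {
  //     RAFT_CUDA_TRY(cudaDeviceSynchronize());
  //     std::cout << msg << std::endl;
  //   }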
#if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_partitioned 4" << std::endl; + std::cout << "create_graph_from_partitioned 4" << std::endl; #endif std::optional, weight_t>> @@ -1068,9 +1067,8 @@ create_graph_from_edgelist_impl( bool do_expensive_check) { #if 1 - auto const comm_rank = handle.get_comms().get_rank(); RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 0" << std::endl; + std::cout << "create_graph_from_edgelist_impl 0" << std::endl; #endif auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_size = major_comm.get_size(); @@ -1217,8 +1215,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank - << ":create_graph_from_edgelist_impl 1 compressed_v_size=" << compressed_v_size + std::cout << "create_graph_from_edgelist_impl 1 compressed_v_size=" << compressed_v_size << std::endl; #endif @@ -1264,7 +1261,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 2" << std::endl; + std::cout << "create_graph_from_edgelist_impl 2" << std::endl; #endif // 3. compress edge chunk source/destination vertices to cut intermediate peak memory requirement @@ -1317,7 +1314,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 3" << std::endl; + std::cout << "create_graph_from_edgelist_impl 3" << std::endl; #endif // 4. compute additional copy_offset vectors @@ -1356,7 +1353,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 4" << std::endl; + std::cout << "create_graph_from_edgelist_impl 4" << std::endl; #endif // 5. split the grouped edge chunks to local partitions @@ -1387,7 +1384,7 @@ create_graph_from_edgelist_impl( compressed_v_size); #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 4-1" << std::endl; + std::cout << "create_graph_from_edgelist_impl 4-1" << std::endl; #endif edge_partition_edgelist_compressed_dsts = @@ -1421,7 +1418,7 @@ create_graph_from_edgelist_impl( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 5" << std::endl; + std::cout << "create_graph_from_edgelist_impl 5" << std::endl; #endif if (edgelist_weights) { @@ -1456,7 +1453,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 6" << std::endl; + std::cout << "create_graph_from_edgelist_impl 6" << std::endl; #endif // 6. decompress edge chunk source/destination vertices to cut intermediate peak memory @@ -1579,7 +1576,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":create_graph_from_edgelist_impl 7" << std::endl; + std::cout << "create_graph_from_edgelist_impl 7" << std::endl; #endif return create_graph_from_partitioned_edgelist, std::vector, vertex_t> compu // construct local_vertices) #if 1 - auto comm_rank = multi_gpu ? 
handle.get_comms().get_rank() : int{0}; RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":compute_renumber_map 0" << std::endl; + std::cout << "compute_renumber_map 0" << std::endl; #endif rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); if (!local_vertices) { @@ -325,7 +324,7 @@ std::tuple, std::vector, vertex_t> compu for (size_t i = 0; i < num_bins; ++i) { #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":compute_renumber_map 0-1 i=" << i << std::endl; + std::cout << "compute_renumber_map 0-1 i=" << i << std::endl; #endif std::vector> edge_partition_tmp_majors{}; // for bin "i" { @@ -363,7 +362,7 @@ std::tuple, std::vector, vertex_t> compu #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":compute_renumber_map 0-2 i=" << i << std::endl; + std::cout << "compute_renumber_map 0-2 i=" << i << std::endl; #endif rmm::device_uvector tmp_minors(0, handle.get_stream()); { @@ -422,7 +421,7 @@ std::tuple, std::vector, vertex_t> compu #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":compute_renumber_map 0-3 i=" << i << std::endl; + std::cout << "compute_renumber_map 0-3 i=" << i << std::endl; #endif // FIXME: this can be a memory footprint bottleneck if major_comm_size is large. rmm::device_uvector tmp_vertices(0, handle.get_stream()); @@ -530,7 +529,7 @@ std::tuple, std::vector, vertex_t> compu #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":compute_renumber_map 0-4 i=" << i << std::endl; + std::cout << "compute_renumber_map 0-4 i=" << i << std::endl; #endif if (sorted_local_vertices.size() == 0) { sorted_local_vertices = std::move(tmp_vertices); @@ -554,7 +553,7 @@ std::tuple, std::vector, vertex_t> compu } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":compute_renumber_map 1" << std::endl; + std::cout << "compute_renumber_map 1" << std::endl; #endif // 2. find an unused vertex ID @@ -568,7 +567,7 @@ std::tuple, std::vector, vertex_t> compu "vertex_t, increase vertex_t to 64 bit."); #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":compute_renumber_map 2" << std::endl; + std::cout << "compute_renumber_map 2" << std::endl; #endif // 3. compute global degrees for the sorted local vertices @@ -654,7 +653,7 @@ std::tuple, std::vector, vertex_t> compu } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":compute_renumber_map 3" << std::endl; + std::cout << "compute_renumber_map 3" << std::endl; #endif // 5. sort local vertices by degree (descending) @@ -725,7 +724,7 @@ std::tuple, std::vector, vertex_t> compu handle.sync_stream(); #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":compute_renumber_map 4" << std::endl; + std::cout << "compute_renumber_map 4" << std::endl; #endif return std::make_tuple( @@ -1021,7 +1020,7 @@ renumber_edgelist( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":renumber_edgelist 0" << std::endl; + std::cout << "renumber_edgelist 0" << std::endl; #endif auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] = detail::compute_renumber_map(handle, @@ -1031,7 +1030,7 @@ renumber_edgelist( edgelist_edge_counts); #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":renumber_edgelist 1" << std::endl; + std::cout << "renumber_edgelist 1" << std::endl; #endif // 2. 
initialize partition_t object, number_of_vertices, and number_of_edges @@ -1070,7 +1069,7 @@ renumber_edgelist( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":renumber_edgelist 2" << std::endl; + std::cout << "renumber_edgelist 2" << std::endl; #endif { vertex_t max_edge_partition_major_range_size{0}; @@ -1106,8 +1105,7 @@ renumber_edgelist( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank - << ":renumber_edgelist 3 partition.local_edge_partition_minor_range_size()=" + std::cout << "renumber_edgelist 3 partition.local_edge_partition_minor_range_size()=" << partition.local_edge_partition_minor_range_size() << " number_of_edges=" << number_of_edges << " comm_size=" << comm_size << " edgelist_intra_partition_segment_offsets.has_value()=" @@ -1124,7 +1122,7 @@ renumber_edgelist( static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) && edgelist_intra_partition_segment_offsets) { #if 1 - std::cout << comm_rank << "path A" << std::endl; + std::cout << "path A" << std::endl; #endif vertex_t max_segment_size{0}; for (int i = 0; i < major_comm_size; ++i) { @@ -1166,7 +1164,7 @@ renumber_edgelist( } } else { #if 1 - std::cout << comm_rank << ":path B" << std::endl; + std::cout << "path B" << std::endl; #endif rmm::device_uvector renumber_map_minor_labels( partition.local_edge_partition_minor_range_size(), handle.get_stream()); @@ -1179,15 +1177,32 @@ renumber_edgelist( } std::vector displacements(recvcounts.size(), 0); std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); +{ +RAFT_CUDA_TRY(cudaDeviceSynchronize()); +size_t free{}; +size_t total{}; +RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); +auto f_sz = static_cast(free) / (1024.0 * 1024.0 * 1024.0); +auto t_sz = static_cast(total) / (1024.0 * 1024.0 * 1024.0); +auto u_sz = t_sz - f_sz; +std::cout << "BEFORE device_allgatherv free=" << f_sz << "GB used=" << u_sz << "GB total=" << t_sz << std::endl; +} device_allgatherv(major_comm, renumber_map_labels.data(), renumber_map_minor_labels.data(), recvcounts, displacements, handle.get_stream()); - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":" - << " device_allgatherv finished." << std::endl; +{ +RAFT_CUDA_TRY(cudaDeviceSynchronize()); +size_t free{}; +size_t total{}; +RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); +auto f_sz = static_cast(free) / (1024.0 * 1024.0 * 1024.0); +auto t_sz = static_cast(total) / (1024.0 * 1024.0 * 1024.0); +auto u_sz = t_sz - f_sz; +std::cout << "AFTER device_allgatherv free=" << f_sz << "GB used=" << u_sz << "GB total=" << t_sz << std::endl; +} kv_store_t renumber_map( renumber_map_minor_labels.begin(), @@ -1207,7 +1222,7 @@ renumber_edgelist( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << comm_rank << ":renumber_edgelist 4" << std::endl; + std::cout << "renumber_edgelist 4" << std::endl; #endif auto edge_partition_segment_offsets = detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index e365b340235..bfad6b63ffd 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -373,12 +373,12 @@ void bfs(raft::handle_t const& handle, // 4. 
BFS iteration vertex_t depth{0}; - bool top_down = true; + bool topdown = true; auto cur_aggregate_frontier_size = static_cast(vertex_frontier.bucket(bucket_idx_cur).aggregate_size()); while (true) { vertex_t next_aggregate_frontier_size{}; - if (top_down) { + if (topdown) { #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown0 = std::chrono::steady_clock::now(); @@ -575,7 +575,7 @@ void bfs(raft::handle_t const& handle, << " next_aggregate_frontier_size=" << next_aggregate_frontier_size << std::endl; if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && (next_aggregate_frontier_size >= cur_aggregate_frontier_size)) { - top_down = false; + topdown = false; } } #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -583,7 +583,7 @@ void bfs(raft::handle_t const& handle, auto topdown5 = std::chrono::steady_clock::now(); #endif - if (top_down) { // staying in top-down + if (topdown) { // staying in top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t(handle); vertex_frontier.swap_buckets(bucket_idx_cur, bucket_idx_next); @@ -608,6 +608,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur = topdown6 - topdown0; std::cout << comm_rank << ":depth=" << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size + << " next topdown=" << topdown << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() << "," << dur5.count() << ") s." << std::endl; @@ -732,14 +733,14 @@ void bfs(raft::handle_t const& handle, if ((next_aggregate_frontier_size * direction_optimizing_beta < aggregate_nzd_unvisited_vertices) && (next_aggregate_frontier_size < cur_aggregate_frontier_size)) { - top_down = true; + topdown = true; } #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto bottomup4 = std::chrono::steady_clock::now(); #endif - if (top_down) { // swithcing to top-down + if (topdown) { // swithcing to top-down vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); From 07749f4eaa87e28fb979b21bba06e9c1423b2af1 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Sep 2024 14:16:35 -0700 Subject: [PATCH 082/126] peak memory usage --- .../cugraph/utilities/shuffle_comm.cuh | 151 ++++++-- .../create_graph_from_edgelist_impl.cuh | 189 ++++------ cpp/src/structure/renumber_edgelist_impl.cuh | 337 +++++++++--------- 3 files changed, 370 insertions(+), 307 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 3cbd35b4bc3..303bb5694cf 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -41,6 +41,7 @@ #include #include #include +#include #include #include @@ -76,6 +77,7 @@ inline std::tuple, std::vector> compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, rmm::device_uvector const& d_tx_value_counts, + bool drop_empty_ranks, rmm::cuda_stream_view stream_view) { auto const comm_size = comm.get_size(); @@ -111,28 +113,30 @@ compute_tx_rx_counts_offsets_ranks(raft::comms::comms_t const& comm, std::partial_sum(tx_counts.begin(), tx_counts.end() - 1, tx_offsets.begin() + 1); std::partial_sum(rx_counts.begin(), rx_counts.end() - 1, rx_offsets.begin() + 1); - int num_tx_dst_ranks{0}; - int num_rx_src_ranks{0}; - for (int i = 0; i < comm_size; ++i) { - if (tx_counts[i] 
!= 0) { - tx_counts[num_tx_dst_ranks] = tx_counts[i]; - tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; - tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; - ++num_tx_dst_ranks; - } - if (rx_counts[i] != 0) { - rx_counts[num_rx_src_ranks] = rx_counts[i]; - rx_offsets[num_rx_src_ranks] = rx_offsets[i]; - rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; - ++num_rx_src_ranks; + if (drop_empty_ranks) { + int num_tx_dst_ranks{0}; + int num_rx_src_ranks{0}; + for (int i = 0; i < comm_size; ++i) { + if (tx_counts[i] != 0) { + tx_counts[num_tx_dst_ranks] = tx_counts[i]; + tx_offsets[num_tx_dst_ranks] = tx_offsets[i]; + tx_dst_ranks[num_tx_dst_ranks] = tx_dst_ranks[i]; + ++num_tx_dst_ranks; + } + if (rx_counts[i] != 0) { + rx_counts[num_rx_src_ranks] = rx_counts[i]; + rx_offsets[num_rx_src_ranks] = rx_offsets[i]; + rx_src_ranks[num_rx_src_ranks] = rx_src_ranks[i]; + ++num_rx_src_ranks; + } } + tx_counts.resize(num_tx_dst_ranks); + tx_offsets.resize(num_tx_dst_ranks); + tx_dst_ranks.resize(num_tx_dst_ranks); + rx_counts.resize(num_rx_src_ranks); + rx_offsets.resize(num_rx_src_ranks); + rx_src_ranks.resize(num_rx_src_ranks); } - tx_counts.resize(num_tx_dst_ranks); - tx_offsets.resize(num_tx_dst_ranks); - tx_dst_ranks.resize(num_tx_dst_ranks); - rx_counts.resize(num_rx_src_ranks); - rx_offsets.resize(num_rx_src_ranks); - rx_src_ranks.resize(num_rx_src_ranks); return std::make_tuple(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks); } @@ -823,6 +827,8 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector const& tx_value_counts, rmm::cuda_stream_view stream_view) { + using value_t = typename thrust::iterator_traits::value_type; + auto const comm_size = comm.get_size(); rmm::device_uvector d_tx_value_counts(comm_size, stream_view); @@ -836,11 +842,10 @@ auto shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); - auto rx_value_buffer = - allocate_dataframe_buffer::value_type>( - rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); + auto rx_value_buffer = allocate_dataframe_buffer( + rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); // (if num_tx_dst_ranks == num_rx_src_ranks == comm_size). 
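// Why the new drop_empty_ranks flag matters (a sketch using the surrounding variable names;
// not part of the patch): with drop_empty_ranks == true the six returned vectors are compacted
// to the non-empty peers, which is what device_multicast_sendrecv() expects, while with
// drop_empty_ranks == false they keep exactly comm_size entries so tx_counts / rx_counts can be
// indexed directly by rank, which the ring-style exchange added below relies on.
std::vector<size_t> tx_counts{}, tx_offsets{}, rx_counts{}, rx_offsets{};
std::tie(tx_counts, tx_offsets, std::ignore, rx_counts, rx_offsets, std::ignore) =
  detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, false, stream_view);
// tx_counts.size() == comm.get_size(); tx_counts[dst] is the number of values destined for
// rank dst even when that count is zero.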
device_multicast_sendrecv(comm, @@ -866,6 +871,100 @@ auto shuffle_values(raft::comms::comms_t const& comm, return std::make_tuple(std::move(rx_value_buffer), rx_counts); } +// this uses less memory than calling shuffle_values then sort & unique but requires comm.get_size() +// - 1 communication steps +template +auto shuffle_and_unique_segment_sorted_values( + raft::comms::comms_t const& comm, + TxValueIterator + segment_sorted_tx_value_first, // sorted within each segment (segment sizes: + // tx_value_counts[i], where i = [0, comm_size); and bettter be + // unique to reduce communication volume + std::vector const& tx_value_counts, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + + auto sorted_unique_values = allocate_dataframe_buffer(0, stream_view); + if (comm_size == 1) { + resize_dataframe_buffer(sorted_unique_values, tx_value_counts[comm_rank], stream_view); + thrust::copy(rmm::exec_policy_nosync(stream_view), + segment_sorted_tx_value_first, + segment_sorted_tx_value_first + tx_value_counts[comm_rank], + get_dataframe_buffer_begin(sorted_unique_values)); + resize_dataframe_buffer( + sorted_unique_values, + thrust::distance(get_dataframe_buffer_begin(sorted_unique_values), + thrust::unique(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(sorted_unique_values), + get_dataframe_buffer_end(sorted_unique_values))), + stream_view); + } else { + rmm::device_uvector d_tx_value_counts(comm_size, stream_view); + raft::update_device( + d_tx_value_counts.data(), tx_value_counts.data(), comm_size, stream_view.value()); + + std::vector tx_counts{}; + std::vector tx_offsets{}; + std::vector rx_counts{}; + std::vector rx_offsets{}; + std::tie(tx_counts, tx_offsets, std::ignore, rx_counts, rx_offsets, std::ignore) = + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, false, stream_view); + + d_tx_value_counts.resize(0, stream_view); + d_tx_value_counts.shrink_to_fit(stream_view); + + for (int i = 1; i < comm_size; ++i) { + auto dst = (comm_rank + i) % comm_size; + auto src = + static_cast((static_cast(comm_rank) + static_cast(comm_size - i)) % + static_cast(comm_size)); + auto rx_sorted_values = allocate_dataframe_buffer(rx_counts[src], stream_view); + device_sendrecv(comm, + segment_sorted_tx_value_first + tx_offsets[dst], + tx_counts[dst], + dst, + get_dataframe_buffer_begin(rx_sorted_values), + rx_counts[src], + src, + stream_view); + auto merged_sorted_values = allocate_dataframe_buffer( + (i == 1 ? 
tx_counts[comm_rank] : size_dataframe_buffer(sorted_unique_values)) + + rx_counts[src], + stream_view); + if (i == 1) { + thrust::merge( + rmm::exec_policy_nosync(stream_view), + segment_sorted_tx_value_first + tx_offsets[comm_rank], + segment_sorted_tx_value_first + (tx_offsets[comm_rank] + tx_counts[comm_rank]), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } else { + thrust::merge(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(sorted_unique_values), + get_dataframe_buffer_end(sorted_unique_values), + get_dataframe_buffer_begin(rx_sorted_values), + get_dataframe_buffer_end(rx_sorted_values), + get_dataframe_buffer_begin(merged_sorted_values)); + } + resize_dataframe_buffer( + merged_sorted_values, + thrust::distance(get_dataframe_buffer_begin(merged_sorted_values), + thrust::unique(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(merged_sorted_values), + get_dataframe_buffer_end(merged_sorted_values))), + stream_view); + sorted_unique_values = std::move(merged_sorted_values); + } + } + shrink_to_fit_dataframe_buffer(sorted_unique_values, stream_view); + return sorted_unique_values; +} + template auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, ValueIterator tx_value_first /* [INOUT */, @@ -889,7 +988,7 @@ auto groupby_gpu_id_and_shuffle_values(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); auto rx_value_buffer = allocate_dataframe_buffer::value_type>( @@ -943,7 +1042,7 @@ auto groupby_gpu_id_and_shuffle_kv_pairs(raft::comms::comms_t const& comm, std::vector rx_offsets{}; std::vector rx_src_ranks{}; std::tie(tx_counts, tx_offsets, tx_dst_ranks, rx_counts, rx_offsets, rx_src_ranks) = - detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, stream_view); + detail::compute_tx_rx_counts_offsets_ranks(comm, d_tx_value_counts, true, stream_view); rmm::device_uvector::value_type> rx_keys( rx_offsets.size() > 0 ? rx_offsets.back() + rx_counts.back() : size_t{0}, stream_view); diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 9ea5290c08d..196ce2ac0d6 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -453,7 +453,7 @@ create_graph_from_partitioned_edgelist( // 1. renumber #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_partitioned 0" << std::endl; + std::cerr << "create_graph_from_partitioned 0" << std::endl; #endif std::vector edgelist_edge_counts(minor_comm_size, edge_t{0}); @@ -484,7 +484,7 @@ create_graph_from_partitioned_edgelist( // 2. 
sort and compress edge list (COO) to CSR (or CSC) or CSR + DCSR (CSC + DCSC) hybrid #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_partitioned 1" << std::endl; + std::cerr << "create_graph_from_partitioned 1" << std::endl; #endif auto total_global_mem = handle.get_device_properties().totalGlobalMem; @@ -493,7 +493,8 @@ create_graph_from_partitioned_edgelist( if (edge_partition_edgelist_edge_ids) { element_size += sizeof(edge_id_t); } if (edge_partition_edgelist_edge_types) { element_size += sizeof(edge_type_t); } auto constexpr mem_frugal_ratio = - 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the total_global_mem, switch to the memory frugal approach + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the + // total_global_mem, switch to the memory frugal approach auto mem_frugal_threshold = static_cast(static_cast(total_global_mem / element_size) * mem_frugal_ratio); @@ -662,7 +663,8 @@ create_graph_from_partitioned_edgelist( mem_frugal_threshold, handle.get_stream()); } else { -RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "create_graph_from_partitioned 2-1 i=" << i << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_partitioned 2-1 i=" << i << std::endl; std::forward_as_tuple(offsets, indices, dcs_nzd_vertices) = detail::sort_and_compress_edgelist( std::move(edge_partition_edgelist_srcs[i]), @@ -674,7 +676,8 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "create_graph_from_partitio minor_range_last, mem_frugal_threshold, handle.get_stream()); -RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "create_graph_from_partitioned 2-2 i=" << i << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_partitioned 2-2 i=" << i << std::endl; } } } @@ -694,7 +697,7 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "create_graph_from_partitio // 3. segmented sort neighbors #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_partitioned 3" << std::endl; + std::cerr << "create_graph_from_partitioned 3" << std::endl; #endif for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { @@ -784,7 +787,7 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cout << "create_graph_from_partitio // 4. create a graph and an edge_property_t object. 
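// How the memory-frugal cutover above is derived (a sketch of the arithmetic only; the helper
// below is illustrative and not part of the patch): element_size is the number of bytes stored
// per edge (source + destination, plus weight / edge id / edge type when present), and the
// sort-and-compress step falls back to the slower, lower-footprint path once the expected
// temporary buffers would exceed mem_frugal_ratio (5%) of device memory.
inline size_t compute_mem_frugal_threshold(size_t total_global_mem,
                                           size_t element_size,
                                           double mem_frugal_ratio = 0.05)
{
  return static_cast<size_t>(static_cast<double>(total_global_mem / element_size) *
                             mem_frugal_ratio);
}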
#if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_partitioned 4" << std::endl; + std::cerr << "create_graph_from_partitioned 4" << std::endl; #endif std::optional, weight_t>> @@ -1068,7 +1071,7 @@ create_graph_from_edgelist_impl( { #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_edgelist_impl 0" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 0" << std::endl; #endif auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_size = major_comm.get_size(); @@ -1181,7 +1184,7 @@ create_graph_from_edgelist_impl( } bool compress{false}; if (static_cast(num_edges) * element_size > - static_cast(total_global_mem * 0.5 /* tuning parameter */)) { + static_cast(total_global_mem * 0.65 /* tuning parameter */)) { compress = true; } @@ -1215,7 +1218,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_edgelist_impl 1 compressed_v_size=" << compressed_v_size + std::cerr << "create_graph_from_edgelist_impl 1 compressed_v_size=" << compressed_v_size << std::endl; #endif @@ -1261,7 +1264,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_edgelist_impl 2" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 2" << std::endl; #endif // 3. compress edge chunk source/destination vertices to cut intermediate peak memory requirement @@ -1314,7 +1317,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_edgelist_impl 3" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 3" << std::endl; #endif // 4. compute additional copy_offset vectors @@ -1353,7 +1356,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_edgelist_impl 4" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 4" << std::endl; #endif // 5. split the grouped edge chunks to local partitions @@ -1383,8 +1386,8 @@ create_graph_from_edgelist_impl( edge_partition_intra_segment_copy_output_displacement_vectors, compressed_v_size); #if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_edgelist_impl 4-1" << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "create_graph_from_edgelist_impl 4-1" << std::endl; #endif edge_partition_edgelist_compressed_dsts = @@ -1418,7 +1421,7 @@ create_graph_from_edgelist_impl( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_edgelist_impl 5" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 5" << std::endl; #endif if (edgelist_weights) { @@ -1453,7 +1456,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_edgelist_impl 6" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 6" << std::endl; #endif // 6. 
decompress edge chunk source/destination vertices to cut intermediate peak memory @@ -1462,121 +1465,65 @@ create_graph_from_edgelist_impl( if (compressed_v_size < sizeof(vertex_t)) { assert(edge_partition_edgelist_compressed_srcs); assert(edge_partition_edgelist_compressed_dsts); + + std::vector> h_edge_partition_edgelist_compressed_srcs(minor_comm_size); + std::vector> h_edge_partition_edgelist_compressed_dsts(minor_comm_size); + for (size_t i = 0; i < static_cast(minor_comm_size); ++i) { + h_edge_partition_edgelist_compressed_srcs[i].resize(edge_partition_edge_counts[i] * + compressed_v_size); + raft::update_host(h_edge_partition_edgelist_compressed_srcs[i].data(), + (*edge_partition_edgelist_compressed_srcs)[i].data(), + (*edge_partition_edgelist_compressed_srcs)[i].size(), + handle.get_stream()); + + h_edge_partition_edgelist_compressed_dsts[i].resize(edge_partition_edge_counts[i] * + compressed_v_size); + raft::update_host(h_edge_partition_edgelist_compressed_dsts[i].data(), + (*edge_partition_edgelist_compressed_dsts)[i].data(), + (*edge_partition_edgelist_compressed_dsts)[i].size(), + handle.get_stream()); + } + (*edge_partition_edgelist_compressed_srcs).clear(); + (*edge_partition_edgelist_compressed_dsts).clear(); + edge_partition_edgelist_srcs.reserve(minor_comm_size); edge_partition_edgelist_dsts.reserve(minor_comm_size); for (int i = 0; i < minor_comm_size; ++i) { - auto tmp_srcs = - try_allocate_dataframe_buffer(edge_partition_edge_counts[i], handle.get_stream()); - if (tmp_srcs) { - decompress_vertices( - handle, - raft::device_span((*edge_partition_edgelist_compressed_srcs)[i].data(), - (*edge_partition_edgelist_compressed_srcs)[i].size()), - raft::device_span((*tmp_srcs).data(), (*tmp_srcs).size()), - compressed_v_size); - // defer freeing (*edge_partition_edgelist_compressed_srcs)[i] to reduce memory - // fragmentation (pool allocator) - edge_partition_edgelist_srcs.push_back(std::move(*tmp_srcs)); - } else { - break; - } - - auto tmp_dsts = - try_allocate_dataframe_buffer(edge_partition_edge_counts[i], handle.get_stream()); - if (tmp_dsts) { - decompress_vertices( - handle, - raft::device_span((*edge_partition_edgelist_compressed_dsts)[i].data(), - (*edge_partition_edgelist_compressed_dsts)[i].size()), - raft::device_span((*tmp_dsts).data(), (*tmp_dsts).size()), - compressed_v_size); - // defer freeing (*edge_partition_edgelist_compressed_dsts)[i] to reduce memory - // fragmentation (pool allocator) - edge_partition_edgelist_dsts.push_back(std::move(*tmp_dsts)); - } else { - break; - } + edge_partition_edgelist_srcs.push_back( + rmm::device_uvector(edge_partition_edge_counts[i], handle.get_stream())); + edge_partition_edgelist_dsts.push_back( + rmm::device_uvector(edge_partition_edge_counts[i], handle.get_stream())); } + for (int i = 0; i < minor_comm_size; ++i) { + rmm::device_uvector tmp_bytes(edge_partition_edge_counts[i] * compressed_v_size, + handle.get_stream()); - auto num_src_allocs = edge_partition_edgelist_srcs.size(); - auto num_dst_allocs = edge_partition_edgelist_dsts.size(); - if ((num_src_allocs < static_cast(minor_comm_size)) || - (num_dst_allocs < static_cast(minor_comm_size))) { - std::vector> h_edge_partition_edgelist_compressed_srcs( - static_cast(minor_comm_size) - num_src_allocs); - std::vector> h_edge_partition_edgelist_compressed_dsts( - static_cast(minor_comm_size) - num_dst_allocs); - for (size_t i = 0; i < static_cast(minor_comm_size) - num_src_allocs; ++i) { - h_edge_partition_edgelist_compressed_srcs[i].resize( - 
edge_partition_edge_counts[num_src_allocs + i] * compressed_v_size); - raft::update_host(h_edge_partition_edgelist_compressed_srcs[i].data(), - (*edge_partition_edgelist_compressed_srcs)[num_src_allocs + i].data(), - (*edge_partition_edgelist_compressed_srcs)[num_src_allocs + i].size(), + raft::update_device(tmp_bytes.data(), + h_edge_partition_edgelist_compressed_srcs[i].data(), + h_edge_partition_edgelist_compressed_srcs[i].size(), handle.get_stream()); - } - for (size_t i = 0; i < static_cast(minor_comm_size) - num_dst_allocs; ++i) { - h_edge_partition_edgelist_compressed_dsts[i].resize( - edge_partition_edge_counts[num_dst_allocs + i] * compressed_v_size); - raft::update_host(h_edge_partition_edgelist_compressed_dsts[i].data(), - (*edge_partition_edgelist_compressed_dsts)[num_dst_allocs + i].data(), - (*edge_partition_edgelist_compressed_dsts)[num_dst_allocs + i].size(), + decompress_vertices(handle, + raft::device_span(tmp_bytes.data(), tmp_bytes.size()), + raft::device_span(edge_partition_edgelist_srcs[i].data(), + edge_partition_edgelist_srcs[i].size()), + compressed_v_size); + + raft::update_device(tmp_bytes.data(), + h_edge_partition_edgelist_compressed_dsts[i].data(), + h_edge_partition_edgelist_compressed_dsts[i].size(), handle.get_stream()); - } - (*edge_partition_edgelist_compressed_srcs).clear(); - (*edge_partition_edgelist_compressed_dsts).clear(); - for (size_t i = 0; - i < static_cast(minor_comm_size) - std::min(num_src_allocs, num_dst_allocs); - ++i) { - if (i < static_cast(minor_comm_size) - num_src_allocs) { - edge_partition_edgelist_srcs.push_back(rmm::device_uvector( - edge_partition_edge_counts[num_src_allocs + i], handle.get_stream())); - } - if (i < static_cast(minor_comm_size) - num_dst_allocs) { - edge_partition_edgelist_dsts.push_back(rmm::device_uvector( - edge_partition_edge_counts[num_dst_allocs + i], handle.get_stream())); - } - } - for (size_t i = 0; - i < static_cast(minor_comm_size) - std::min(num_src_allocs, num_dst_allocs); - ++i) { - if (i < static_cast(minor_comm_size) - num_src_allocs) { - rmm::device_uvector tmp_bytes( - edge_partition_edge_counts[num_src_allocs + i] * compressed_v_size, - handle.get_stream()); - raft::update_device(tmp_bytes.data(), - h_edge_partition_edgelist_compressed_srcs[i].data(), - edge_partition_edge_counts[num_src_allocs + i] * compressed_v_size, - handle.get_stream()); - decompress_vertices( - handle, - raft::device_span(tmp_bytes.data(), tmp_bytes.size()), - raft::device_span(edge_partition_edgelist_srcs[num_src_allocs + i].data(), - edge_partition_edgelist_srcs[num_src_allocs + i].size()), - compressed_v_size); - } - if (i < static_cast(minor_comm_size) - num_dst_allocs) { - rmm::device_uvector tmp_bytes( - edge_partition_edge_counts[num_dst_allocs + i] * compressed_v_size, - handle.get_stream()); - raft::update_device(tmp_bytes.data(), - h_edge_partition_edgelist_compressed_dsts[i].data(), - edge_partition_edge_counts[num_dst_allocs + i] * compressed_v_size, - handle.get_stream()); - decompress_vertices( - handle, - raft::device_span(tmp_bytes.data(), tmp_bytes.size()), - raft::device_span(edge_partition_edgelist_dsts[num_dst_allocs + i].data(), - edge_partition_edgelist_dsts[num_dst_allocs + i].size()), - compressed_v_size); - } - } - - handle.sync_stream(); + decompress_vertices(handle, + raft::device_span(tmp_bytes.data(), tmp_bytes.size()), + raft::device_span(edge_partition_edgelist_dsts[i].data(), + edge_partition_edgelist_dsts[i].size()), + compressed_v_size); } + + handle.sync_stream(); } #if 1 
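// The rework above trades extra host<->device copies for a lower device-memory peak: every
// compressed byte array is staged in host memory first, the device copies are released, and
// each partition is then copied back and decompressed one at a time, so only one partition's
// compressed bytes coexist with the fully decompressed vertex arrays. A condensed sketch of the
// per-partition round trip (helper name and element types are assumptions; the surrounding code
// calls decompress_vertices() with the same spans):
template <typename vertex_t>
void decompress_partition_via_host(raft::handle_t const& handle,
                                   std::vector<uint8_t> const& h_compressed_bytes,
                                   rmm::device_uvector<vertex_t>& output_vertices,
                                   size_t compressed_v_size)
{
  rmm::device_uvector<uint8_t> tmp_bytes(h_compressed_bytes.size(), handle.get_stream());
  raft::update_device(
    tmp_bytes.data(), h_compressed_bytes.data(), h_compressed_bytes.size(), handle.get_stream());
  decompress_vertices(handle,
                      raft::device_span<uint8_t const>(tmp_bytes.data(), tmp_bytes.size()),
                      raft::device_span<vertex_t>(output_vertices.data(), output_vertices.size()),
                      compressed_v_size);
}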
RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "create_graph_from_edgelist_impl 7" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 7" << std::endl; #endif return create_graph_from_partitioned_edgelist, std::vector, vertex_t> compu #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "compute_renumber_map 0" << std::endl; + std::cerr << "compute_renumber_map 0" << std::endl; #endif rmm::device_uvector sorted_local_vertices(0, handle.get_stream()); if (!local_vertices) { @@ -323,11 +323,12 @@ std::tuple, std::vector, vertex_t> compu for (size_t i = 0; i < num_bins; ++i) { #if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "compute_renumber_map 0-1 i=" << i << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "compute_renumber_map 0-1 i=" << i << std::endl; #endif - std::vector> edge_partition_tmp_majors{}; // for bin "i" + rmm::device_uvector this_bin_sorted_unique_majors(0, handle.get_stream()); { + std::vector> edge_partition_tmp_majors{}; // for bin "i" edge_partition_tmp_majors.reserve(edgelist_majors.size()); for (size_t j = 0; j < edgelist_majors.size(); ++j) { rmm::device_uvector tmp_majors(0, handle.get_stream()); @@ -358,13 +359,53 @@ std::tuple, std::vector, vertex_t> compu edge_partition_tmp_majors.push_back(std::move(tmp_majors)); } + if constexpr (multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + std::vector tx_counts(minor_comm_size); + for (int j = 0; j < minor_comm_size; ++j) { + tx_counts[j] = edge_partition_tmp_majors[j].size(); + } + this_bin_sorted_unique_majors.resize(std::reduce(tx_counts.begin(), tx_counts.end()), + handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_majors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_majors[j].begin(), + edge_partition_tmp_majors[j].end(), + this_bin_sorted_unique_majors.begin() + output_offset); + output_offset += edge_partition_tmp_majors[j].size(); + } +#if 0 + std::tie(this_bin_sorted_unique_majors, std::ignore) = shuffle_values( + minor_comm, this_bin_sorted_unique_majors.begin(), tx_counts, handle.get_stream()); + thrust::sort( + handle.get_thrust_policy(), + this_bin_sorted_unique_majors.begin(), + this_bin_sorted_unique_majors.end()); + this_bin_sorted_unique_majors.resize(thrust::distance(this_bin_sorted_unique_majors.begin(), thrust::unique( + handle.get_thrust_policy(), + this_bin_sorted_unique_majors.begin(), + this_bin_sorted_unique_majors.end())), handle.get_stream()); + this_bin_sorted_unique_majors.shrink_to_fit(handle.get_stream()); +#else + this_bin_sorted_unique_majors = shuffle_and_unique_segment_sorted_values( + minor_comm, this_bin_sorted_unique_majors.begin(), tx_counts, handle.get_stream()); +#endif + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } + } else { + this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); + } } #if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "compute_renumber_map 0-2 i=" << i << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "compute_renumber_map 0-2 i=" << i << std::endl; #endif - rmm::device_uvector tmp_minors(0, handle.get_stream()); + rmm::device_uvector this_bin_sorted_unique_minors(0, handle.get_stream()); { std::vector> edge_partition_tmp_minors{}; // for bin "i" 
edge_partition_tmp_minors.reserve(edgelist_minors.size()); @@ -397,150 +438,124 @@ std::tuple, std::vector, vertex_t> compu edge_partition_tmp_minors.push_back(std::move(tmp_minors)); } - edge_t aggregate_size{0}; - for (size_t i = 0; i < edge_partition_tmp_minors.size(); ++i) { - aggregate_size += edge_partition_tmp_minors[i].size(); - } - tmp_minors.resize(aggregate_size, handle.get_stream()); - size_t output_offset{0}; - for (size_t i = 0; i < edge_partition_tmp_minors.size(); ++i) { - thrust::copy(handle.get_thrust_policy(), - edge_partition_tmp_minors[i].begin(), - edge_partition_tmp_minors[i].end(), - tmp_minors.begin() + output_offset); - output_offset += edge_partition_tmp_minors[i].size(); + if constexpr (multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + if (major_comm_size > 1) { + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + edge_t aggregate_size{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + aggregate_size += edge_partition_tmp_minors[j].size(); + } + this_bin_sorted_unique_minors.resize(aggregate_size, handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_minors[j].begin(), + edge_partition_tmp_minors[j].end(), + this_bin_sorted_unique_minors.begin() + output_offset); + output_offset += edge_partition_tmp_minors[j].size(); + } + thrust::sort(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end()); + this_bin_sorted_unique_minors.resize( + thrust::distance(this_bin_sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end())), + handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); +#if 0 + this_bin_sorted_unique_minors = shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(handle, std::move(this_bin_sorted_unique_minors)); + thrust::sort(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end()); + this_bin_sorted_unique_minors.resize(thrust::distance(this_bin_sorted_unique_minors.begin(), thrust::unique(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end())), handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); +#else + compute_gpu_id_from_ext_vertex_t gpu_id_func{ + comm_size, major_comm_size, minor_comm_size}; + auto d_tx_counts = groupby_and_count( + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end(), + [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { + return partition_manager::compute_major_comm_rank_from_global_comm_rank( + major_comm_size, minor_comm_size, gpu_id_func(v)); + }, + major_comm_size, + std::numeric_limits::max(), + handle.get_stream()); + std::vector h_tx_counts(d_tx_counts.size()); + raft::update_host( + h_tx_counts.data(), d_tx_counts.data(), d_tx_counts.size(), handle.get_stream()); + handle.sync_stream(); + std::vector tx_displacements(h_tx_counts.size()); + std::exclusive_scan( + h_tx_counts.begin(), h_tx_counts.end(), tx_displacements.begin(), size_t{0}); + for 
(int j = 0; j < major_comm_size; ++j) { + thrust::sort( + handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin() + tx_displacements[j], + this_bin_sorted_unique_minors.begin() + (tx_displacements[j] + h_tx_counts[j])); + } + this_bin_sorted_unique_minors = shuffle_and_unique_segment_sorted_values( + major_comm, this_bin_sorted_unique_minors.begin(), h_tx_counts, handle.get_stream()); +#endif + } else { + this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); + } + } else { + this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); } - thrust::sort(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end()); - tmp_minors.resize( - thrust::distance( - tmp_minors.begin(), - thrust::unique(handle.get_thrust_policy(), tmp_minors.begin(), tmp_minors.end())), - handle.get_stream()); - tmp_minors.shrink_to_fit(handle.get_stream()); } - #if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "compute_renumber_map 0-3 i=" << i << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "compute_renumber_map 0-3 i=" << i << std::endl; #endif - // FIXME: this can be a memory footprint bottleneck if major_comm_size is large. - rmm::device_uvector tmp_vertices(0, handle.get_stream()); - if (multi_gpu && (handle.get_comms().get_size() > 1)) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); - auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - compute_gpu_id_from_ext_vertex_t gpu_id_func{ - comm_size, major_comm_size, minor_comm_size}; - auto d_minor_counts = groupby_and_count( - tmp_minors.begin(), - tmp_minors.end(), - [major_comm_size, minor_comm_size, gpu_id_func] __device__(auto v) { - return partition_manager::compute_major_comm_rank_from_global_comm_rank( - major_comm_size, minor_comm_size, gpu_id_func(v)); - }, - major_comm_size, - std::numeric_limits::max(), - handle.get_stream()); - std::vector h_minor_counts(d_minor_counts.size()); - raft::update_host( - h_minor_counts.data(), d_minor_counts.data(), d_minor_counts.size(), handle.get_stream()); - handle.sync_stream(); - std::vector h_minor_displacements(h_minor_counts.size()); - std::exclusive_scan( - h_minor_counts.begin(), h_minor_counts.end(), h_minor_displacements.begin(), size_t{0}); - - std::vector tx_counts(comm_size, 0); - for (size_t j = 0; j < edge_partition_tmp_majors.size(); ++j) { - auto idx = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks( - major_comm_size, minor_comm_size, major_comm_rank, j); - tx_counts[idx] = edge_partition_tmp_majors[j].size(); - } - for (size_t j = 0; j < h_minor_counts.size(); ++j) { - auto idx = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks( - major_comm_size, minor_comm_size, j, minor_comm_rank); - tx_counts[idx] += h_minor_counts[j]; - } - std::vector tx_displacements(comm_size); - std::exclusive_scan( - tx_counts.begin(), tx_counts.end(), tx_displacements.begin(), size_t{0}); - tmp_vertices.resize(tx_displacements.back() + tx_counts.back(), handle.get_stream()); - for (size_t j = 0; j < edge_partition_tmp_majors.size(); ++j) { - auto idx = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks( - 
major_comm_size, minor_comm_size, major_comm_rank, j); - thrust::copy(handle.get_thrust_policy(), - edge_partition_tmp_majors[j].begin(), - edge_partition_tmp_majors[j].end(), - tmp_vertices.begin() + tx_displacements[idx]); - edge_partition_tmp_majors[j].resize(0, handle.get_stream()); - edge_partition_tmp_majors[j].shrink_to_fit(handle.get_stream()); - } - for (size_t j = 0; j < h_minor_counts.size(); ++j) { - auto idx = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks( - major_comm_size, minor_comm_size, j, minor_comm_rank); - thrust::copy( - handle.get_thrust_policy(), - tmp_minors.begin() + h_minor_displacements[j], - tmp_minors.begin() + (h_minor_displacements[j] + h_minor_counts[j]), - tmp_vertices.begin() + tx_displacements[idx] + (tx_counts[idx] - h_minor_counts[j])); - } - tmp_minors.resize(0, handle.get_stream()); - tmp_minors.shrink_to_fit(handle.get_stream()); - - // single shuffle_values() on comm instead of one shuffle_values() on minor_comm & one - // shuffle_values() on major_comm (to cut NCCL P2P buffer size) - std::tie(tmp_vertices, std::ignore) = - shuffle_values(comm, tmp_vertices.begin(), tx_counts, handle.get_stream()); - thrust::sort(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end()); - tmp_vertices.resize( - thrust::distance( - tmp_vertices.begin(), - thrust::unique(handle.get_thrust_policy(), tmp_vertices.begin(), tmp_vertices.end())), + rmm::device_uvector this_bin_sorted_unique_vertices(0, handle.get_stream()); + { + rmm::device_uvector merged_vertices( + this_bin_sorted_unique_majors.size() + this_bin_sorted_unique_minors.size(), handle.get_stream()); - tmp_vertices.shrink_to_fit(handle.get_stream()); - } else { - assert(edge_partition_sorted_unique_majors.size() == 1); - auto& tmp_majors = edge_partition_tmp_majors[0]; - rmm::device_uvector merged_vertices(tmp_majors.size() + tmp_minors.size(), - handle.get_stream()); thrust::merge(handle.get_thrust_policy(), - tmp_majors.begin(), - tmp_majors.end(), - tmp_minors.begin(), - tmp_minors.end(), + this_bin_sorted_unique_majors.begin(), + this_bin_sorted_unique_majors.end(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end(), merged_vertices.begin()); - tmp_majors.resize(0, handle.get_stream()); - tmp_majors.shrink_to_fit(handle.get_stream()); - tmp_minors.resize(0, handle.get_stream()); - tmp_minors.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_majors.resize(0, handle.get_stream()); + this_bin_sorted_unique_majors.shrink_to_fit(handle.get_stream()); + this_bin_sorted_unique_minors.resize(0, handle.get_stream()); + this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); merged_vertices.resize(thrust::distance(merged_vertices.begin(), thrust::unique(handle.get_thrust_policy(), merged_vertices.begin(), merged_vertices.end())), handle.get_stream()); merged_vertices.shrink_to_fit(handle.get_stream()); - tmp_vertices = std::move(merged_vertices); + this_bin_sorted_unique_vertices = std::move(merged_vertices); } - #if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "compute_renumber_map 0-4 i=" << i << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "compute_renumber_map 0-4 i=" << i << std::endl; #endif if (sorted_local_vertices.size() == 0) { - sorted_local_vertices = std::move(tmp_vertices); + sorted_local_vertices = std::move(this_bin_sorted_unique_vertices); } else { rmm::device_uvector merged_vertices( - sorted_local_vertices.size() + tmp_vertices.size(), handle.get_stream()); + 
sorted_local_vertices.size() + this_bin_sorted_unique_vertices.size(), + handle.get_stream()); thrust::merge(handle.get_thrust_policy(), sorted_local_vertices.begin(), sorted_local_vertices.end(), - tmp_vertices.begin(), - tmp_vertices.end(), + this_bin_sorted_unique_vertices.begin(), + this_bin_sorted_unique_vertices.end(), merged_vertices.begin()); // merging two unique sets from different hash // bins, so the merged set can't have duplicates sorted_local_vertices = std::move(merged_vertices); @@ -553,7 +568,7 @@ std::tuple, std::vector, vertex_t> compu } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "compute_renumber_map 1" << std::endl; + std::cerr << "compute_renumber_map 1" << std::endl; #endif // 2. find an unused vertex ID @@ -567,7 +582,7 @@ std::tuple, std::vector, vertex_t> compu "vertex_t, increase vertex_t to 64 bit."); #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "compute_renumber_map 2" << std::endl; + std::cerr << "compute_renumber_map 2" << std::endl; #endif // 3. compute global degrees for the sorted local vertices @@ -653,7 +668,7 @@ std::tuple, std::vector, vertex_t> compu } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "compute_renumber_map 3" << std::endl; + std::cerr << "compute_renumber_map 3" << std::endl; #endif // 5. sort local vertices by degree (descending) @@ -724,7 +739,7 @@ std::tuple, std::vector, vertex_t> compu handle.sync_stream(); #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "compute_renumber_map 4" << std::endl; + std::cerr << "compute_renumber_map 4" << std::endl; #endif return std::make_tuple( @@ -1020,7 +1035,7 @@ renumber_edgelist( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "renumber_edgelist 0" << std::endl; + std::cerr << "renumber_edgelist 0" << std::endl; #endif auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] = detail::compute_renumber_map(handle, @@ -1030,7 +1045,7 @@ renumber_edgelist( edgelist_edge_counts); #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "renumber_edgelist 1" << std::endl; + std::cerr << "renumber_edgelist 1" << std::endl; #endif // 2. 
initialize partition_t object, number_of_vertices, and number_of_edges @@ -1069,7 +1084,7 @@ renumber_edgelist( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "renumber_edgelist 2" << std::endl; + std::cerr << "renumber_edgelist 2" << std::endl; #endif { vertex_t max_edge_partition_major_range_size{0}; @@ -1105,7 +1120,7 @@ renumber_edgelist( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "renumber_edgelist 3 partition.local_edge_partition_minor_range_size()=" + std::cerr << "renumber_edgelist 3 partition.local_edge_partition_minor_range_size()=" << partition.local_edge_partition_minor_range_size() << " number_of_edges=" << number_of_edges << " comm_size=" << comm_size << " edgelist_intra_partition_segment_offsets.has_value()=" @@ -1122,7 +1137,7 @@ renumber_edgelist( static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) && edgelist_intra_partition_segment_offsets) { #if 1 - std::cout << "path A" << std::endl; + std::cerr << "path A" << std::endl; #endif vertex_t max_segment_size{0}; for (int i = 0; i < major_comm_size; ++i) { @@ -1164,7 +1179,7 @@ renumber_edgelist( } } else { #if 1 - std::cout << "path B" << std::endl; + std::cerr << "path B" << std::endl; #endif rmm::device_uvector renumber_map_minor_labels( partition.local_edge_partition_minor_range_size(), handle.get_stream()); @@ -1177,32 +1192,34 @@ renumber_edgelist( } std::vector displacements(recvcounts.size(), 0); std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); -{ -RAFT_CUDA_TRY(cudaDeviceSynchronize()); -size_t free{}; -size_t total{}; -RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); -auto f_sz = static_cast(free) / (1024.0 * 1024.0 * 1024.0); -auto t_sz = static_cast(total) / (1024.0 * 1024.0 * 1024.0); -auto u_sz = t_sz - f_sz; -std::cout << "BEFORE device_allgatherv free=" << f_sz << "GB used=" << u_sz << "GB total=" << t_sz << std::endl; -} + { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + size_t free{}; + size_t total{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); + auto f_sz = static_cast(free) / (1024.0 * 1024.0 * 1024.0); + auto t_sz = static_cast(total) / (1024.0 * 1024.0 * 1024.0); + auto u_sz = t_sz - f_sz; + std::cerr << "BEFORE device_allgatherv free=" << f_sz << "GB used=" << u_sz + << "GB total=" << t_sz << std::endl; + } device_allgatherv(major_comm, renumber_map_labels.data(), renumber_map_minor_labels.data(), recvcounts, displacements, handle.get_stream()); -{ -RAFT_CUDA_TRY(cudaDeviceSynchronize()); -size_t free{}; -size_t total{}; -RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); -auto f_sz = static_cast(free) / (1024.0 * 1024.0 * 1024.0); -auto t_sz = static_cast(total) / (1024.0 * 1024.0 * 1024.0); -auto u_sz = t_sz - f_sz; -std::cout << "AFTER device_allgatherv free=" << f_sz << "GB used=" << u_sz << "GB total=" << t_sz << std::endl; -} + { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + size_t free{}; + size_t total{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); + auto f_sz = static_cast(free) / (1024.0 * 1024.0 * 1024.0); + auto t_sz = static_cast(total) / (1024.0 * 1024.0 * 1024.0); + auto u_sz = t_sz - f_sz; + std::cerr << "AFTER device_allgatherv free=" << f_sz << "GB used=" << u_sz + << "GB total=" << t_sz << std::endl; + } kv_store_t renumber_map( renumber_map_minor_labels.begin(), @@ -1222,7 +1239,7 @@ std::cout << "AFTER device_allgatherv free=" << f_sz << "GB used=" << u_sz << "G #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cout << "renumber_edgelist 4" << std::endl; + std::cerr << "renumber_edgelist 4" << 
std::endl; #endif auto edge_partition_segment_offsets = detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets); From 4ddd0a1e67c666099789bce8c1d2d4613bca7b91 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Sep 2024 14:28:39 -0700 Subject: [PATCH 083/126] improve logging --- .../detail/extract_transform_v_frontier_e.cuh | 4 ++-- .../prims/detail/per_v_transform_reduce_e.cuh | 4 ++-- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 19 ++++++++++++++++++- cpp/src/structure/graph_impl.cuh | 10 +++++++++- cpp/src/traversal/bfs_impl.cuh | 13 ++++++------- 5 files changed, 37 insertions(+), 13 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 7adca3e226c..21dda2b0c92 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -1185,7 +1185,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration subdur7 = subtime8 - subtime7; std::chrono::duration subdur8 = subtime9 - subtime8; std::chrono::duration subdur9 = subtime10 - subtime9; - std::cout << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() + std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << ")" << std::endl; @@ -1248,7 +1248,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\t\t" << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() + std::cerr << "\t\t" << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index ed2a3fa35f4..54941f6b816 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -2639,7 +2639,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration subdur14 = subtime15 - subtime14; std::chrono::duration subdur15 = subtime16 - subtime15; std::chrono::duration subdur16 = subtime17 - subtime16; - std::cout << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << "," + std::cerr << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," @@ -2765,7 +2765,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\t\t" << "detail::per_v (prep, ep, comm) took (" << dur0.count() + std::cerr << "\t\t" << "detail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif } diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh 
b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 6efa9c6313f..1f080f7b103 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -293,6 +293,17 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, d_tx_buffer_last_boundaries.size(), handle.get_stream()); handle.sync_stream(); +#if 0 + std::vector tx_counts(comm.get_size(), 0); + for (int i = 0; i < major_comm_size; ++i) { + auto r = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(major_comm_size, minor_comm_size, i, minor_comm_rank); + tx_counts[r] = (i == 0) ? h_tx_buffer_last_boundaries[0] : (h_tx_buffer_last_boundaries[i] - h_tx_buffer_last_boundaries[i - 1]); + } + + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); // use comm insteads of major_comm to save P2P buffer allocation +#else std::vector tx_counts(h_tx_buffer_last_boundaries.size()); std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); @@ -300,12 +311,18 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); std::tie(rx_key_buffer, std::ignore) = shuffle_values( major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); +#endif key_buffer = std::move(rx_key_buffer); if constexpr (!std::is_same_v) { auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); +#if 0 + std::tie(rx_payload_buffer, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); +#else std::tie(rx_payload_buffer, std::ignore) = shuffle_values( major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); +#endif payload_buffer = std::move(rx_payload_buffer); } @@ -319,7 +336,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cout << "\tprim (fill,lreduce,greduce) took (" << dur0.count() << "," << dur1.count() << "," + std::cerr << "\tprim (fill,lreduce,greduce) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index ef43b7b13ec..4d5585304f6 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -165,6 +165,7 @@ update_local_sorted_unique_edge_majors_minors( // majors/minors to support storing edge major/minor properties in (key, value) pairs. // 1. 
Update local_sorted_unique_edge_minors & local_sorted_unique_edge_minor_offsets +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 1" << std::endl; { auto [minor_range_first, minor_range_last] = meta.partition.local_edge_partition_minor_range(); @@ -191,8 +192,10 @@ update_local_sorted_unique_edge_majors_minors( raft::comms::op_t::MAX, handle.get_stream()); +std::cout << "max_minor_properties_fill_ratio=" << max_minor_properties_fill_ratio << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold << std::endl; if (max_minor_properties_fill_ratio < detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { + std::cerr << "K,V pairs" << std::endl; auto const chunk_size = static_cast(std::min(1.0 / max_minor_properties_fill_ratio, 1024.0)); @@ -280,10 +283,11 @@ update_local_sorted_unique_edge_majors_minors( } // 2. Update local_sorted_unique_edge_majors & local_sorted_unique_edge_major_offsets +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 2" << std::endl; std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - num_local_unique_edge_major_counts[i] += thrust::count_if( + num_local_unique_edge_major_counts[i] = thrust::count_if( handle.get_thrust_policy(), thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), @@ -304,6 +308,7 @@ update_local_sorted_unique_edge_majors_minors( raft::comms::op_t::MAX, handle.get_stream()); +std::cout << "max_major_properties_fill_ratio=" << max_major_properties_fill_ratio << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold << std::endl; if (max_major_properties_fill_ratio < detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { auto const chunk_size = @@ -368,6 +373,7 @@ update_local_sorted_unique_edge_majors_minors( } local_sorted_unique_edge_major_chunk_size = chunk_size; } +RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 3" << std::endl; return std::make_tuple(std::move(local_sorted_unique_edge_majors), std::move(local_sorted_unique_edge_major_chunk_start_offsets), @@ -425,6 +431,7 @@ graph_t dur2 = prep3 - prep2; std::chrono::duration dur3 = prep4 - prep3; std::chrono::duration dur = prep4 - prep0; - std::cout << comm_rank << ":prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() + std::cerr << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." << std::endl; #endif @@ -437,7 +436,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur1 = topdown2 - topdown1; std::chrono::duration dur2 = topdown3 - topdown2; std::chrono::duration dur = topdown3 - topdown0; - std::cout << comm_rank << ":depth=" << depth << " topdown (prim,vf,host) took " + std::cerr << "depth=" << depth << " topdown (prim,vf,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ") s." << std::endl; #endif @@ -567,7 +566,7 @@ void bfs(raft::handle_t const& handle, ? 
host_scalar_allreduce( handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) : m_u; - std::cout << comm_rank << ":m_f=" << m_f << " m_u=" << m_u + std::cerr << "m_f=" << m_f << " m_u=" << m_u << " aggregate_m_f * direction_optimzing_alpha=" << aggregate_m_f * direction_optimizing_alpha << " aggregate_m_u=" << aggregate_m_u @@ -606,7 +605,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur4 = topdown5 - topdown4; std::chrono::duration dur5 = topdown6 - topdown5; std::chrono::duration dur = topdown6 - topdown0; - std::cout << comm_rank << ":depth=" << depth + std::cerr << "depth=" << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size << " next topdown=" << topdown << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," @@ -703,7 +702,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; std::chrono::duration dur = bottomup2 - bottomup0; - std::cout << comm_rank << ":depth=" << depth << " bottomup (prim+,host) took " + std::cerr << "depth=" << depth << " bottomup (prim+,host) took " << dur.count() << " (" << dur0.count() << "," << dur1.count() << ") s." << std::endl; #endif @@ -760,7 +759,7 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur3 = bottomup4 - bottomup3; std::chrono::duration dur4 = bottomup5 - bottomup4; std::chrono::duration dur = bottomup5 - bottomup0; - std::cout << comm_rank << ":depth=" << depth + std::cerr << "depth=" << depth << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size << " aggregatee_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," From 3bb660276ca36ffd86ab7042676607e1f89d29e8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Sep 2024 14:32:48 -0700 Subject: [PATCH 084/126] NCCL bug workaround --- cpp/src/prims/fill_edge_src_dst_property.cuh | 371 +++++++++---------- 1 file changed, 179 insertions(+), 192 deletions(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 36e1b92896a..815c2a44621 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -287,6 +287,8 @@ void fill_edge_minor_property(raft::handle_t const& handle, } } +#define FILL_PERFORMANCE_MEASUREMENT 1 + template (); @@ -370,6 +375,17 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t0 = std::chrono::steady_clock::now handle.get_stream()); } } + size_t min_bcast_size = std::numeric_limits::max(); + for (int i = 0; i < major_comm_size; ++i) { + if (v_list_bitmap) { + min_bcast_size = + std::min(min_bcast_size, + packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]) * + sizeof(uint32_t)); + } else { + min_bcast_size = std::min(min_bcast_size, local_v_list_sizes[i] * sizeof(vertex_t)); + } + } auto num_concurrent_bcasts = (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * @@ -381,9 +397,10 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t0 = std::chrono::steady_clock::now num_concurrent_bcasts = std::min(num_concurrent_bcasts, handle.get_stream_pool_size()); num_concurrent_bcasts = std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); - std::cout << "v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," + std::cerr << "v_list_size=" << v_list_size << " v_list_range=(" 
<< v_list_range[0] << "," << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() - << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl; + << " num_concurrent_bcasts=" << num_concurrent_bcasts + << " min_bcast_size=" << min_bcast_size << std::endl; std::optional> stream_pool_indices{std::nullopt}; if (num_concurrent_bcasts > 1) { @@ -397,72 +414,70 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t0 = std::chrono::steady_clock::now } else { key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); } + handle.sync_stream(); // FIXME: unnecessary if we run broadcast operations in ncclGroupStart & + // ncclGroupoEnd -RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t1 = std::chrono::steady_clock::now(); +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto t1 = std::chrono::steady_clock::now(); +#endif auto edge_partition_keys = edge_minor_property_output.keys(); for (size_t i = 0; i < static_cast(major_comm_size); i += num_concurrent_bcasts) { -RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub0 = std::chrono::steady_clock::now(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub0 = std::chrono::steady_clock::now(); auto loop_count = std::min(num_concurrent_bcasts, static_cast(major_comm_size) - i); - std::vector, rmm::device_uvector>> - edge_partition_key_buffers{}; - std::vector> edge_partition_dummy_counter_scalars{}; - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - - std::variant, rmm::device_uvector> key_buffer = - rmm::device_uvector(0, handle.get_stream()); - if (v_list_bitmap) { - key_buffer = rmm::device_uvector( + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub0 = std::chrono::steady_clock::now(); +#endif + std::vector> edge_partition_rx_bitmaps{}; + edge_partition_rx_bitmaps.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + edge_partition_rx_bitmaps.push_back(rmm::device_uvector( packed_bool_size(local_v_list_range_lasts[partition_idx] - local_v_list_range_firsts[partition_idx]), - handle.get_stream()); - } else { - std::get<0>(key_buffer).resize(local_v_list_sizes[partition_idx], handle.get_stream()); + handle.get_stream())); } - edge_partition_key_buffers.push_back(std::move(key_buffer)); - edge_partition_dummy_counter_scalars.push_back( - rmm::device_scalar(size_t{0}, handle.get_stream())); - } -RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub1 = std::chrono::steady_clock::now(); - - device_group_start(major_comm); - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto& key_buffer = edge_partition_key_buffers[j]; - if (v_list_bitmap) { +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub1 = std::chrono::steady_clock::now(); +#endif + if (packed_bool_size(min_bcast_size) >= 8192 /* workaround for a seemingly NCCL bug */) { + device_group_start(major_comm); + } + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto& rx_bitmap = edge_partition_rx_bitmaps[j]; device_bcast(major_comm, (static_cast(partition_idx) == major_comm_rank) ? 
(*v_list_bitmap).data() : static_cast(nullptr), - std::get<1>(key_buffer).data(), - std::get<1>(key_buffer).size(), - static_cast(partition_idx), - handle.get_stream()); - } else { - device_bcast(major_comm, - sorted_unique_vertex_first, - std::get<0>(key_buffer).data(), - std::get<0>(key_buffer).size(), + rx_bitmap.data(), + rx_bitmap.size(), static_cast(partition_idx), handle.get_stream()); } - } - device_group_end(major_comm); - if (stream_pool_indices) { handle.sync_stream(); } -RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub2 = std::chrono::steady_clock::now(); - - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - if (is_packed_bool() && - !edge_partition_keys && v_list_bitmap) { - auto const& rx_bitmap = std::get<1>(edge_partition_key_buffers[j]); + if (min_bcast_size >= 8192 /* workaround for a seemingly NCCL bug */) { + device_group_end(major_comm); + } + handle.sync_stream(); + +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub2 = std::chrono::steady_clock::now(); +#endif + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + auto const& rx_bitmap = edge_partition_rx_bitmaps[j]; thrust::for_each( rmm::exec_policy_nosync(loop_stream), thrust::make_counting_iterator(size_t{0}), @@ -489,148 +504,108 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub2 = std::chrono::steady_clock::n } } }); - } else { + } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub3 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = sub1 - sub0; + std::chrono::duration subdur1 = sub2 - sub1; + std::chrono::duration subdur2 = sub3 - sub2; + std::cerr << "fill_edge_minor path A took (" << subdur0.count() << "," << subdur1.count() + << "," << subdur2.count() << ")" << std::endl; +#endif + } else { +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub0 = std::chrono::steady_clock::now(); +#endif + std::vector, rmm::device_uvector>> + edge_partition_v_buffers{}; + edge_partition_v_buffers.reserve(loop_count); + std::vector> edge_partition_dummy_counter_scalars{}; + edge_partition_dummy_counter_scalars.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + std::variant, rmm::device_uvector> v_buffer = + rmm::device_uvector(0, handle.get_stream()); if (v_list_bitmap) { - auto const& rx_bitmap = std::get<1>(edge_partition_key_buffers[j]); - rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], loop_stream); - rmm::device_scalar dummy(size_t{0}, loop_stream); - retrieve_vertex_list_from_bitmap(raft::device_span(rx_bitmap.data(), rx_bitmap.size()), rx_vertices.begin(), raft::device_span(dummy.data(), size_t{1}), local_v_list_range_firsts[partition_idx], local_v_list_range_lasts[partition_idx], loop_stream); - edge_partition_key_buffers[j] = std::move(rx_vertices); - } - auto const& rx_vertices = std::get<0>(edge_partition_key_buffers[j]); - if (edge_partition_keys) { - thrust::for_each( - rmm::exec_policy_nosync(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), - [rx_vertex_first = rx_vertices.begin(), - input, - subrange_key_first = 
(*edge_partition_keys).begin() + (*key_offsets)[partition_idx], - subrange_key_last = - (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], - edge_partition_value_first = edge_partition_value_first, - subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { - auto minor = *(rx_vertex_first + i); - auto it = - thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); - if ((it != subrange_key_last) && (*it == minor)) { - auto subrange_offset = thrust::distance(subrange_key_first, it); - if constexpr (contains_packed_bool_element) { - fill_scalar_or_thrust_tuple( - edge_partition_value_first, subrange_start_offset + subrange_offset, input); - } else { - *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; - } - } - }); + v_buffer = rmm::device_uvector( + packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), + handle.get_stream()); } else { - if constexpr (contains_packed_bool_element) { - thrust::for_each( - rmm::exec_policy_nosync(loop_stream), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator( - static_cast(local_v_list_sizes[partition_idx])), - [minor_range_first, - rx_vertex_first = rx_vertices.begin(), - input, - output_value_first = edge_partition_value_first] __device__(auto i) { - auto rx_vertex = *(rx_vertex_first + i); - auto minor_offset = rx_vertex - minor_range_first; - fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); - }); - } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), - cuda::proclaim_return_type( - [minor_range_first] __device__(auto v) { return v - minor_range_first; })); - auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(rmm::exec_policy_nosync(loop_stream), - val_first, - val_first + local_v_list_sizes[partition_idx], - map_first, - edge_partition_value_first); - } + std::get<0>(v_buffer).resize(local_v_list_sizes[partition_idx], handle.get_stream()); } + edge_partition_v_buffers.push_back(std::move(v_buffer)); + edge_partition_dummy_counter_scalars.push_back( + rmm::device_scalar(size_t{0}, handle.get_stream())); } - } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } -RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub3 = std::chrono::steady_clock::now(); -std::chrono::duration subdur0 = sub1 - sub0; -std::chrono::duration subdur1 = sub2 - sub1; -std::chrono::duration subdur2 = sub3 - sub2; -std::cout << "sub (fill) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << ")" << std::endl; -#if 0 - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - if (is_packed_bool() && - !edge_partition_keys && v_list_bitmap) { - rmm::device_uvector rx_bitmap( - packed_bool_size(local_v_list_range_lasts[partition_idx] - - local_v_list_range_firsts[partition_idx]), - loop_stream); - device_bcast(major_comm, - (static_cast(partition_idx) == major_comm_rank) - ? 
(*v_list_bitmap).data() - : static_cast(nullptr), - rx_bitmap.data(), - rx_bitmap.size(), - partition_idx, - loop_stream); - thrust::for_each( - rmm::exec_policy_nosync(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_bitmap.size()), - [input, - output_value_first = - edge_partition_value_first + - packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first), - rx_bitmap = raft::device_span(rx_bitmap.data(), - rx_bitmap.size())] __device__(size_t i) { - if ((i == 0) || (i == (rx_bitmap.size() - 1))) { // first or last - cuda::atomic_ref word( - *(output_value_first + i)); - if (input) { - word.fetch_or(rx_bitmap[i], cuda::std::memory_order_relaxed); - } else { - word.fetch_and(~rx_bitmap[i], cuda::std::memory_order_relaxed); - } - } else { - if (input) { - *(output_value_first + i) |= rx_bitmap[i]; - } else { - *(output_value_first + i) &= ~rx_bitmap[i]; - } - } - }); - } else { - rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], loop_stream); - // FIXME: these broadcast operations can be placed between ncclGroupStart() and - // ncclGroupEnd() - std::variant, decltype(sorted_unique_vertex_first)> - v_list{}; +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub1 = std::chrono::steady_clock::now(); +#endif + + if (min_bcast_size >= 8192 /* workaround for a seemingly NCCL bug */) { + device_group_start(major_comm); + }; + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + auto& v_buffer = edge_partition_v_buffers[j]; if (v_list_bitmap) { - v_list = (static_cast(partition_idx) == major_comm_rank) - ? raft::device_span((*v_list_bitmap).data(), - (*v_list_bitmap).size()) - : raft::device_span(static_cast(nullptr), - size_t{0}); + device_bcast(major_comm, + (static_cast(partition_idx) == major_comm_rank) + ? (*v_list_bitmap).data() + : static_cast(nullptr), + std::get<1>(v_buffer).data(), + std::get<1>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); } else { - v_list = sorted_unique_vertex_first; + // FIXME: we may better send 32 bit vertex offsets if [local_v_list_range_firsts[], + // local_v_list_range_lasts[]) fit into unsigned 32 bit integer + device_bcast(major_comm, + (static_cast(partition_idx) == major_comm_rank) + ? sorted_unique_vertex_first + : static_cast(nullptr), + std::get<0>(v_buffer).data(), + std::get<0>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } + } + if (min_bcast_size >= 8192 /* workaround for a seemingly NCCL bug */) { + device_group_end(major_comm); + } + if (stream_pool_indices) { handle.sync_stream(); } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub2 = std::chrono::steady_clock::now(); +#endif + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + if (v_list_bitmap) { + auto const& rx_bitmap = std::get<1>(edge_partition_v_buffers[j]); + rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], + loop_stream); + rmm::device_scalar dummy(size_t{0}, loop_stream); + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + rx_vertices.begin(), + raft::device_span(dummy.data(), size_t{1}), + local_v_list_range_firsts[partition_idx], + local_v_list_range_lasts[partition_idx], + loop_stream); + edge_partition_v_buffers[j] = std::move(rx_vertices); } - device_bcast_vertex_list(major_comm, - v_list, - rx_vertices.begin(), - local_v_list_range_firsts[partition_idx], - local_v_list_range_lasts[partition_idx], - local_v_list_sizes[partition_idx], - partition_idx, - loop_stream); + auto const& rx_vertices = std::get<0>(edge_partition_v_buffers[j]); if (edge_partition_keys) { thrust::for_each( rmm::exec_policy_nosync(loop_stream), @@ -685,14 +660,26 @@ std::cout << "sub (fill) took (" << subdur0.count() << "," << subdur1.count() << } } } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto sub3 = std::chrono::steady_clock::now(); + std::chrono::duration subdur0 = sub1 - sub0; + std::chrono::duration subdur1 = sub2 - sub1; + std::chrono::duration subdur2 = sub3 - sub2; + std::cerr << "fill_edge_minor path B took (" << subdur0.count() << "," << subdur1.count() + << "," << subdur2.count() << ")" << std::endl; +#endif } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } -#endif } -RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t2 = std::chrono::steady_clock::now(); -std::chrono::duration dur0 = t1 - t0; -std::chrono::duration dur1 = t2 - t1; -std::cout << "fill_edge_minor took (" << dur0.count() << "," << dur1.count() << ")" << std::endl; +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto t2 = std::chrono::steady_clock::now(); + std::chrono::duration dur0 = t1 - t0; + std::chrono::duration dur1 = t2 - t1; + std::cerr << "fill_edge_minor took (" << dur0.count() << "," << dur1.count() << ")" + << std::endl; +#endif } else { assert(graph_view.local_vertex_partition_range_size() == graph_view.local_edge_partition_src_range_size()); From 8be2a3f89ab07a60b9f768e173401d67a2553ee5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Sep 2024 14:34:32 -0700 Subject: [PATCH 085/126] temporary parameter tweaks for testing --- cpp/tests/utilities/base_fixture.hpp | 12 +++++++++--- cpp/tests/utilities/mg_utilities.cpp | 2 +- cpp/tests/utilities/test_graphs.hpp | 2 +- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp index 25011c0c97a..dade2af57c0 100644 --- a/cpp/tests/utilities/base_fixture.hpp +++ b/cpp/tests/utilities/base_fixture.hpp @@ -77,10 +77,16 @@ inline auto make_pool(bool use_max = false) // effect the maximum amount of parallel tests, and therefore `tests/CMakeLists.txt` // `_CUGRAPH_TEST_PERCENT` default value will need to be audited. auto const [free, total] = rmm::available_device_memory(); - auto const min_alloc = - use_max ? rmm::align_down(std::min(free, total / 2), rmm::CUDA_ALLOCATION_ALIGNMENT) + // EOS: 1 node 0.94 succeeded 0.95 failed, 2+ nodes 0.97 succeeded 0.98 failed + auto const init_alloc = + use_max ? 
rmm::align_down(std::min(free, static_cast(total * 0.93)), rmm::CUDA_ALLOCATION_ALIGNMENT) : rmm::align_down(std::min(free, total / 10), rmm::CUDA_ALLOCATION_ALIGNMENT); - return rmm::mr::make_owning_wrapper(make_cuda(), min_alloc); + std::optional max_alloc{}; + if (use_max) { + max_alloc = init_alloc; + } + std::cout << "init_alloc ratio=" << static_cast(init_alloc) / static_cast(total) << std::endl; + return rmm::mr::make_owning_wrapper(make_cuda(), init_alloc, max_alloc); } inline auto make_binning() diff --git a/cpp/tests/utilities/mg_utilities.cpp b/cpp/tests/utilities/mg_utilities.cpp index 6f8fb8c6acd..d22a9956f17 100644 --- a/cpp/tests/utilities/mg_utilities.cpp +++ b/cpp/tests/utilities/mg_utilities.cpp @@ -60,7 +60,7 @@ std::unique_ptr initialize_mg_handle(size_t pool_size) --gpu_row_comm_size; } - cugraph::partition_manager::init_subcomm(*handle, gpu_row_comm_size); + cugraph::partition_manager::init_subcomm(*handle, std::max(comm_size / 8, 1)); return std::move(handle); } diff --git a/cpp/tests/utilities/test_graphs.hpp b/cpp/tests/utilities/test_graphs.hpp index b7a91985658..5edc722a8c6 100644 --- a/cpp/tests/utilities/test_graphs.hpp +++ b/cpp/tests/utilities/test_graphs.hpp @@ -246,7 +246,7 @@ class Rmat_Usecase : public detail::TranslateGraph_Usecase { // cuMemAddressReserve // (https://developer.nvidia.com/blog/introducing-low-level-gpu-virtual-memory-management), we // can reduce the temporary memory requirement to (1 / num_partitions) * (original data size) - size_t constexpr num_partitions_per_gpu = 4; + size_t constexpr num_partitions_per_gpu = 8; size_t num_partitions = num_partitions_per_gpu * static_cast(multi_gpu ? handle.get_comms().get_size() : 1); From 7da6fe56f341e7a498a39c489a8e48092dd6c0f8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 25 Sep 2024 23:25:09 -0700 Subject: [PATCH 086/126] bug fix (when major_comm_size == 1 && minor_comm_size > 1) --- cpp/src/structure/renumber_edgelist_impl.cuh | 53 +++++++++++--------- 1 file changed, 28 insertions(+), 25 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index de3fcbf3f9b..75cc316b8c1 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -438,6 +438,34 @@ std::tuple, std::vector, vertex_t> compu edge_partition_tmp_minors.push_back(std::move(tmp_minors)); } + if (edge_partition_tmp_minors.size() == 1) { + this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); + } else { + edge_t aggregate_size{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + aggregate_size += edge_partition_tmp_minors[j].size(); + } + this_bin_sorted_unique_minors.resize(aggregate_size, handle.get_stream()); + size_t output_offset{0}; + for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { + thrust::copy(handle.get_thrust_policy(), + edge_partition_tmp_minors[j].begin(), + edge_partition_tmp_minors[j].end(), + this_bin_sorted_unique_minors.begin() + output_offset); + output_offset += edge_partition_tmp_minors[j].size(); + } + edge_partition_tmp_minors.clear(); + thrust::sort(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end()); + this_bin_sorted_unique_minors.resize( + thrust::distance(this_bin_sorted_unique_minors.begin(), + thrust::unique(handle.get_thrust_policy(), + this_bin_sorted_unique_minors.begin(), + this_bin_sorted_unique_minors.end())), + handle.get_stream()); + 
this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); + } if constexpr (multi_gpu) { auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_size = major_comm.get_size(); @@ -446,29 +474,6 @@ std::tuple, std::vector, vertex_t> compu auto const comm_size = comm.get_size(); auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - edge_t aggregate_size{0}; - for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { - aggregate_size += edge_partition_tmp_minors[j].size(); - } - this_bin_sorted_unique_minors.resize(aggregate_size, handle.get_stream()); - size_t output_offset{0}; - for (size_t j = 0; j < edge_partition_tmp_minors.size(); ++j) { - thrust::copy(handle.get_thrust_policy(), - edge_partition_tmp_minors[j].begin(), - edge_partition_tmp_minors[j].end(), - this_bin_sorted_unique_minors.begin() + output_offset); - output_offset += edge_partition_tmp_minors[j].size(); - } - thrust::sort(handle.get_thrust_policy(), - this_bin_sorted_unique_minors.begin(), - this_bin_sorted_unique_minors.end()); - this_bin_sorted_unique_minors.resize( - thrust::distance(this_bin_sorted_unique_minors.begin(), - thrust::unique(handle.get_thrust_policy(), - this_bin_sorted_unique_minors.begin(), - this_bin_sorted_unique_minors.end())), - handle.get_stream()); - this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); #if 0 this_bin_sorted_unique_minors = shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(handle, std::move(this_bin_sorted_unique_minors)); thrust::sort(handle.get_thrust_policy(), @@ -510,8 +515,6 @@ std::tuple, std::vector, vertex_t> compu } else { this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); } - } else { - this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); } } #if 1 From eb5354e8bb5125536abcdfacf47d6fc861a2f4e0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 26 Sep 2024 11:05:17 -0700 Subject: [PATCH 087/126] undo temporary workarounds --- cpp/src/prims/fill_edge_src_dst_property.cuh | 10 +++------- 1 file changed, 3 insertions(+), 7 deletions(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 815c2a44621..bd1f352c263 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -448,9 +448,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub1 = std::chrono::steady_clock::now(); #endif - if (packed_bool_size(min_bcast_size) >= 8192 /* workaround for a seemingly NCCL bug */) { - device_group_start(major_comm); - } + device_group_start(major_comm); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; auto& rx_bitmap = edge_partition_rx_bitmaps[j]; @@ -463,10 +461,8 @@ void fill_edge_minor_property(raft::handle_t const& handle, static_cast(partition_idx), handle.get_stream()); } - if (min_bcast_size >= 8192 /* workaround for a seemingly NCCL bug */) { - device_group_end(major_comm); - } - handle.sync_stream(); + device_group_end(major_comm); + handle.sync_stream(); // FIXME: ??? 
#if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); From 376c02866f323e0f65bcf68549486569738ca2cb Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 26 Sep 2024 18:03:36 -0700 Subject: [PATCH 088/126] optimize prep step in fill_edge_minor_property() --- cpp/include/cugraph/utilities/device_comm.hpp | 124 +++++++++++++++--- .../detail/extract_transform_v_frontier_e.cuh | 22 ++-- cpp/src/prims/detail/multi_stream_utils.cuh | 33 +++-- .../prims/detail/per_v_transform_reduce_e.cuh | 17 +-- cpp/src/prims/fill_edge_src_dst_property.cuh | 114 +++++++++------- 5 files changed, 215 insertions(+), 95 deletions(-) diff --git a/cpp/include/cugraph/utilities/device_comm.hpp b/cpp/include/cugraph/utilities/device_comm.hpp index ffb0f7d9e5b..07de2d06466 100644 --- a/cpp/include/cugraph/utilities/device_comm.hpp +++ b/cpp/include/cugraph/utilities/device_comm.hpp @@ -55,7 +55,7 @@ auto iter_to_raw_ptr(thrust::detail::normal_iterator> iter } template -std::enable_if_t::value, void> +std::enable_if_t, void> device_isend_impl(raft::comms::comms_t const& comm, InputIterator input_first, size_t count, @@ -76,7 +76,7 @@ std::enable_if_t::value, void> device_isend_ raft::comms::request_t* request) { static_assert( - std::is_same::value_type, OutputValueType>::value); + std::is_same_v::value_type, OutputValueType>); comm.isend(iter_to_raw_ptr(input_first), count, dst, tag, request); } @@ -136,7 +136,7 @@ device_irecv_impl(raft::comms::comms_t const& comm, { static_assert( - std::is_same::value_type>::value); + std::is_same_v::value_type>); comm.irecv(iter_to_raw_ptr(output_first), count, src, tag, request); } @@ -200,7 +200,7 @@ device_sendrecv_impl(raft::comms::comms_t const& comm, { using value_type = typename std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_sendrecv(iter_to_raw_ptr(input_first), tx_count, dst, @@ -286,7 +286,7 @@ device_multicast_sendrecv_impl(raft::comms::comms_t const& comm, { using value_type = typename std::iterator_traits::value_type; static_assert( - std::is_same::value_type, value_type>::value); + std::is_same_v::value_type, value_type>); comm.device_multicast_sendrecv(iter_to_raw_ptr(input_first), tx_counts, tx_offsets, @@ -379,8 +379,8 @@ device_bcast_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.bcast( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, root, stream_view.value()); } @@ -440,8 +440,8 @@ device_allreduce_impl(raft::comms::comms_t const& comm, raft::comms::op_t op, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allreduce( iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, op, stream_view.value()); } @@ -503,8 +503,8 @@ device_reduce_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.reduce(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), count, @@ -548,6 +548,62 @@ 
struct device_reduce_tuple_iterator_element_impl +std::enable_if_t::value, void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + // no-op +} + +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather_impl(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); + comm.allgather( + iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, stream_view.value()); +} + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + device_allgather_impl(comm, + thrust::get(input_first.get_iterator_tuple()), + thrust::get(output_first.get_iterator_tuple()), + sendcount, + stream_view); + device_allgather_tuple_iterator_element_impl().run( + comm, input_first, output_first, sendcount, stream_view); + } +}; + +template +struct device_allgather_tuple_iterator_element_impl { + void run(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) const + { + } +}; + template std::enable_if_t::value, void> device_allgatherv_impl(raft::comms::comms_t const& comm, @@ -571,8 +627,8 @@ device_allgatherv_impl(raft::comms::comms_t const& comm, std::vector const& displacements, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.allgatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), recvcounts.data(), @@ -639,8 +695,8 @@ device_gatherv_impl(raft::comms::comms_t const& comm, int root, rmm::cuda_stream_view stream_view) { - static_assert(std::is_same::value_type, - typename std::iterator_traits::value_type>::value); + static_assert(std::is_same_v::value_type, + typename std::iterator_traits::value_type>); comm.gatherv(iter_to_raw_ptr(input_first), iter_to_raw_ptr(output_first), sendcount, @@ -1000,6 +1056,44 @@ device_reduce(raft::comms::comms_t const& comm, .run(comm, input_first, output_first, count, op, root, stream_view); } +template +std::enable_if_t< + std::is_arithmetic::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + detail::device_allgather_impl(comm, input_first, output_first, sendcount, stream_view); +} + +template +std::enable_if_t< + is_thrust_tuple_of_arithmetic::value_type>::value && + is_thrust_tuple::value_type>::value, + void> +device_allgather(raft::comms::comms_t const& comm, + InputIterator input_first, + OutputIterator output_first, + size_t sendcount, + rmm::cuda_stream_view stream_view) +{ + static_assert( + thrust::tuple_size::value_type>::value == + thrust::tuple_size::value_type>::value); + + size_t constexpr tuple_size = + thrust::tuple_size::value_type>::value; + + detail::device_allgather_tuple_iterator_element_impl() + .run(comm, input_first, output_first, sendcount, stream_view); +} + template std::enable_if_t< 
std::is_arithmetic::value_type>::value, diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 21dda2b0c92..712b596ee63 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -760,8 +760,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); - auto max_tmp_buffer_size = - (graph_view.compute_number_of_edges(handle) / comm_size) * sizeof(vertex_t); + auto max_tmp_buffer_size = static_cast( + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05); auto aggregate_major_range_size = host_scalar_allreduce( comm, @@ -802,11 +802,12 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, (aggregate_major_range_size / comm_size) * key_size + (aggregate_max_pushes / comm_size) * (output_key_size + output_value_size); - stream_pool_indices = init_stream_pool_indices(handle, - max_tmp_buffer_size, + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, approx_tmp_buffer_size_per_edge_partition, graph_view.number_of_local_edge_partitions(), - max_segments); + max_segments, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } } } @@ -1185,10 +1186,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration subdur7 = subtime8 - subtime7; std::chrono::duration subdur8 = subtime9 - subtime8; std::chrono::duration subdur9 = subtime10 - subtime9; - std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() - << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," - << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," - << subdur9.count() << ")" << std::endl; + std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," + << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," + << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," + << subdur8.count() << "," << subdur9.count() << ")" << std::endl; #endif } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -1248,7 +1249,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cerr << "\t\t" << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() + std::cerr << "\t\t" + << "detail::extract (pre,fill,concat) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; #endif diff --git a/cpp/src/prims/detail/multi_stream_utils.cuh b/cpp/src/prims/detail/multi_stream_utils.cuh index 78b75fc083d..76ef3fb0de4 100644 --- a/cpp/src/prims/detail/multi_stream_utils.cuh +++ b/cpp/src/prims/detail/multi_stream_utils.cuh @@ -33,22 +33,20 @@ namespace cugraph { namespace detail { -inline std::vector init_stream_pool_indices( - raft::handle_t const& handle, - size_t max_tmp_buffer_size, - size_t approx_tmp_buffer_size_per_edge_partition, - size_t num_local_edge_partitions, - size_t num_streams_per_edge_partition) +inline std::vector init_stream_pool_indices(size_t max_tmp_buffer_size, + size_t approx_tmp_buffer_size_per_loop, + size_t loop_count, + size_t num_streams_per_loop, + size_t max_streams) { - size_t num_streams = - 
std::min(num_local_edge_partitions * num_streams_per_edge_partition, - raft::round_down_safe(handle.get_stream_pool_size(), num_streams_per_edge_partition)); + size_t num_streams = std::min(loop_count * num_streams_per_loop, + raft::round_down_safe(max_streams, num_streams_per_loop)); auto num_concurrent_loops = - (approx_tmp_buffer_size_per_edge_partition > 0) - ? std::max(max_tmp_buffer_size / approx_tmp_buffer_size_per_edge_partition, size_t{1}) - : num_local_edge_partitions; - num_streams = std::min(num_concurrent_loops * num_streams_per_edge_partition, num_streams); + (approx_tmp_buffer_size_per_loop > 0) + ? std::max(max_tmp_buffer_size / approx_tmp_buffer_size_per_loop, size_t{1}) + : loop_count; + num_streams = std::min(num_concurrent_loops * num_streams_per_loop, num_streams); std::vector stream_pool_indices(num_streams); std::iota(stream_pool_indices.begin(), stream_pool_indices.end(), size_t{0}); @@ -125,10 +123,11 @@ void count_nosync(InputIterator input_first, } template -void sum_nosync(InputIterator input_first, - InputIterator input_last, - raft::device_span::value_type> sum /* size = 1 */, - rmm::cuda_stream_view stream_view) +void sum_nosync( + InputIterator input_first, + InputIterator input_last, + raft::device_span::value_type> sum /* size = 1 */, + rmm::cuda_stream_view stream_view) { CUGRAPH_EXPECTS( static_cast(thrust::distance(input_first, input_last)) <= diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 54941f6b816..e17c3ae5fd0 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1770,9 +1770,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); - auto max_tmp_buffer_size = - static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * - sizeof(vertex_t); + auto max_tmp_buffer_size = static_cast( + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05); size_t approx_tmp_buffer_size_per_edge_partition{0}; if constexpr (update_major) { size_t key_size{0}; @@ -1805,11 +1804,12 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, (aggregate_major_range_size / comm_size) * (key_size + value_size); } - stream_pool_indices = init_stream_pool_indices(handle, - max_tmp_buffer_size, + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, approx_tmp_buffer_size_per_edge_partition, graph_view.number_of_local_edge_partitions(), - max_segments); + max_segments, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } } } @@ -2765,8 +2765,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cerr << "\t\t" << "detail::per_v (prep, ep, comm) took (" << dur0.count() - << "," << dur1.count() << "," << dur2.count() << ")" << std::endl; + std::cerr << "\t\t" + << "detail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count() << "," + << dur2.count() << ")" << std::endl; #endif } diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index bd1f352c263..a3e5284b572 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -327,8 +327,8 @@ void fill_edge_minor_property(raft::handle_t const& handle, 
auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto v_list_size = - static_cast(thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); + auto v_list_size = static_cast( + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); std::array v_list_range = {vertex_t{0}, vertex_t{0}}; if (v_list_size > 0) { rmm::device_uvector tmps(2, handle.get_stream()); @@ -348,11 +348,41 @@ void fill_edge_minor_property(raft::handle_t const& handle, // !edge_partition_keys.has_value() && v_list_bitmap.has_value()) } - auto local_v_list_sizes = host_scalar_allgather(major_comm, v_list_size, handle.get_stream()); - auto local_v_list_range_firsts = - host_scalar_allgather(major_comm, v_list_range[0], handle.get_stream()); - auto local_v_list_range_lasts = - host_scalar_allgather(major_comm, v_list_range[1], handle.get_stream()); + std::vector local_v_list_sizes{}; + std::vector local_v_list_range_firsts{}; + std::vector local_v_list_range_lasts{}; + if (major_comm_size > 1) { // allgather v_list_size, v_list_range[0], and v_list_range[1] + std::vector h_tmps = {v_list_size, v_list_range[0], v_list_range[1]}; + rmm::device_uvector d_aggregate_tmps(major_comm_size * size_t{3}, + handle.get_stream()); + raft::update_device(d_aggregate_tmps.data() + major_comm_rank * size_t{3}, + h_tmps.data(), + size_t{3}, + handle.get_stream()); + device_allgather(major_comm, + d_aggregate_tmps.data() + major_comm_rank * size_t{3}, + d_aggregate_tmps.data(), + size_t{3}, + handle.get_stream()); + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + local_v_list_sizes = std::vector(major_comm_size); + local_v_list_range_firsts = std::vector(major_comm_size); + local_v_list_range_lasts = std::vector(major_comm_size); + for (int i = 0; i < major_comm_size; ++i) { + local_v_list_sizes[i] = h_aggregate_tmps[i * size_t{3}]; + local_v_list_range_firsts[i] = h_aggregate_tmps[i * size_t{3} + 1]; + local_v_list_range_lasts[i] = h_aggregate_tmps[i * size_t{3} + 2]; + } + } else { + local_v_list_sizes = {v_list_size}; + local_v_list_range_firsts = {v_list_range[0]}; + local_v_list_range_lasts = {v_list_range[1]}; + } std::optional> v_list_bitmap{std::nullopt}; if (major_comm_size > 1) { @@ -375,38 +405,38 @@ void fill_edge_minor_property(raft::handle_t const& handle, handle.get_stream()); } } - size_t min_bcast_size = std::numeric_limits::max(); - for (int i = 0; i < major_comm_size; ++i) { - if (v_list_bitmap) { - min_bcast_size = - std::min(min_bcast_size, - packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]) * - sizeof(uint32_t)); - } else { - min_bcast_size = std::min(min_bcast_size, local_v_list_sizes[i] * sizeof(vertex_t)); + + auto edge_partition_keys = edge_minor_property_output.keys(); + + std::optional> stream_pool_indices{std::nullopt}; + { + size_t tmp_buffer_size_per_loop{}; + for (int i = 0; i < major_comm_size; ++i) { + if (is_packed_bool() && + !edge_partition_keys && v_list_bitmap) { + tmp_buffer_size_per_loop += + packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]) * + sizeof(uint32_t); + } else { + tmp_buffer_size_per_loop += static_cast(local_v_list_sizes[i]) * sizeof(vertex_t); + } } + tmp_buffer_size_per_loop /= major_comm_size; + stream_pool_indices = init_stream_pool_indices( + 
static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * + 0.05), + tmp_buffer_size_per_loop, + major_comm_size, + 1, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } } + size_t num_concurrent_bcasts = stream_pool_indices ? (*stream_pool_indices).size() : size_t{1}; - auto num_concurrent_bcasts = - (static_cast(graph_view.compute_number_of_edges(handle) / comm_size) * - sizeof(vertex_t)) / - std::min( - (std::reduce(local_v_list_sizes.begin(), local_v_list_sizes.end()) / major_comm_size) * - sizeof(vertex_t), - size_t{1}); - num_concurrent_bcasts = std::min(num_concurrent_bcasts, handle.get_stream_pool_size()); - num_concurrent_bcasts = - std::min(std::max(num_concurrent_bcasts, size_t{1}), static_cast(major_comm_size)); std::cerr << "v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() - << " num_concurrent_bcasts=" << num_concurrent_bcasts - << " min_bcast_size=" << min_bcast_size << std::endl; - - std::optional> stream_pool_indices{std::nullopt}; - if (num_concurrent_bcasts > 1) { - stream_pool_indices = std::vector(num_concurrent_bcasts); - std::iota((*stream_pool_indices).begin(), (*stream_pool_indices).end(), size_t{0}); - } + << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl; std::optional> key_offsets{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -421,7 +451,6 @@ void fill_edge_minor_property(raft::handle_t const& handle, RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto t1 = std::chrono::steady_clock::now(); #endif - auto edge_partition_keys = edge_minor_property_output.keys(); for (size_t i = 0; i < static_cast(major_comm_size); i += num_concurrent_bcasts) { RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub0 = std::chrono::steady_clock::now(); @@ -462,7 +491,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, handle.get_stream()); } device_group_end(major_comm); - handle.sync_stream(); // FIXME: ??? 
+ if (stream_pool_indices) { handle.sync_stream(); } #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -542,9 +571,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, auto sub1 = std::chrono::steady_clock::now(); #endif - if (min_bcast_size >= 8192 /* workaround for a seemingly NCCL bug */) { - device_group_start(major_comm); - }; + device_group_start(major_comm); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; @@ -571,9 +598,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, handle.get_stream()); } } - if (min_bcast_size >= 8192 /* workaround for a seemingly NCCL bug */) { - device_group_end(major_comm); - } + device_group_end(major_comm); if (stream_pool_indices) { handle.sync_stream(); } #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -605,7 +630,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, if (edge_partition_keys) { thrust::for_each( rmm::exec_policy_nosync(loop_stream), - thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), [rx_vertex_first = rx_vertices.begin(), input, @@ -632,8 +657,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, thrust::for_each( rmm::exec_policy_nosync(loop_stream), thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator( - static_cast(local_v_list_sizes[partition_idx])), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), [minor_range_first, rx_vertex_first = rx_vertices.begin(), input, From 453d8df0ac5b125da74f452c6e46d724c04d8aff Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 27 Sep 2024 16:25:34 -0700 Subject: [PATCH 089/126] minor tweaks --- .../detail/extract_transform_v_frontier_e.cuh | 3 +- cpp/src/prims/fill_edge_src_dst_property.cuh | 11 ++--- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 40 +++++++++---------- 3 files changed, 27 insertions(+), 27 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 712b596ee63..0bcf9633e96 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -934,9 +934,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto edge_partition_frontier_major_last = thrust_tuple_get_or_identity( edge_partition_frontier_key_last); - edge_partition_max_pushes = edge_partition.compute_number_of_edges( - edge_partition_frontier_major_first, edge_partition_frontier_major_last, loop_stream); // FIXME: compute_number_of_edges() implicitly synchronizes to copy the results to host + // FIXME: check whether skipping a call for 0 key_buffer size helps or not edge_partition_max_pushes = edge_partition.compute_number_of_edges( edge_partition_frontier_major_first, edge_partition_frontier_major_last, loop_stream); } diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index a3e5284b572..556db4f4346 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -396,7 +396,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, avg_fill_ratio /= static_cast(major_comm_size); constexpr double threshold_ratio = - 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + 2.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); if (avg_fill_ratio > 
threshold_ratio) { v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, sorted_unique_vertex_last, @@ -444,8 +444,6 @@ void fill_edge_minor_property(raft::handle_t const& handle, } else { key_offsets = graph_view.local_sorted_unique_edge_dst_vertex_partition_offsets(); } - handle.sync_stream(); // FIXME: unnecessary if we run broadcast operations in ncclGroupStart & - // ncclGroupoEnd #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -530,6 +528,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, } }); } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub3 = std::chrono::steady_clock::now(); @@ -680,6 +679,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, } } } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub3 = std::chrono::steady_clock::now(); @@ -690,7 +690,6 @@ void fill_edge_minor_property(raft::handle_t const& handle, << "," << subdur2.count() << ")" << std::endl; #endif } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } } #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -702,7 +701,9 @@ void fill_edge_minor_property(raft::handle_t const& handle, #endif } else { assert(graph_view.local_vertex_partition_range_size() == - graph_view.local_edge_partition_src_range_size()); + (GraphViewType::is_storage_transposed + ? graph_view.local_edge_partition_src_range_size() + : graph_view.local_edge_partition_dst_range_sizse())); if constexpr (contains_packed_bool_element) { thrust::for_each(handle.get_thrust_policy(), sorted_unique_vertex_first, diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 1f080f7b103..29d9549eb7f 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -244,6 +244,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time1 = std::chrono::steady_clock::now(); + auto size_before_lreduce = size_dataframe_buffer(key_buffer); #endif // 2. reduce the buffer @@ -254,6 +255,10 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time2 = std::chrono::steady_clock::now(); + auto time3 = std::chrono::steady_clock::now(); + auto time4 = std::chrono::steady_clock::now(); + auto size_after_lreduce = size_dataframe_buffer(key_buffer); + auto size_before_greduce = size_after_lreduce; #endif if constexpr (GraphViewType::is_multi_gpu) { // FIXME: this step is unnecessary if major_comm_size== 1 @@ -293,38 +298,30 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, d_tx_buffer_last_boundaries.size(), handle.get_stream()); handle.sync_stream(); -#if 0 - std::vector tx_counts(comm.get_size(), 0); - for (int i = 0; i < major_comm_size; ++i) { - auto r = partition_manager::compute_global_comm_rank_from_graph_subcomm_ranks(major_comm_size, minor_comm_size, i, minor_comm_rank); - tx_counts[r] = (i == 0) ? 
h_tx_buffer_last_boundaries[0] : (h_tx_buffer_last_boundaries[i] - h_tx_buffer_last_boundaries[i - 1]); - } - - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, std::ignore) = shuffle_values( - comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); // use comm insteads of major_comm to save P2P buffer allocation -#else std::vector tx_counts(h_tx_buffer_last_boundaries.size()); std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + time3 = std::chrono::steady_clock::now(); +#endif auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); std::tie(rx_key_buffer, std::ignore) = shuffle_values( major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); -#endif key_buffer = std::move(rx_key_buffer); if constexpr (!std::is_same_v) { auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); -#if 0 - std::tie(rx_payload_buffer, std::ignore) = shuffle_values( - comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); -#else std::tie(rx_payload_buffer, std::ignore) = shuffle_values( major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); -#endif payload_buffer = std::move(rx_payload_buffer); } +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + time4 = std::chrono::steady_clock::now(); + size_before_greduce = size_dataframe_buffer(key_buffer); +#endif std::tie(key_buffer, payload_buffer) = detail::sort_and_reduce_buffer_elements( @@ -332,12 +329,15 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, } #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time3 = std::chrono::steady_clock::now(); + auto time5 = std::chrono::steady_clock::now(); + auto size_after_greduce = size_dataframe_buffer(key_buffer); std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; - std::cerr << "\tprim (fill,lreduce,greduce) took (" << dur0.count() << "," << dur1.count() << "," - << dur2.count() << ")" << std::endl; + std::chrono::duration dur3 = time4 - time3; + std::chrono::duration dur4 = time5 - time4; + std::cerr << "\tprim (fill,lreduce,g-prep,g-shuffle,g-s&r) took (" << dur0.count() << "," << dur1.count() << "," + << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") l_size=(" << size_before_lreduce << "," << size_after_lreduce << ") g_size=(" << size_before_greduce << "," << size_after_greduce << ")" << std::endl; #endif if constexpr (!std::is_same_v) { From 171c2b55a7f4dfddfa5086b439f6a00a8089747a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 28 Sep 2024 02:23:02 -0700 Subject: [PATCH 090/126] update fill_edge_minor_property --- cpp/src/prims/fill_edge_src_dst_property.cuh | 408 ++++++++++++++----- 1 file changed, 295 insertions(+), 113 deletions(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 556db4f4346..d9a198b4152 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -326,44 +326,56 @@ void fill_edge_minor_property(raft::handle_t const& handle, auto& major_comm = 
handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); - auto v_list_size = static_cast( - thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); - std::array v_list_range = {vertex_t{0}, vertex_t{0}}; - if (v_list_size > 0) { - rmm::device_uvector tmps(2, handle.get_stream()); - thrust::tabulate(handle.get_thrust_policy(), - tmps.begin(), - tmps.end(), - [sorted_unique_vertex_first, v_list_size] __device__(size_t i) { - return (i == 0) ? *sorted_unique_vertex_first - : (*(sorted_unique_vertex_first + (v_list_size - 1)) + 1); - }); - raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); - handle.sync_stream(); - v_list_range[0] -= - (v_list_range[0] - minor_range_first) % - packed_bools_per_word(); // to perform bitwise AND|OR in word granularity (if edge minor - // property value type is packed bool && - // !edge_partition_keys.has_value() && v_list_bitmap.has_value()) - } + constexpr size_t packed_bool_word_bcast_alignment = + 128 / + sizeof( + uint32_t); // 128B cache line alignment (unaligned ncclBroadcast operations are slower) std::vector local_v_list_sizes{}; std::vector local_v_list_range_firsts{}; std::vector local_v_list_range_lasts{}; - if (major_comm_size > 1) { // allgather v_list_size, v_list_range[0], and v_list_range[1] - std::vector h_tmps = {v_list_size, v_list_range[0], v_list_range[1]}; + { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); rmm::device_uvector d_aggregate_tmps(major_comm_size * size_t{3}, handle.get_stream()); - raft::update_device(d_aggregate_tmps.data() + major_comm_rank * size_t{3}, - h_tmps.data(), - size_t{3}, - handle.get_stream()); - device_allgather(major_comm, - d_aggregate_tmps.data() + major_comm_rank * size_t{3}, - d_aggregate_tmps.data(), - size_t{3}, - handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + d_aggregate_tmps.begin() + major_comm_rank * size_t{3}, + d_aggregate_tmps.begin() + (major_comm_rank + 1) * size_t{3}, + [sorted_unique_vertex_first, + v_list_size, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return v_list_size; + } else if (i == 1) { + if (v_list_size > 0) { + return *sorted_unique_vertex_first; + } else { + return vertex_partition_range_first; + } + } else { + if (v_list_size > 0) { + return *(sorted_unique_vertex_first + (v_list_size - 1)) + 1; + } else { + return vertex_partition_range_first; + } + } + }); + + if (major_comm_size > 1) { // allgather v_list_size, v_list_range_first (inclusive), + // v_list_range_last (exclusive) + device_allgather(major_comm, + d_aggregate_tmps.data() + major_comm_rank * size_t{3}, + d_aggregate_tmps.data(), + size_t{3}, + handle.get_stream()); + } + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); raft::update_host(h_aggregate_tmps.data(), d_aggregate_tmps.data(), @@ -378,36 +390,122 @@ void fill_edge_minor_property(raft::handle_t const& handle, local_v_list_range_firsts[i] = h_aggregate_tmps[i * size_t{3} + 1]; local_v_list_range_lasts[i] = h_aggregate_tmps[i * size_t{3} + 2]; } - } else { - local_v_list_sizes = {v_list_size}; - local_v_list_range_firsts = {v_list_range[0]}; 
- local_v_list_range_lasts = {v_list_range[1]}; } + auto edge_partition_keys = edge_minor_property_output.keys(); + std::optional> v_list_bitmap{std::nullopt}; + std::optional> compressed_v_list{std::nullopt}; if (major_comm_size > 1) { + bool v_compressible{false}; + if constexpr (sizeof(vertex_t) > sizeof(uint32_t)) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < major_comm_size; ++i) { + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { // broadcast 32bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + double avg_fill_ratio{0.0}; for (int i = 0; i < major_comm_size; ++i) { auto num_keys = static_cast(local_v_list_sizes[i]); auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; - avg_fill_ratio += - (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; + avg_fill_ratio += (range_size > 0) + ? (static_cast(num_keys) / static_cast(range_size)) + : double{0.0}; } avg_fill_ratio /= static_cast(major_comm_size); + double threshold_ratio = + 1.0 / static_cast((v_compressible ? sizeof(uint32_t) : sizeof(vertex_t)) * 8); + auto avg_v_list_size = std::reduce(local_v_list_sizes.begin(), local_v_list_sizes.end()) / + static_cast(major_comm_size); - constexpr double threshold_ratio = - 2.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); - if (avg_fill_ratio > threshold_ratio) { - v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_vertex_first, - sorted_unique_vertex_last, - local_v_list_range_firsts[major_comm_rank], - local_v_list_range_lasts[major_comm_rank], - handle.get_stream()); + // FIXME: should I better set minimum v_list_size??? 
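+        // Rough cost model behind the heuristic below (an illustrative sketch, not a statement of
+        // measured behavior): a bitmap broadcast sends one bit per vertex in
+        // [range_first, range_last), while a vertex-list broadcast sends one (possibly 32-bit
+        // compressed) ID per listed vertex, i.e. approximately
+        //   bitmap_bytes      ~ range_size / 8
+        //   vertex_list_bytes ~ v_list_size * (v_compressible ? sizeof(uint32_t) : sizeof(vertex_t))
+        // so the bitmap becomes cheaper once the fill ratio v_list_size / range_size exceeds
+        // 1 / (8 * element size), which is what threshold_ratio encodes. The additional
+        // avg_v_list_size > packed_bool_word_bcast_alignment check presumably guards the aligned,
+        // in-place broadcast path against very small vertex lists.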
+ if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_v_list_size) > packed_bool_word_bcast_alignment)) { + if (is_packed_bool() && + !edge_partition_keys) { // directly update edge_minor_property_output (with special + // care for unaligned boundaries) + rmm::device_uvector boundary_words( + packed_bool_word_bcast_alignment, + handle.get_stream()); // for unaligned boundaries + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first) == + packed_bool_offset(graph_view.local_vertex_partition_range_first() - + minor_range_first)) && + (((local_v_list_range_firsts[major_comm_rank] - minor_range_first) % + packed_bools_per_word()) != 0)) { + leading_boundary_words = packed_bool_word_bcast_alignment; + } + thrust::fill(handle.get_thrust_policy(), + boundary_words.begin(), + boundary_words.begin() + leading_boundary_words, + packed_bool_empty_mask()); + thrust::for_each( + handle.get_thrust_policy(), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + [input, + minor_range_first, + leading_boundary_words, + word_offset_first = + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first), + output_value_first = edge_partition_value_first, + boundary_words = raft::device_span( + boundary_words.data(), boundary_words.size())] __device__(auto v) { + auto v_offset = v - minor_range_first; + auto word_offset = packed_bool_offset(v_offset); + cuda::atomic_ref word( + (word_offset - word_offset_first < leading_boundary_words) + ? 
boundary_words[word_offset - word_offset_first] + : *(output_value_first + word_offset)); + if (input) { + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + } else { + word.fetch_and(~packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + } + }); + rmm::device_uvector aggregate_boundary_words( + major_comm_size * packed_bool_word_bcast_alignment, handle.get_stream()); + device_allgather(major_comm, + boundary_words.data(), + aggregate_boundary_words.data(), + packed_bool_word_bcast_alignment, + handle.get_stream()); + v_list_bitmap = std::move(aggregate_boundary_words); + } else { + v_list_bitmap = + compute_vertex_list_bitmap_info(sorted_unique_vertex_first, + sorted_unique_vertex_last, + local_v_list_range_firsts[major_comm_rank], + local_v_list_range_lasts[major_comm_rank], + handle.get_stream()); + } + } else if (v_compressible) { + rmm::device_uvector tmps(local_v_list_sizes[major_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[major_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_v_list = std::move(tmps); } } - auto edge_partition_keys = edge_minor_property_output.keys(); - std::optional> stream_pool_indices{std::nullopt}; { size_t tmp_buffer_size_per_loop{}; @@ -415,9 +513,12 @@ void fill_edge_minor_property(raft::handle_t const& handle, if (is_packed_bool() && !edge_partition_keys && v_list_bitmap) { + tmp_buffer_size_per_loop += 0; + } else if (v_list_bitmap) { tmp_buffer_size_per_loop += packed_bool_size(local_v_list_range_lasts[i] - local_v_list_range_firsts[i]) * - sizeof(uint32_t); + sizeof(uint32_t) + + static_cast(local_v_list_sizes[i]) * sizeof(vertex_t); } else { tmp_buffer_size_per_loop += static_cast(local_v_list_sizes[i]) * sizeof(vertex_t); } @@ -434,8 +535,11 @@ void fill_edge_minor_property(raft::handle_t const& handle, } size_t num_concurrent_bcasts = stream_pool_indices ? 
(*stream_pool_indices).size() : size_t{1}; - std::cerr << "v_list_size=" << v_list_size << " v_list_range=(" << v_list_range[0] << "," - << v_list_range[1] << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() + std::cerr << "v_list_size=" << local_v_list_sizes[major_comm_rank] << " v_list_range=(" + << local_v_list_range_firsts[major_comm_rank] << "," + << local_v_list_range_lasts[major_comm_rank] + << ") v_list_bitmap.has_value()=" << v_list_bitmap.has_value() + << " compressed_v_list.has_value()=" << compressed_v_list.has_value() << " num_concurrent_bcasts=" << num_concurrent_bcasts << std::endl; std::optional> key_offsets{}; @@ -461,16 +565,27 @@ void fill_edge_minor_property(raft::handle_t const& handle, RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub0 = std::chrono::steady_clock::now(); #endif - std::vector> edge_partition_rx_bitmaps{}; - edge_partition_rx_bitmaps.reserve(loop_count); + std::vector leading_boundary_word_counts(loop_count); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - edge_partition_rx_bitmaps.push_back(rmm::device_uvector( - packed_bool_size(local_v_list_range_lasts[partition_idx] - - local_v_list_range_firsts[partition_idx]), - handle.get_stream())); + auto leading_boundary_words = + (packed_bool_word_bcast_alignment - + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bool_word_bcast_alignment) % + packed_bool_word_bcast_alignment; + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, partition_idx, minor_comm_rank); + if ((leading_boundary_words == 0) && + (packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) == + packed_bool_offset(graph_view.vertex_partition_range_first(vertex_partition_id) - + minor_range_first)) && + (((local_v_list_range_firsts[partition_idx] - minor_range_first) % + packed_bools_per_word()) != 0)) { + leading_boundary_words = packed_bool_word_bcast_alignment; + } + leading_boundary_word_counts[j] = leading_boundary_words; } - #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub1 = std::chrono::steady_clock::now(); @@ -478,57 +593,78 @@ void fill_edge_minor_property(raft::handle_t const& handle, device_group_start(major_comm); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto& rx_bitmap = edge_partition_rx_bitmaps[j]; + size_t bcast_size{0}; + vertex_t packed_bool_offset_first{0}; + if (local_v_list_range_firsts[partition_idx] < local_v_list_range_lasts[partition_idx]) { + auto leading_boundary_words = leading_boundary_word_counts[j]; + packed_bool_offset_first = + packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first) + + static_cast(leading_boundary_words); + auto packed_bool_offset_last = + packed_bool_offset(local_v_list_range_lasts[partition_idx] - 1 - minor_range_first); + if (packed_bool_offset_first <= packed_bool_offset_last) { + bcast_size = (packed_bool_offset_last - packed_bool_offset_first) + 1; + } + } + device_bcast(major_comm, - (static_cast(partition_idx) == major_comm_rank) - ? 
(*v_list_bitmap).data() - : static_cast(nullptr), - rx_bitmap.data(), - rx_bitmap.size(), + edge_partition_value_first + packed_bool_offset_first, + edge_partition_value_first + packed_bool_offset_first, + bcast_size, static_cast(partition_idx), handle.get_stream()); } device_group_end(major_comm); - if (stream_pool_indices) { handle.sync_stream(); } - #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub2 = std::chrono::steady_clock::now(); #endif - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - auto const& rx_bitmap = edge_partition_rx_bitmaps[j]; - thrust::for_each( - rmm::exec_policy_nosync(loop_stream), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(rx_bitmap.size()), - [input, - output_value_first = - edge_partition_value_first + - packed_bool_offset(local_v_list_range_firsts[partition_idx] - minor_range_first), - rx_bitmap = raft::device_span(rx_bitmap.data(), - rx_bitmap.size())] __device__(size_t i) { - if ((i == 0) || (i == (rx_bitmap.size() - 1))) { // first or last + + rmm::device_uvector d_leading_boundary_word_counts( + leading_boundary_word_counts.size(), handle.get_stream()); + raft::update_device(d_leading_boundary_word_counts.data(), + leading_boundary_word_counts.data(), + leading_boundary_word_counts.size(), + handle.get_stream()); + + rmm::device_uvector d_local_v_list_range_firsts(loop_count, handle.get_stream()); + raft::update_device(d_local_v_list_range_firsts.data(), + local_v_list_range_firsts.data() + i, + loop_count, + handle.get_stream()); + + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(loop_count * packed_bool_word_bcast_alignment), + [input, + minor_range_first, + leading_boundary_word_counts = raft::device_span( + d_leading_boundary_word_counts.data(), d_leading_boundary_word_counts.size()), + local_v_list_range_firsts = raft::device_span( + d_local_v_list_range_firsts.data(), d_local_v_list_range_firsts.size()), + aggregate_boundary_words = raft::device_span( + (*v_list_bitmap).data() + i * packed_bool_word_bcast_alignment, + loop_count * packed_bool_word_bcast_alignment), + output_value_first = edge_partition_value_first] __device__(size_t i) { + auto j = i / packed_bool_word_bcast_alignment; + auto leading_boundary_words = leading_boundary_word_counts[j]; + if ((i % packed_bool_word_bcast_alignment) < leading_boundary_words) { + auto boundary_word = aggregate_boundary_words[i]; + if (boundary_word != packed_bool_empty_mask()) { + auto word_offset = + packed_bool_offset(local_v_list_range_firsts[j] - minor_range_first) + + (i % packed_bool_word_bcast_alignment); cuda::atomic_ref word( - *(output_value_first + i)); - if (input) { - word.fetch_or(rx_bitmap[i], cuda::std::memory_order_relaxed); - } else { - word.fetch_and(~rx_bitmap[i], cuda::std::memory_order_relaxed); - } - } else { + *(output_value_first + word_offset)); if (input) { - *(output_value_first + i) |= rx_bitmap[i]; + word.fetch_or(aggregate_boundary_words[i], cuda::std::memory_order_relaxed); } else { - *(output_value_first + i) &= ~rx_bitmap[i]; + word.fetch_and(~aggregate_boundary_words[i], cuda::std::memory_order_relaxed); } } - }); - } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + } + }); #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); 
auto sub3 = std::chrono::steady_clock::now(); @@ -558,6 +694,9 @@ void fill_edge_minor_property(raft::handle_t const& handle, packed_bool_size(local_v_list_range_lasts[partition_idx] - local_v_list_range_firsts[partition_idx]), handle.get_stream()); + } else if (compressed_v_list) { + v_buffer = + rmm::device_uvector(local_v_list_sizes[partition_idx], handle.get_stream()); } else { std::get<0>(v_buffer).resize(local_v_list_sizes[partition_idx], handle.get_stream()); } @@ -577,9 +716,14 @@ void fill_edge_minor_property(raft::handle_t const& handle, auto& v_buffer = edge_partition_v_buffers[j]; if (v_list_bitmap) { device_bcast(major_comm, - (static_cast(partition_idx) == major_comm_rank) - ? (*v_list_bitmap).data() - : static_cast(nullptr), + (*v_list_bitmap).data(), + std::get<1>(v_buffer).data(), + std::get<1>(v_buffer).size(), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_v_list) { + device_bcast(major_comm, + (*compressed_v_list).data(), std::get<1>(v_buffer).data(), std::get<1>(v_buffer).size(), static_cast(partition_idx), @@ -625,20 +769,30 @@ void fill_edge_minor_property(raft::handle_t const& handle, edge_partition_v_buffers[j] = std::move(rx_vertices); } - auto const& rx_vertices = std::get<0>(edge_partition_v_buffers[j]); if (edge_partition_keys) { thrust::for_each( rmm::exec_policy_nosync(loop_stream), thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), - [rx_vertex_first = rx_vertices.begin(), + [rx_vertex_first = compressed_v_list + ? static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], input, subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[partition_idx], subrange_key_last = (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], edge_partition_value_first = edge_partition_value_first, subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { - auto minor = *(rx_vertex_first + i); + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } auto it = thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); if ((it != subrange_key_last) && (*it == minor)) { @@ -658,24 +812,52 @@ void fill_edge_minor_property(raft::handle_t const& handle, thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), [minor_range_first, - rx_vertex_first = rx_vertices.begin(), + rx_vertex_first = compressed_v_list + ? static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? 
std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], input, output_value_first = edge_partition_value_first] __device__(auto i) { - auto rx_vertex = *(rx_vertex_first + i); - auto minor_offset = rx_vertex - minor_range_first; + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto minor_offset = minor - minor_range_first; fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); }); } else { - auto map_first = thrust::make_transform_iterator( - rx_vertices.begin(), - cuda::proclaim_return_type( - [minor_range_first] __device__(auto v) { return v - minor_range_first; })); - auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(rmm::exec_policy_nosync(loop_stream), - val_first, - val_first + local_v_list_sizes[partition_idx], - map_first, - edge_partition_value_first); + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + std::get<1>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first, + range_first = + local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { + return v_offset + (range_first - minor_range_first); + })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + std::get<0>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first] __device__(auto v) { return v - minor_range_first; })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } } } } From 4122652e2691d6170fd3caa42cfaa53f08146b44 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 28 Sep 2024 22:09:08 -0700 Subject: [PATCH 091/126] update to use more than one block to process very high degree vertices --- .../detail/extract_transform_v_frontier_e.cuh | 214 ++++++++++-------- 1 file changed, 119 insertions(+), 95 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 0bcf9633e96..cb7d38fa8b3 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -335,11 +335,11 @@ __global__ static void extract_transform_v_frontier_e_mid_degree( auto major_offset = edge_partition.major_offset_from_major_nocheck(major); vertex_t const* indices{nullptr}; edge_t local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = - ((static_cast(local_out_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * + auto rounded_up_local_degree = + ((static_cast(local_degree) + (raft::warp_size() - 1)) / raft::warp_size()) * raft::warp_size(); auto call_e_op = call_e_op_t(local_out_degree)) && + if ((i < static_cast(local_degree)) && ((*edge_partition_e_mask).get(local_edge_offset + i))) { e_op_result = call_e_op(i); } @@ -369,9 +369,9 @@ __global__ 
static void extract_transform_v_frontier_e_mid_degree( buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); } } else { - for (size_t i = lane_id; i < rounded_up_local_out_degree; i += raft::warp_size()) { + for (size_t i = lane_id; i < rounded_up_local_degree; i += raft::warp_size()) { e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } + if (i < static_cast(local_degree)) { e_op_result = call_e_op(i); } warp_push_buffer_elements( buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); @@ -396,7 +396,7 @@ __global__ static void extract_transform_v_frontier_e_high_degree( typename GraphViewType::edge_type, GraphViewType::is_multi_gpu> edge_partition, KeyIterator key_first, - KeyIterator key_last, + raft::device_span key_local_degree_offsets, EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, EdgePartitionEdgeValueInputWrapper edge_partition_e_value_input, @@ -417,63 +417,61 @@ __global__ static void extract_transform_v_frontier_e_high_degree( typename EdgePartitionEdgeValueInputWrapper::value_type, EdgeOp>::type; - auto const warp_id = threadIdx.x / raft::warp_size(); + auto const tid = threadIdx.x + blockIdx.x * blockDim.x; auto const lane_id = threadIdx.x % raft::warp_size(); - auto idx = static_cast(blockIdx.x); - cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - - while (idx < static_cast(thrust::distance(key_first, key_last))) { - auto key = *(key_first + idx); - auto major = thrust_tuple_get_or_identity(key); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - vertex_t const* indices{nullptr}; - edge_t local_edge_offset{}; - edge_t local_out_degree{}; - thrust::tie(indices, local_edge_offset, local_out_degree) = - edge_partition.local_edges(major_offset); - auto rounded_up_local_out_degree = ((static_cast(local_out_degree) + - (extract_transform_v_frontier_e_kernel_block_size - 1)) / - extract_transform_v_frontier_e_kernel_block_size) * - extract_transform_v_frontier_e_kernel_block_size; + auto idx = static_cast(tid); - auto call_e_op = call_e_op_t{edge_partition, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - e_op, - key, - major_offset, - indices, - local_edge_offset}; + cuda::atomic_ref buffer_idx(*buffer_idx_ptr); - if (edge_partition_e_mask) { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if ((i < static_cast(local_out_degree)) && - ((*edge_partition_e_mask).get(local_edge_offset + i))) { - e_op_result = call_e_op(i); + auto num_edges = *(key_local_degree_offsets.rbegin()); + size_t rounded_up_num_edges = + ((static_cast(num_edges) + (raft::warp_size() - 1)) / raft::warp_size()) * + raft::warp_size(); + while (idx < rounded_up_num_edges) { + e_op_result_t e_op_result{thrust::nullopt}; + if (idx < num_edges) { + auto key_idx = thrust::distance( + key_local_degree_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, key_local_degree_offsets.begin() + 1, key_local_degree_offsets.end(), idx)); + auto key = *(key_first + key_idx); + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + vertex_t const* indices{nullptr}; + edge_t local_edge_offset{}; + edge_t local_degree{}; + thrust::tie(indices, local_edge_offset, local_degree) = + 
edge_partition.local_edges(major_offset); + + auto call_e_op = call_e_op_t{edge_partition, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + e_op, + key, + major_offset, + indices, + local_edge_offset}; + + auto e_idx = static_cast(idx - key_local_degree_offsets[key_idx]); + if (edge_partition_e_mask) { + if ((*edge_partition_e_mask).get(local_edge_offset + e_idx)) { + e_op_result = call_e_op(e_idx); } - - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - } - } else { - for (size_t i = threadIdx.x; i < rounded_up_local_out_degree; i += blockDim.x) { - e_op_result_t e_op_result{thrust::nullopt}; - if (i < static_cast(local_out_degree)) { e_op_result = call_e_op(i); } - - warp_push_buffer_elements( - buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); + } else { + e_op_result = call_e_op(e_idx); } } + warp_push_buffer_elements( + buffer_key_output_first, buffer_value_output_first, buffer_idx, lane_id, e_op_result); - idx += gridDim.x; + idx += gridDim.x * blockDim.x; } } @@ -648,6 +646,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, local_frontier_range_lasts{}; std::optional>> key_segment_offset_vectors{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { + // FIXME: combine multiple host_scalar_allgather auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); local_frontier_sizes = host_scalar_allgather( @@ -721,6 +720,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, conditional_t>, std::byte /* dummy */> frontier_bitmap{}; if constexpr (try_bitmap) { + // FIXME: 4B v_offset... 
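+    // The FIXME above appears to refer to the offset-compression idea applied later in this
+    // series: when the local frontier's vertex range fits in 32 bits, 64-bit vertex IDs can be
+    // broadcast as 32-bit offsets relative to the range start and re-expanded on the receiving
+    // side, roughly (an illustrative sketch only):
+    //   uint32_t v_offset = static_cast<uint32_t>(v - range_first);         // sender
+    //   vertex_t v        = range_first + static_cast<vertex_t>(v_offset);  // receiver
+    // which halves the broadcast payload when vertex_t is a 64-bit type.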
auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); if (minor_comm_size > 1) { @@ -839,6 +839,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto loop_count = std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); + // FIXME: ncclGroupStart,ncclGroupEnd std::conditional_t>, std::byte /* dummy */> @@ -893,12 +894,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); } } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime1 = std::chrono::steady_clock::now(); -#endif #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto subtime2 = std::chrono::steady_clock::now(); + auto subtime1 = std::chrono::steady_clock::now(); #endif std::vector> output_key_buffers{}; @@ -907,6 +905,14 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, output_value_buffers.reserve(loop_count); std::vector> output_buffer_idx_scalars{}; output_buffer_idx_scalars.reserve(loop_count); + std::optional>> key_local_degree_offset_vectors{ + std::nullopt}; + std::optional> high_segment_edge_counts{std::nullopt}; + if (key_segment_offset_vectors) { + key_local_degree_offset_vectors = std::vector>{}; + (*key_local_degree_offset_vectors).reserve(loop_count); + high_segment_edge_counts = std::vector(loop_count); + } for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; auto loop_stream = stream_pool_indices @@ -917,6 +923,19 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, edge_partition_device_view_t( graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_frontier_key_first = frontier_key_first; + auto edge_partition_frontier_key_last = frontier_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + edge_partition_frontier_key_first = + get_dataframe_buffer_begin(edge_partition_key_buffers[j]); + edge_partition_frontier_key_last = + get_dataframe_buffer_end(edge_partition_key_buffers[j]); + } + } + auto edge_partition_max_pushes = local_max_pushes; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -924,10 +943,6 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto const minor_comm_size = minor_comm.get_size(); if (minor_comm_size > 1) { if (static_cast(partition_idx) != minor_comm_rank) { - auto edge_partition_frontier_key_first = - get_dataframe_buffer_begin(edge_partition_key_buffers[j]); - auto edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_key_buffers[j]); auto edge_partition_frontier_major_first = thrust_tuple_get_or_identity( edge_partition_frontier_key_first); @@ -942,18 +957,40 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } + if (key_segment_offset_vectors) { + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + rmm::device_uvector key_local_degree_offsets(key_segment_offsets[1] + 1, + loop_stream); + key_local_degree_offsets.set_element_to_zero_async(0, loop_stream); + auto key_local_degree_first = thrust::make_transform_iterator( + edge_partition_frontier_key_first, + 
cuda::proclaim_return_type([edge_partition] __device__(auto key) { + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + key_local_degree_offsets.begin() + 1); + size_t num_edges{0}; + raft::update_host( + &num_edges, key_local_degree_offsets.data() + key_segment_offsets[1], 1, loop_stream); + // FIXME: this prevents multi-CUDA stream execution + RAFT_CUDA_TRY(cudaStreamSynchronize(loop_stream)); + (*key_local_degree_offset_vectors).push_back(std::move(key_local_degree_offsets)); + (*high_segment_edge_counts)[j] = num_edges; + } + output_key_buffers.push_back( allocate_optional_dataframe_buffer(edge_partition_max_pushes, loop_stream)); output_value_buffers.push_back( allocate_optional_dataframe_buffer(edge_partition_max_pushes, loop_stream)); output_buffer_idx_scalars.push_back(rmm::device_scalar(size_t{0}, loop_stream)); } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime3 = std::chrono::steady_clock::now(); -#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime4 = std::chrono::steady_clock::now(); + auto subtime2 = std::chrono::steady_clock::now(); #endif for (size_t j = 0; j < loop_count; ++j) { @@ -1009,19 +1046,21 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, if (key_segment_offset_vectors) { auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - if (key_segment_offsets[1] > 0) { + if ((key_segment_offsets[1] > 0) && ((*high_segment_edge_counts)[j] > 0)) { auto exec_stream = edge_partition_stream_pool_indices ? 
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) : handle.get_stream(); - raft::grid_1d_block_t update_grid(key_segment_offsets[1], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); + + raft::grid_1d_thread_t update_grid((*high_segment_edge_counts)[j], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_high_degree <<>>( edge_partition, edge_partition_frontier_key_first, - edge_partition_frontier_key_first + key_segment_offsets[1], + raft::device_span((*key_local_degree_offset_vectors)[j].data(), + (*key_local_degree_offset_vectors)[j].size()), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1121,12 +1160,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime5 = std::chrono::steady_clock::now(); -#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime6 = std::chrono::steady_clock::now(); + auto subtime3 = std::chrono::steady_clock::now(); #endif std::vector tmp_buffer_sizes(loop_count); @@ -1139,12 +1175,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, // FIXME: tmp_buffer_idx.value() implicitly synchronizes to copy the results to host tmp_buffer_sizes[j] = tmp_buffer_idx.value(loop_stream); } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime7 = std::chrono::steady_clock::now(); -#endif #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto subtime8 = std::chrono::steady_clock::now(); + auto subtime4 = std::chrono::steady_clock::now(); #endif for (size_t j = 0; j < loop_count; ++j) { @@ -1169,26 +1202,17 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, value_buffers.push_back(std::move(tmp_value_buffer)); } } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime9 = std::chrono::steady_clock::now(); -#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime10 = std::chrono::steady_clock::now(); + auto subtime5 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; std::chrono::duration subdur1 = subtime2 - subtime1; std::chrono::duration subdur2 = subtime3 - subtime2; std::chrono::duration subdur3 = subtime4 - subtime3; std::chrono::duration subdur4 = subtime5 - subtime4; - std::chrono::duration subdur5 = subtime6 - subtime5; - std::chrono::duration subdur6 = subtime7 - subtime6; - std::chrono::duration subdur7 = subtime8 - subtime7; - std::chrono::duration subdur8 = subtime9 - subtime8; - std::chrono::duration subdur9 = subtime10 - subtime9; std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," - << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," - << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," - << subdur8.count() << "," << subdur9.count() << ")" << std::endl; + << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << ")" + << std::endl; #endif } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete From 146dbafda54eabaae5c8bc24b6cfc98b1404b962 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 29 Sep 2024 19:58:10 -0700 
Subject: [PATCH 092/126] additional primitive performance optimizations --- .../detail/extract_transform_v_frontier_e.cuh | 138 ++-- .../prims/detail/per_v_transform_reduce_e.cuh | 658 ++++++++++++------ cpp/src/prims/fill_edge_src_dst_property.cuh | 4 +- cpp/tests/utilities/mg_utilities.hpp | 2 +- 4 files changed, 546 insertions(+), 256 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index cb7d38fa8b3..8bd1bb299a7 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -646,63 +646,97 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, local_frontier_range_lasts{}; std::optional>> key_segment_offset_vectors{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - // FIXME: combine multiple host_scalar_allgather auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - local_frontier_sizes = host_scalar_allgather( - minor_comm, - static_cast(thrust::distance(frontier_key_first, frontier_key_last)), - handle.get_stream()); + + size_t num_scalars = 1; // local_frontier_size if constexpr (try_bitmap) { - std::array v_list_range = {vertex_t{0}, vertex_t{0}}; - auto v_list_size = - static_cast(thrust::distance(frontier_key_first, frontier_key_last)); - if (v_list_size > 0) { - rmm::device_uvector tmps(2, handle.get_stream()); - thrust::tabulate(handle.get_thrust_policy(), - tmps.begin(), - tmps.end(), - [frontier_key_first, v_list_size] __device__(size_t i) { - return (i == 0) ? *frontier_key_first - : (*(frontier_key_first + (v_list_size - 1)) + 1); - }); - raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); - handle.sync_stream(); - } - local_frontier_range_firsts = - host_scalar_allgather(minor_comm, v_list_range[0], handle.get_stream()); - local_frontier_range_lasts = - host_scalar_allgather(minor_comm, v_list_range[1], handle.get_stream()); + num_scalars += 2; // local_frontier_range_first, local_frontier_range_last } + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + minor_comm_rank * num_scalars, + d_aggregate_tmps.begin() + minor_comm_rank * num_scalars + (try_bitmap ? 
3 : 1), + [frontier_key_first, + v_list_size = static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if constexpr (try_bitmap) { + if (i == 0) { + return v_list_size; + } else if (i == 1) { + vertex_t first{}; + if (v_list_size > 0) { + first = *frontier_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else { + assert(i == 2); + vertex_t last{}; + if (v_list_size > 0) { + last = *(frontier_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + } else { + assert(i == 0); + return v_list_size; + } + }); if (key_segment_offsets) { - rmm::device_uvector d_key_segment_offsets((*key_segment_offsets).size(), - handle.get_stream()); - raft::update_device(d_key_segment_offsets.data(), - (*key_segment_offsets).data(), - (*key_segment_offsets).size(), - handle.get_stream()); - rmm::device_uvector d_aggregate_key_segment_offsets( - minor_comm_size * d_key_segment_offsets.size(), handle.get_stream()); - std::vector rx_counts(minor_comm_size, d_key_segment_offsets.size()); - std::vector rx_displacements(minor_comm_size); - std::exclusive_scan(rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); - device_allgatherv(minor_comm, - d_key_segment_offsets.data(), - d_aggregate_key_segment_offsets.data(), - rx_counts, - rx_displacements, - handle.get_stream()); - std::vector h_aggregate_key_segment_offsets(d_aggregate_key_segment_offsets.size()); - raft::update_host(h_aggregate_key_segment_offsets.data(), - d_aggregate_key_segment_offsets.data(), - d_aggregate_key_segment_offsets.size(), - handle.get_stream()); - handle.sync_stream(); - key_segment_offset_vectors = std::vector>(minor_comm_size); - for (int i = 0; i < minor_comm_size; ++i) { - (*key_segment_offset_vectors)[i] = std::vector( - h_aggregate_key_segment_offsets.begin() + i * (*key_segment_offsets).size(), - h_aggregate_key_segment_offsets.begin() + (i + 1) * (*key_segment_offsets).size()); + raft::update_device( + d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 
3 : 1)), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + local_frontier_sizes = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_frontier_range_firsts = std::vector(minor_comm_size); + local_frontier_range_lasts = std::vector(minor_comm_size); + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + for (int i = 0; i < minor_comm_size; ++i) { + local_frontier_sizes[i] = h_aggregate_tmps[i * num_scalars]; + if constexpr (try_bitmap) { + local_frontier_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 1]); + local_frontier_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 2]); + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back(h_aggregate_tmps.begin() + (i * num_scalars + (try_bitmap ? 3 : 1)), + h_aggregate_tmps.begin() + + (i * num_scalars + (try_bitmap ? 3 : 1) + (*key_segment_offsets).size())); } } } else { diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index e17c3ae5fd0..2aab5cf3200 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -957,18 +957,14 @@ __host__ __device__ int priority_to_rank( } } -// return selected ranks if root. -// otherwise, it is sufficient to just return bool flags indiciating whether this rank's values are -// selected or not. template -std::variant /* root */, std::optional>> -compute_selected_ranks( +rmm::device_uvector compute_priorities( raft::comms::comms_t const& comm, ValueIterator value_first, - ValueIterator value_last, std::optional> hypersparse_key_offsets, // we may not have values for the entire "range_size" if // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, size_t range_size, int root, int subgroup_size /* faster interconnect within a subgroup */, @@ -979,13 +975,6 @@ compute_selected_ranks( auto const comm_rank = comm.get_rank(); auto const comm_size = comm.get_size(); - assert(hypersparse_key_offsets.has_value() || - static_cast(thrust::distance(value_first, value_last)) == - range_size); // we should have for the entire "range_size" if - // hypersparse_key_offsets.has_value() is false - auto contiguous_size = static_cast(thrust::distance(value_first, value_last)) - - (hypersparse_key_offsets ? (*hypersparse_key_offsets).size() : size_t{0}); - // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are // more than one, the GPU with (comm_rank == root) has the highest priority, the GPUs in the same // DGX node should be the next) @@ -1031,12 +1020,30 @@ compute_selected_ranks( is_not_equal_t::value_type>{init}); } } - device_allreduce(comm, - priorities.data(), - priorities.data(), - priorities.size(), - raft::comms::op_t::MIN, - stream_view); + + return priorities; +} + +// return selected ranks if root. +// otherwise, it is sufficient to just return bool flags indiciating whether this rank's values are +// selected or not. 
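+// A descriptive sketch of the priority-based reduction these helpers implement (the precise
+// mapping between ranks and priorities follows priority_to_rank() above):
+//   1. compute_priorities() assigns each output position a priority -- the root rank gets the
+//      highest priority (the smallest value, since the reduction takes the minimum), ranks
+//      within the same subgroup (faster interconnect) come next, and positions whose local
+//      value equals init are effectively excluded (they only win if no rank has a real value),
+//   2. an allreduce(MIN) over the priority arrays then selects exactly one contributing rank
+//      per position, and
+//   3. compute_selected_ranks_from_priorities() converts the winning priorities back to ranks
+//      on the root (for the subsequent gather) or to per-position keep flags on the other
+//      ranks, so that only the selected values need to be communicated.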
+template +std::variant /* root */, std::optional>> +compute_selected_ranks_from_priorities( + raft::comms::comms_t const& comm, + raft::device_span priorities, + std::optional> + hypersparse_key_offsets, // we may not have values for the entire "range_size" if + // hypersparse_key_offsets.has_value() is true + size_t contiguous_size, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + bool ignore_local_values, + rmm::cuda_stream_view stream_view) +{ + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + if (comm_rank == root) { rmm::device_uvector selected_ranks(priorities.size(), stream_view); auto offset_priority_pair_first = @@ -1059,13 +1066,16 @@ compute_selected_ranks( std::optional> keep_flags{std::nullopt}; if (!ignore_local_values) { keep_flags = rmm::device_uvector( - packed_bool_size(thrust::distance(value_first, value_last)), stream_view); - auto offset_priority_pair_first = - thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); + packed_bool_size(hypersparse_key_offsets + ? (contiguous_size + (*hypersparse_key_offsets).size()) + : contiguous_size), + stream_view); thrust::fill(rmm::exec_policy_nosync(stream_view), (*keep_flags).begin(), (*keep_flags).end(), packed_bool_empty_mask()); + auto offset_priority_pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); thrust::for_each( rmm::exec_policy_nosync(stream_view), offset_priority_pair_first, @@ -1088,13 +1098,13 @@ compute_selected_ranks( } }); if (hypersparse_key_offsets) { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + (*hypersparse_key_offsets).begin()); thrust::for_each( rmm::exec_policy_nosync(stream_view), - thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), - (*hypersparse_key_offsets).begin()), - thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), - (*hypersparse_key_offsets).begin()) + - (*hypersparse_key_offsets).size(), + pair_first, + pair_first + (*hypersparse_key_offsets).size(), [priorities = raft::device_span(priorities.data(), priorities.size()), keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), root, @@ -1550,7 +1560,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, static_cast(minor_comm_rank)); - // FIXME: we may filter zero local degree vertices first per_v_transform_reduce_e_edge_partition( handle, edge_partition, @@ -1661,63 +1670,99 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, key_segment_offset_vectors{}; if constexpr (use_input_key) { if constexpr (GraphViewType::is_multi_gpu) { + // FIMXE: refactor this code (create host_scalar_array_allgather) auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - local_key_list_sizes = host_scalar_allgather( - minor_comm, - static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), - handle.get_stream()); + + size_t num_scalars = 1; // local_key_list_size if constexpr (try_bitmap) { - std::array v_list_range = {vertex_t{0}, vertex_t{0}}; - auto v_list_size = static_cast( - thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); - if (v_list_size > 0) { - 
rmm::device_uvector tmps(2, handle.get_stream()); - thrust::tabulate(handle.get_thrust_policy(), - tmps.begin(), - tmps.end(), - [sorted_unique_key_first, v_list_size] __device__(size_t i) { - return (i == 0) ? *sorted_unique_key_first - : (*(sorted_unique_key_first + (v_list_size - 1)) + 1); - }); - raft::update_host(v_list_range.data(), tmps.data(), 2, handle.get_stream()); - handle.sync_stream(); - } - local_key_list_range_firsts = - host_scalar_allgather(minor_comm, v_list_range[0], handle.get_stream()); - local_key_list_range_lasts = - host_scalar_allgather(minor_comm, v_list_range[1], handle.get_stream()); + num_scalars += 2; // local_key_list_range_first, local_key_list_range_last } + if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + minor_comm_rank * num_scalars, + d_aggregate_tmps.begin() + minor_comm_rank * num_scalars + (try_bitmap ? 3 : 1), + [sorted_unique_key_first, + v_list_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if constexpr (try_bitmap) { + if (i == 0) { + return v_list_size; + } else if (i == 1) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_key_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else { + assert(i == 2); + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_key_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + } else { + assert(i == 0); + return v_list_size; + } + }); if (key_segment_offsets) { - rmm::device_uvector d_key_segment_offsets((*key_segment_offsets).size(), - handle.get_stream()); - raft::update_device(d_key_segment_offsets.data(), - (*key_segment_offsets).data(), - (*key_segment_offsets).size(), - handle.get_stream()); - rmm::device_uvector d_aggregate_key_segment_offsets( - minor_comm_size * d_key_segment_offsets.size(), handle.get_stream()); - std::vector rx_counts(minor_comm_size, d_key_segment_offsets.size()); - std::vector rx_displacements(minor_comm_size); - std::exclusive_scan( - rx_counts.begin(), rx_counts.end(), rx_displacements.begin(), size_t{0}); - device_allgatherv(minor_comm, - d_key_segment_offsets.data(), - d_aggregate_key_segment_offsets.data(), - rx_counts, - rx_displacements, - handle.get_stream()); - std::vector h_aggregate_key_segment_offsets(d_aggregate_key_segment_offsets.size()); - raft::update_host(h_aggregate_key_segment_offsets.data(), - d_aggregate_key_segment_offsets.data(), - d_aggregate_key_segment_offsets.size(), - handle.get_stream()); - handle.sync_stream(); - key_segment_offset_vectors = std::vector>(minor_comm_size); - for (int i = 0; i < minor_comm_size; ++i) { - (*key_segment_offset_vectors)[i] = std::vector( - h_aggregate_key_segment_offsets.begin() + i * (*key_segment_offsets).size(), - h_aggregate_key_segment_offsets.begin() + (i + 1) * (*key_segment_offsets).size()); + raft::update_device( + d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 
3 : 1)), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); + } + + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } + + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + local_key_list_sizes = std::vector(minor_comm_size); + if constexpr (try_bitmap) { + local_key_list_range_firsts = std::vector(minor_comm_size); + local_key_list_range_lasts = std::vector(minor_comm_size); + } + if (key_segment_offsets) { + key_segment_offset_vectors = std::vector>{}; + (*key_segment_offset_vectors).reserve(minor_comm_size); + } + for (int i = 0; i < minor_comm_size; ++i) { + local_key_list_sizes[i] = h_aggregate_tmps[i * num_scalars]; + if constexpr (try_bitmap) { + local_key_list_range_firsts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 1]); + local_key_list_range_lasts[i] = + static_cast(h_aggregate_tmps[i * num_scalars + 2]); + } + if (key_segment_offsets) { + (*key_segment_offset_vectors) + .emplace_back(h_aggregate_tmps.begin() + i * num_scalars + (try_bitmap ? 3 : 1), + h_aggregate_tmps.begin() + i * num_scalars + (try_bitmap ? 3 : 1) + + (*key_segment_offsets).size()); } } } else { @@ -1730,16 +1775,34 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // 6. compute optional bitmap info + // 6. compute optional bitmap info & compressed vertex list std:: conditional_t>, std::byte /* dummy */> - key_list_bitmap{}; + v_list_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_v_list{}; if constexpr (try_bitmap) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); if (minor_comm_size > 1) { auto const minor_comm_rank = minor_comm.get_rank(); + + bool v_compressible{false}; + if constexpr (sizeof(vertex_t) > sizeof(uint32_t)) { + vertex_t local_v_list_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_key_list_range_lasts[i] - local_key_list_range_firsts[i]; + local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); + } + if (local_v_list_max_range_size <= + std::numeric_limits::max()) { // broadcast 32bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + double avg_fill_ratio{0.0}; for (int i = 0; i < minor_comm_size; ++i) { auto num_keys = static_cast(local_key_list_sizes[i]); @@ -1749,15 +1812,27 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } avg_fill_ratio /= static_cast(minor_comm_size); - constexpr double threshold_ratio = - 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); + double threshold_ratio = + 2.0 /* tuning parameter (consider that we need to reprodce vertex list from bitmap)*/ / + static_cast((v_compressible ? 
sizeof(uint32_t) : sizeof(vertex_t)) * 8); if (avg_fill_ratio > threshold_ratio) { - key_list_bitmap = + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, sorted_unique_nzd_key_last, local_key_list_range_firsts[minor_comm_rank], local_key_list_range_lasts[minor_comm_rank], handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_key_list_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + sorted_unique_key_first, + sorted_unique_nzd_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_key_list_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_v_list = std::move(tmps); } } } @@ -1770,9 +1845,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); - auto max_tmp_buffer_size = static_cast( - static_cast(handle.get_device_properties().totalGlobalMem) * 0.05); - size_t approx_tmp_buffer_size_per_edge_partition{0}; + size_t tmp_buffer_size_per_loop{0}; // FIXME: need to review this logic if constexpr (update_major) { size_t key_size{0}; if constexpr (use_input_key) { @@ -1800,15 +1873,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } else { aggregate_major_range_size = graph_view.number_of_vertices(); } - approx_tmp_buffer_size_per_edge_partition = + tmp_buffer_size_per_loop = (aggregate_major_range_size / comm_size) * (key_size + value_size); } - stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, - approx_tmp_buffer_size_per_edge_partition, - graph_view.number_of_local_edge_partitions(), - max_segments, - handle.get_stream_pool_size()); + stream_pool_indices = init_stream_pool_indices( + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * + 0.05), + tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + max_segments, + handle.get_stream_pool_size()); if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } } } @@ -1847,7 +1922,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, decltype(minor_tmp_buffer->mutable_view().value_first())>, void /* dummy */>; - if (stream_pool_indices) { handle.sync_stream(); } + if constexpr (!GraphViewType::is_multi_gpu || !use_input_key) { + if (stream_pool_indices) { handle.sync_stream(); } + } // 9. proces local edge partitions @@ -1858,6 +1935,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime0 = std::chrono::steady_clock::now(); + auto subtime1 = std::chrono::steady_clock::now(); + auto subtime2 = std::chrono::steady_clock::now(); #endif auto loop_count = std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); @@ -1872,6 +1951,63 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const minor_comm_size = minor_comm.get_size(); edge_partition_key_buffers.reserve(loop_count); + std::optional>> edge_partition_tmp_buffers{ + std::nullopt}; + if (v_list_bitmap || compressed_v_list) { + edge_partition_tmp_buffers = std::vector>{}; + (*edge_partition_tmp_buffers).reserve(loop_count); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if (v_list_bitmap || compressed_v_list) { + (*edge_partition_tmp_buffers) + .emplace_back(v_list_bitmap + ? 
packed_bool_size(local_key_list_range_lasts[partition_idx] - + local_key_list_range_firsts[partition_idx]) + : local_key_list_sizes[partition_idx], + handle.get_stream()); + } else { + edge_partition_key_buffers.emplace_back(local_key_list_sizes[partition_idx], + handle.get_stream()); + } + } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + handle.sync_stream(); + subtime1 = std::chrono::steady_clock::now(); +#endif + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if (v_list_bitmap) { + device_bcast(minor_comm, + (*v_list_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_tmp_buffers)[j]), + size_dataframe_buffer((*edge_partition_tmp_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_v_list) { + device_bcast(minor_comm, + (*compressed_v_list).data(), + get_dataframe_buffer_begin((*edge_partition_tmp_buffers)[j]), + size_dataframe_buffer((*edge_partition_tmp_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (stream_pool_indices) { handle.sync_stream(); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + subtime2 = std::chrono::steady_clock::now(); +#endif for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; @@ -1879,54 +2015,50 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - auto edge_partition_key_buffer = allocate_dataframe_buffer( - minor_comm_size > 1 ? local_key_list_sizes[partition_idx] : size_t{0}, loop_stream); - if (size_dataframe_buffer(edge_partition_key_buffer) > 0) { - if constexpr (try_bitmap) { - std::variant, decltype(sorted_unique_key_first)> - v_list{}; - if (key_list_bitmap) { - v_list = (static_cast(partition_idx) == minor_comm_rank) - ? 
raft::device_span((*key_list_bitmap).data(), - (*key_list_bitmap).size()) - : raft::device_span(static_cast(nullptr), - size_t{0}); + bool process_local_edges{true}; + if constexpr (filter_input_key) { + process_local_edges = (static_cast(partition_idx) != minor_comm_rank); + } + + if (process_local_edges) { + if (v_list_bitmap || compressed_v_list) { + rmm::device_uvector rx_vertices(local_key_list_sizes[partition_idx], + loop_stream); + auto const& rx_tmps = (*edge_partition_tmp_buffers)[j]; + if (v_list_bitmap) { + rmm::device_scalar dummy(size_t{0}, loop_stream); + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_tmps.data(), rx_tmps.size()), + rx_vertices.begin(), + raft::device_span(dummy.data(), size_t{1}), + local_key_list_range_firsts[partition_idx], + local_key_list_range_lasts[partition_idx], + loop_stream); } else { - v_list = sorted_unique_key_first; + thrust::transform( + rmm::exec_policy_nosync(loop_stream), + rx_tmps.begin(), + rx_tmps.end(), + rx_vertices.begin(), + cuda::proclaim_return_type( + [range_first = local_key_list_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return static_cast(range_first + v_offset); })); } - device_bcast_vertex_list(minor_comm, - v_list, - get_dataframe_buffer_begin(edge_partition_key_buffer), - local_key_list_range_firsts[partition_idx], - local_key_list_range_lasts[partition_idx], - local_key_list_sizes[partition_idx], - static_cast(partition_idx), - loop_stream); - } else { - device_bcast(minor_comm, - sorted_unique_key_first, - get_dataframe_buffer_begin(edge_partition_key_buffer), - local_key_list_sizes[partition_idx], - static_cast(partition_idx), - loop_stream); + edge_partition_key_buffers.push_back(std::move(rx_vertices)); } - if constexpr (filter_input_key) { - auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); - if (!process_local_edges) { - resize_dataframe_buffer(edge_partition_key_buffer, 0, loop_stream); - shrink_to_fit_dataframe_buffer(edge_partition_key_buffer, loop_stream); - } + } else { + if (v_list_bitmap || compressed_v_list) { + edge_partition_key_buffers.emplace_back(0, loop_stream); + } else { + resize_dataframe_buffer(edge_partition_key_buffers[j], 0, loop_stream); + shrink_to_fit_dataframe_buffer(edge_partition_key_buffers[j], loop_stream); } } - edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); } } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime1 = std::chrono::steady_clock::now(); -#endif #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto subtime2 = std::chrono::steady_clock::now(); + auto subtime3 = std::chrono::steady_clock::now(); #endif std::conditional_t(buffer_size, loop_stream)); } } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime3 = std::chrono::steady_clock::now(); -#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime4 = std::chrono::steady_clock::now(); @@ -2282,10 +2411,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } if constexpr (std::is_same_v>) { - std::vector< - std::variant, std::optional>>> - edge_partition_selected_ranks_or_flags{}; - edge_partition_selected_ranks_or_flags.reserve(loop_count); + std::vector, + rmm::device_uvector, + rmm::device_uvector>> + edge_partition_priorities{}; + edge_partition_priorities.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { auto 
partition_idx = i + j; auto loop_stream = stream_pool_indices @@ -2314,37 +2444,159 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, range_size = size_dataframe_buffer(output_buffer); } + auto contiguous_size = + hypersparse_key_offsets + ? (size_dataframe_buffer(output_buffer) - (*hypersparse_key_offsets).size()) + : range_size; + + std::variant, + rmm::device_uvector, + rmm::device_uvector> + priorities = rmm::device_uvector(0, loop_stream); if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t - auto selected_ranks_or_flags = compute_selected_ranks( + priorities = compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), - get_dataframe_buffer_end(output_buffer), hypersparse_key_offsets, + contiguous_size, range_size, static_cast(partition_idx), subgroup_size, init, process_local_edges ? false : true /* ignore_local_values */, loop_stream); - edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); } else if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint16_t - CUGRAPH_FAIL( - "unimplemented."); // currently, raft does not support allreduce on uint16_t. - } else { // priority_t == uint32_t - auto selected_ranks_or_flags = compute_selected_ranks( + priorities = compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), - get_dataframe_buffer_end(output_buffer), hypersparse_key_offsets, + contiguous_size, range_size, static_cast(partition_idx), subgroup_size, init, process_local_edges ? false : true /* ignore_local_values */, loop_stream); - edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); + } else { // priority == uint32_t + priorities = compute_priorities( + minor_comm, + get_dataframe_buffer_begin(output_buffer), + hypersparse_key_offsets, + contiguous_size, + range_size, + static_cast(partition_idx), + subgroup_size, + init, + process_local_edges ? false : true /* ignore_local_values */, + loop_stream); + } + edge_partition_priorities.push_back(std::move(priorities)); + } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto& priorities = edge_partition_priorities[j]; + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + device_allreduce(minor_comm, + std::get<0>(priorities).data(), + std::get<0>(priorities).data(), + std::get<0>(priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } else if (minor_comm_size <= + std::numeric_limits::max()) { // priority == uint16_t + CUGRAPH_FAIL( + "unimplemented."); // currently, raft does not support allreduce on uint16_t. + } else { // priority == uint32_t + device_allreduce(minor_comm, + std::get<2>(priorities).data(), + std::get<2>(priorities).data(), + std::get<2>(priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (stream_pool_indices) { handle.sync_stream(); } + + std::vector< + std::variant, std::optional>>> + edge_partition_selected_ranks_or_flags{}; + edge_partition_selected_ranks_or_flags.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + bool process_local_edges = true; + if constexpr (filter_input_key) { + if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } + } + + auto const& output_buffer = major_output_buffers[j]; + std::optional> hypersparse_key_offsets{std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + hypersparse_key_offsets = raft::device_span( + (*edge_partition_hypersparse_key_offset_vectors)[j].data(), + (*edge_partition_hypersparse_key_offset_vectors)[j].size()); + } + } + + size_t range_size{0}; + if constexpr (filter_input_key) { + range_size = local_key_list_sizes[partition_idx]; + } else { + range_size = size_dataframe_buffer(output_buffer); + } + + auto contiguous_size = + hypersparse_key_offsets + ? (size_dataframe_buffer(output_buffer) - (*hypersparse_key_offsets).size()) + : range_size; + + auto& priorities = edge_partition_priorities[j]; + std::variant, std::optional>> + selected_ranks_or_flags = rmm::device_uvector(0, loop_stream); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + selected_ranks_or_flags = compute_selected_ranks_from_priorities( + minor_comm, + raft::device_span(std::get<0>(priorities).data(), + std::get<0>(priorities).size()), + hypersparse_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges ? false : true /* ignore_local_values */, + loop_stream); + } else if (minor_comm_size <= + std::numeric_limits::max()) { // priority == uint16_t + selected_ranks_or_flags = compute_selected_ranks_from_priorities( + minor_comm, + raft::device_span(std::get<1>(priorities).data(), + std::get<1>(priorities).size()), + hypersparse_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges ? false : true /* ignore_local_values */, + loop_stream); + } else { // priority_t == uint32_t + selected_ranks_or_flags = compute_selected_ranks_from_priorities( + minor_comm, + raft::device_span(std::get<2>(priorities).data(), + std::get<2>(priorities).size()), + hypersparse_key_offsets, + contiguous_size, + static_cast(partition_idx), + subgroup_size, + process_local_edges ? false : true /* ignore_local_values */, + loop_stream); } + edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); if constexpr (filter_input_key) { if (edge_partition_hypersparse_key_offset_vectors) { @@ -2427,6 +2679,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #endif } + // FIXME: combine count & copy_if??? 
std::vector> edge_partition_values{}; edge_partition_values.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { @@ -2508,23 +2761,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, for (size_t j = 0; j < loop_count; ++j) { h_value_buffer_sizes[j] = size_dataframe_buffer(edge_partition_values[j]); } - rmm::device_uvector d_value_buffer_sizes(loop_count, handle.get_stream()); - raft::update_device(d_value_buffer_sizes.data(), + rmm::device_uvector d_aggregate_value_buffer_sizes(minor_comm_size * loop_count, + handle.get_stream()); + raft::update_device(d_aggregate_value_buffer_sizes.data() + minor_comm_rank * loop_count, h_value_buffer_sizes.data(), h_value_buffer_sizes.size(), handle.get_stream()); - rmm::device_uvector d_aggregate_value_buffer_sizes(minor_comm_size * loop_count, - handle.get_stream()); - std::vector tmp_rx_sizes(minor_comm_size, loop_count); - std::vector tmp_rx_displs = std::vector(minor_comm_size); - std::exclusive_scan( - tmp_rx_sizes.begin(), tmp_rx_sizes.end(), tmp_rx_displs.begin(), size_t{0}); - device_allgatherv(minor_comm, - d_value_buffer_sizes.data(), - d_aggregate_value_buffer_sizes.data(), - tmp_rx_sizes, - tmp_rx_displs, - handle.get_stream()); + device_allgather(minor_comm, + d_aggregate_value_buffer_sizes.data() + minor_comm_rank * loop_count, + d_aggregate_value_buffer_sizes.data(), + loop_count, + handle.get_stream()); if (static_cast(minor_comm_rank / num_concurrent_loops) == (i / num_concurrent_loops)) { std::vector h_aggregate_value_buffer_sizes( @@ -2549,18 +2796,15 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime13 = std::chrono::steady_clock::now(); #endif - handle.sync_stream(); #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + handle.sync_stream(); auto subtime14 = std::chrono::steady_clock::now(); #endif + device_group_start(minor_comm); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - auto& values = edge_partition_values[j]; + auto& values = edge_partition_values[j]; if (minor_comm_rank == static_cast(partition_idx)) { device_gatherv(minor_comm, @@ -2570,7 +2814,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, *rx_sizes, *rx_displs, static_cast(partition_idx), - loop_stream); + handle.get_stream()); } else { device_gatherv(minor_comm, get_dataframe_buffer_begin(values), @@ -2579,47 +2823,60 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::vector{}, std::vector{}, static_cast(partition_idx), - loop_stream); + handle.get_stream()); } - - resize_dataframe_buffer(values, 0, loop_stream); - shrink_to_fit_dataframe_buffer(values, loop_stream); } + device_group_end(minor_comm); #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime15 = std::chrono::steady_clock::now(); #endif + handle.sync_stream(); // this is required before edge_partition_values.clear(); + edge_partition_values.clear(); + if (stream_pool_indices) { + handle.sync_stream_pool(*stream_pool_indices); + } // to ensure that memory is freed #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } auto subtime16 = std::chrono::steady_clock::now(); #endif if (rx_values && (size_dataframe_buffer(*rx_values) > 0)) { - auto j = static_cast(minor_comm_rank % num_concurrent_loops); - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - // FIXME: we can use 8 bit integer for ranks (and 32 bit integers for rx_offsets) to cut - // sort time significantly + auto j = static_cast(minor_comm_rank % num_concurrent_loops); auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); - rmm::device_uvector rx_offsets(selected_ranks.size(), loop_stream); - thrust::sequence(rmm::exec_policy_nosync(loop_stream), - rx_offsets.begin(), - rx_offsets.end(), - vertex_t{0}); - thrust::stable_sort_by_key(rmm::exec_policy_nosync(loop_stream), - selected_ranks.begin(), - selected_ranks.end(), - rx_offsets.begin()); - // selected_ranks[] == comm_size if no GPU in minor_comm has a non-init value - rx_offsets.resize((*rx_displs).back() + (*rx_sizes).back(), loop_stream); - thrust::scatter(rmm::exec_policy_nosync(loop_stream), - get_dataframe_buffer_begin(*rx_values), - get_dataframe_buffer_end(*rx_values), - rx_offsets.begin(), - tmp_vertex_value_output_first); + // FIXME: we may use 8 bit ranks to further cut sort time + if (selected_ranks.size() <= std::numeric_limits::max()) { + rmm::device_uvector rx_offsets(selected_ranks.size(), handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_offsets.begin(), rx_offsets.end(), uint32_t{0}); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + selected_ranks.begin(), + selected_ranks.end(), + rx_offsets.begin()); + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_offsets.resize((*rx_displs).back() + (*rx_sizes).back(), handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_offsets.begin(), + tmp_vertex_value_output_first); + } + else { + rmm::device_uvector rx_offsets(selected_ranks.size(), handle.get_stream()); + thrust::sequence( + handle.get_thrust_policy(), rx_offsets.begin(), rx_offsets.end(), size_t{0}); + 
thrust::stable_sort_by_key(handle.get_thrust_policy(), + selected_ranks.begin(), + selected_ranks.end(), + rx_offsets.begin()); + // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value + rx_offsets.resize((*rx_displs).back() + (*rx_sizes).back(), handle.get_stream()); + thrust::scatter(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + rx_offsets.begin(), + tmp_vertex_value_output_first); + } } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + handle.sync_stream(); #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime17 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; @@ -2648,6 +2905,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, << std::endl; #endif } else { + // FIXME: better place this inside device_group_start() & device_group_end(); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; auto loop_stream = stream_pool_indices diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index d9a198b4152..167147afcce 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -729,8 +729,6 @@ void fill_edge_minor_property(raft::handle_t const& handle, static_cast(partition_idx), handle.get_stream()); } else { - // FIXME: we may better send 32 bit vertex offsets if [local_v_list_range_firsts[], - // local_v_list_range_lasts[]) fit into unsigned 32 bit integer device_bcast(major_comm, (static_cast(partition_idx) == major_comm_rank) ? sorted_unique_vertex_first @@ -838,7 +836,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, [minor_range_first, range_first = local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { - return v_offset + (range_first - minor_range_first); + return static_cast(v_offset + (range_first - minor_range_first)); })); auto val_first = thrust::make_constant_iterator(input); thrust::scatter(rmm::exec_policy_nosync(loop_stream), diff --git a/cpp/tests/utilities/mg_utilities.hpp b/cpp/tests/utilities/mg_utilities.hpp index 9f98245387d..a9a1d12417e 100644 --- a/cpp/tests/utilities/mg_utilities.hpp +++ b/cpp/tests/utilities/mg_utilities.hpp @@ -29,7 +29,7 @@ void finalize_mpi(); int query_mpi_comm_world_rank(); int query_mpi_comm_world_size(); -std::unique_ptr initialize_mg_handle(size_t pool_size = 64); +std::unique_ptr initialize_mg_handle(size_t pool_size = 128); // NCCL lazily initializes for P2P, and this enforces P2P initialization for better performance // measurements From 1ddb533c0f1b5205850de11d1ccd4c8d9bbfecc6 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 30 Sep 2024 23:10:01 -0700 Subject: [PATCH 093/126] bug fix(when major_comm_size == 1) --- cpp/src/structure/renumber_edgelist_impl.cuh | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/structure/renumber_edgelist_impl.cuh b/cpp/src/structure/renumber_edgelist_impl.cuh index 75cc316b8c1..7616b94f0c5 100644 --- a/cpp/src/structure/renumber_edgelist_impl.cuh +++ b/cpp/src/structure/renumber_edgelist_impl.cuh @@ -512,8 +512,6 @@ std::tuple, std::vector, vertex_t> compu this_bin_sorted_unique_minors = shuffle_and_unique_segment_sorted_values( major_comm, this_bin_sorted_unique_minors.begin(), h_tx_counts, handle.get_stream()); #endif - } else { - this_bin_sorted_unique_minors = std::move(edge_partition_tmp_minors[0]); } } } From c741f2b463e8472e3e7d88db0808e4a6aa8a9f64 Mon 
Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Tue, 1 Oct 2024 16:25:06 -0700 Subject: [PATCH 094/126] store hypersparse-segment degree offsets as graph metadata --- cpp/include/cugraph/graph.hpp | 11 +- cpp/include/cugraph/graph_functions.hpp | 2 + cpp/include/cugraph/graph_view.hpp | 40 ++- cpp/include/cugraph/utilities/misc_utils.cuh | 2 +- .../create_graph_from_edgelist_impl.cuh | 15 +- cpp/src/structure/graph_impl.cuh | 31 ++- cpp/src/structure/graph_view_impl.cuh | 4 +- cpp/src/structure/renumber_edgelist_impl.cuh | 240 +++++++++--------- 8 files changed, 208 insertions(+), 137 deletions(-) diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 0607b39153d..2be77f57e40 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -48,6 +48,7 @@ struct graph_meta_t> { partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; vertex_t num_local_unique_edge_srcs{}; vertex_t num_local_unique_edge_dsts{}; @@ -61,6 +62,7 @@ struct graph_meta_t> { // segment offsets based on vertex degree, relevant only if vertex IDs are renumbered std::optional> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_t is an owning graph class (note that graph_view_t is a non-owning graph class) @@ -207,6 +209,7 @@ class graph_tproperties_, partition_, edge_partition_segment_offsets_, + edge_partition_hypersparse_degree_offsets_, local_sorted_unique_edge_srcs, local_sorted_unique_edge_src_chunk_start_offsets, local_sorted_unique_edge_src_chunk_size_, @@ -228,6 +231,7 @@ class graph_t edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store row/column properties in key/value pairs (this saves memory if # unique edge // sources/destinations << V / major_comm_size|minor_comm_size). 
@@ -290,7 +294,11 @@ class graph_t(offsets_.data(), offsets_.size()), raft::device_span(indices_.data(), indices_.size()), graph_view_meta_t{ - this->number_of_vertices(), this->number_of_edges(), this->properties_, segment_offsets_}); + this->number_of_vertices(), + this->number_of_edges(), + this->properties_, + segment_offsets_, + hypersparse_degree_offsets_}); } private: @@ -299,6 +307,7 @@ class graph_t> segment_offsets_{}; + std::optional> hypersparse_degree_offsets_{}; }; template diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 099d7003b3c..4ec29dec363 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -41,11 +41,13 @@ struct renumber_meta_t> edge_t number_of_edges{}; partition_t partition{}; std::vector edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; }; template struct renumber_meta_t> { std::vector segment_offsets{}; + std::optional> hypersparse_degree_offsets{}; }; /** diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index f535e35c785..e026d7a48bc 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -243,15 +243,20 @@ namespace detail { // use (key, value) pairs to store source/destination properties if (unique edge // sources/destinations) over (V / major_comm_size|minor_comm_size) is smaller than the threshold // value -double constexpr edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold = 0.0; // FIXME: just for benchmarking +double constexpr edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold = + 0.0; // FIXME: just for benchmarking // FIXME: threshold values require tuning // use the hypersparse format (currently, DCSR or DCSC) for the vertices with their degrees smaller // than minor_comm_size * hypersparse_threshold_ratio, should be less than 1.0 double constexpr hypersparse_threshold_ratio = 0.5; -size_t constexpr low_degree_threshold{raft::warp_size()}; -size_t constexpr mid_degree_threshold{1024}; -size_t constexpr num_sparse_segments_per_vertex_partition{3}; +size_t constexpr low_degree_threshold{ + raft::warp_size()}; // belongs to the low degree segment if the global degree is smaller than + // this value. +size_t constexpr mid_degree_threshold{ + 1024}; // belongs to the medium degree segment if the global degree is smaller than this value, + // otherwise, belongs to the high degree segment. +size_t constexpr num_sparse_segments_per_vertex_partition{3}; // high, mid, low // Common for both graph_view_t & graph_t and both single-GPU & multi-GPU versions template @@ -313,6 +318,7 @@ struct graph_view_meta_t edge_partition_segment_offsets{}; + std::optional> edge_partition_hypersparse_degree_offsets{}; std::conditional_t>, @@ -356,6 +362,7 @@ struct graph_view_meta_t> segment_offsets{std::nullopt}; + std::optional> hypersparse_degree_offsets{std::nullopt}; }; // graph_view_t is a non-owning graph class (note that graph_t is an owning graph class) @@ -563,6 +570,22 @@ class graph_view_t> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx) const + { + auto num_degrees_per_vertex_partition = + edge_partition_hypersparse_degree_offsets_ + ? ((*edge_partition_hypersparse_degree_offsets_).size() / edge_partition_offsets_.size()) + : size_t{0}; + return edge_partition_hypersparse_degree_offsets_ + ? 
std::make_optional>( + (*edge_partition_hypersparse_degree_offsets_).begin() + + partition_idx * num_degrees_per_vertex_partition, + (*edge_partition_hypersparse_degree_offsets_).begin() + + (partition_idx + 1) * num_degrees_per_vertex_partition) + : std::nullopt; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices(), @@ -760,6 +783,7 @@ class graph_view_t edge_partition_segment_offsets_{}; + std::optional> edge_partition_hypersparse_degree_offsets_{}; // if valid, store source/destination property values in key/value pairs (this saves memory if # // unique edge sources/destinations << V / major_comm_size|minor_comm_size). @@ -910,6 +934,13 @@ class graph_view_t> local_edge_partition_hypersparse_degree_offsets( + size_t partition_idx = 0) const + { + assert(partition_idx == 0); + return hypersparse_degree_offsets_; + } + vertex_partition_view_t local_vertex_partition_view() const { return vertex_partition_view_t(this->number_of_vertices()); @@ -1050,6 +1081,7 @@ class graph_view_t> segment_offsets_{std::nullopt}; + std::optional> hypersparse_degree_offsets_{std::nullopt}; std::optional> edge_mask_view_{std::nullopt}; }; diff --git a/cpp/include/cugraph/utilities/misc_utils.cuh b/cpp/include/cugraph/utilities/misc_utils.cuh index 633dabe5b40..91a349007da 100644 --- a/cpp/include/cugraph/utilities/misc_utils.cuh +++ b/cpp/include/cugraph/utilities/misc_utils.cuh @@ -81,7 +81,7 @@ std::tuple, std::vector> compute_offset_aligned_ return std::make_tuple(h_chunk_offsets, h_element_offsets); } else { - return std::make_tuple(std::vector{{0, offsets.size() - 1}}, + return std::make_tuple(std::vector{{0, static_cast(offsets.size() - 1)}}, std::vector{{0, num_elements}}); } } diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index 196ce2ac0d6..f4855be0e7c 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -820,11 +820,13 @@ create_graph_from_partitioned_edgelist( std::move(edge_partition_offsets), std::move(edge_partition_indices), std::move(edge_partition_dcs_nzd_vertices), - cugraph::graph_meta_t{meta.number_of_vertices, - meta.number_of_edges, - graph_properties, - meta.partition, - meta.edge_partition_segment_offsets}), + cugraph::graph_meta_t{ + meta.number_of_vertices, + meta.number_of_edges, + graph_properties, + meta.partition, + meta.edge_partition_segment_offsets, + meta.edge_partition_hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), @@ -1843,7 +1845,8 @@ create_graph_from_edgelist_impl( cugraph::graph_meta_t{ num_vertices, graph_properties, - renumber ? std::optional>{meta.segment_offsets} : std::nullopt}), + renumber ? std::optional>{meta.segment_offsets} : std::nullopt, + meta.hypersparse_degree_offsets}), std::move(edge_weights), std::move(edge_ids), std::move(edge_types), diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 4d5585304f6..4c9166cb01a 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -165,7 +165,8 @@ update_local_sorted_unique_edge_majors_minors( // majors/minors to support storing edge major/minor properties in (key, value) pairs. // 1. 
Update local_sorted_unique_edge_minors & local_sorted_unique_edge_minor_offsets -RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 1" << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "update_local_sorted_unique_edge_majors_minors 1" << std::endl; { auto [minor_range_first, minor_range_last] = meta.partition.local_edge_partition_minor_range(); @@ -192,7 +193,10 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique raft::comms::op_t::MAX, handle.get_stream()); -std::cout << "max_minor_properties_fill_ratio=" << max_minor_properties_fill_ratio << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold << std::endl; + std::cout << "max_minor_properties_fill_ratio=" << max_minor_properties_fill_ratio + << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" + << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold + << std::endl; if (max_minor_properties_fill_ratio < detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { std::cerr << "K,V pairs" << std::endl; @@ -283,7 +287,8 @@ std::cout << "max_minor_properties_fill_ratio=" << max_minor_properties_fill_rat } // 2. Update local_sorted_unique_edge_majors & local_sorted_unique_edge_major_offsets -RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 2" << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "update_local_sorted_unique_edge_majors_minors 2" << std::endl; std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { @@ -308,7 +313,10 @@ RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique raft::comms::op_t::MAX, handle.get_stream()); -std::cout << "max_major_properties_fill_ratio=" << max_major_properties_fill_ratio << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold << std::endl; + std::cout << "max_major_properties_fill_ratio=" << max_major_properties_fill_ratio + << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" + << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold + << std::endl; if (max_major_properties_fill_ratio < detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { auto const chunk_size = @@ -373,7 +381,8 @@ std::cout << "max_major_properties_fill_ratio=" << max_major_properties_fill_rat } local_sorted_unique_edge_major_chunk_size = chunk_size; } -RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "update_local_sorted_unique_edge_majors_minors 3" << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "update_local_sorted_unique_edge_majors_minors 3" << std::endl; return std::make_tuple(std::move(local_sorted_unique_edge_majors), std::move(local_sorted_unique_edge_major_chunk_start_offsets), @@ -406,7 +415,8 @@ graph_t(indices.size()), meta.properties), offsets_(std::move(offsets)), indices_(std::move(indices)), - segment_offsets_(meta.segment_offsets) + segment_offsets_(meta.segment_offsets), + hypersparse_degree_offsets_(meta.hypersparse_degree_offsets) { } diff --git a/cpp/src/structure/graph_view_impl.cuh b/cpp/src/structure/graph_view_impl.cuh index 
f925a142737..88188372feb 100644 --- a/cpp/src/structure/graph_view_impl.cuh +++ b/cpp/src/structure/graph_view_impl.cuh @@ -496,6 +496,7 @@ graph_view_t find_locally_unused_ext_vertex_id( : std::nullopt /* if the entire range of vertex_t is used */; } -// returns renumber map and segment_offsets +// returns renumber map, segment_offsets, and hypersparse_degree_offsets template -std::tuple, std::vector, vertex_t> compute_renumber_map( - raft::handle_t const& handle, - std::optional>&& local_vertices, - std::vector const& edgelist_majors, - std::vector const& edgelist_minors, - std::vector const& edgelist_edge_counts) +std::tuple, + std::vector, + std::optional>, + vertex_t> +compute_renumber_map(raft::handle_t const& handle, + std::optional>&& local_vertices, + std::vector const& edgelist_majors, + std::vector const& edgelist_minors, + std::vector const& edgelist_edge_counts) { // 1. if local_vertices.has_value() is false, find unique vertices from edge majors & minors (to // construct local_vertices) @@ -322,10 +325,6 @@ std::tuple, std::vector, vertex_t> compu handle.sync_stream(); for (size_t i = 0; i < num_bins; ++i) { -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "compute_renumber_map 0-1 i=" << i << std::endl; -#endif rmm::device_uvector this_bin_sorted_unique_majors(0, handle.get_stream()); { std::vector> edge_partition_tmp_majors{}; // for bin "i" @@ -377,22 +376,8 @@ std::tuple, std::vector, vertex_t> compu this_bin_sorted_unique_majors.begin() + output_offset); output_offset += edge_partition_tmp_majors[j].size(); } -#if 0 - std::tie(this_bin_sorted_unique_majors, std::ignore) = shuffle_values( - minor_comm, this_bin_sorted_unique_majors.begin(), tx_counts, handle.get_stream()); - thrust::sort( - handle.get_thrust_policy(), - this_bin_sorted_unique_majors.begin(), - this_bin_sorted_unique_majors.end()); - this_bin_sorted_unique_majors.resize(thrust::distance(this_bin_sorted_unique_majors.begin(), thrust::unique( - handle.get_thrust_policy(), - this_bin_sorted_unique_majors.begin(), - this_bin_sorted_unique_majors.end())), handle.get_stream()); - this_bin_sorted_unique_majors.shrink_to_fit(handle.get_stream()); -#else this_bin_sorted_unique_majors = shuffle_and_unique_segment_sorted_values( minor_comm, this_bin_sorted_unique_majors.begin(), tx_counts, handle.get_stream()); -#endif } else { this_bin_sorted_unique_majors = std::move(edge_partition_tmp_majors[0]); } @@ -401,10 +386,6 @@ std::tuple, std::vector, vertex_t> compu } } -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "compute_renumber_map 0-2 i=" << i << std::endl; -#endif rmm::device_uvector this_bin_sorted_unique_minors(0, handle.get_stream()); { std::vector> edge_partition_tmp_minors{}; // for bin "i" @@ -474,16 +455,6 @@ std::tuple, std::vector, vertex_t> compu auto const comm_size = comm.get_size(); auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); -#if 0 - this_bin_sorted_unique_minors = shuffle_ext_vertices_to_local_gpu_by_vertex_partitioning(handle, std::move(this_bin_sorted_unique_minors)); - thrust::sort(handle.get_thrust_policy(), - this_bin_sorted_unique_minors.begin(), - this_bin_sorted_unique_minors.end()); - this_bin_sorted_unique_minors.resize(thrust::distance(this_bin_sorted_unique_minors.begin(), thrust::unique(handle.get_thrust_policy(), - this_bin_sorted_unique_minors.begin(), - this_bin_sorted_unique_minors.end())), handle.get_stream()); - 
this_bin_sorted_unique_minors.shrink_to_fit(handle.get_stream()); -#else compute_gpu_id_from_ext_vertex_t gpu_id_func{ comm_size, major_comm_size, minor_comm_size}; auto d_tx_counts = groupby_and_count( @@ -511,14 +482,9 @@ std::tuple, std::vector, vertex_t> compu } this_bin_sorted_unique_minors = shuffle_and_unique_segment_sorted_values( major_comm, this_bin_sorted_unique_minors.begin(), h_tx_counts, handle.get_stream()); -#endif } } } -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "compute_renumber_map 0-3 i=" << i << std::endl; -#endif rmm::device_uvector this_bin_sorted_unique_vertices(0, handle.get_stream()); { rmm::device_uvector merged_vertices( @@ -542,10 +508,6 @@ std::tuple, std::vector, vertex_t> compu merged_vertices.shrink_to_fit(handle.get_stream()); this_bin_sorted_unique_vertices = std::move(merged_vertices); } -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "compute_renumber_map 0-4 i=" << i << std::endl; -#endif if (sorted_local_vertices.size() == 0) { sorted_local_vertices = std::move(this_bin_sorted_unique_vertices); } else { @@ -690,61 +652,98 @@ std::tuple, std::vector, vertex_t> compu (detail::hypersparse_threshold_ratio <= 1.0)); size_t mid_degree_threshold{detail::mid_degree_threshold}; size_t low_degree_threshold{detail::low_degree_threshold}; - size_t hypersparse_degree_threshold{0}; + size_t hypersparse_degree_threshold{1}; if (multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); mid_degree_threshold *= minor_comm_size; low_degree_threshold *= minor_comm_size; - hypersparse_degree_threshold = - static_cast(minor_comm_size * detail::hypersparse_threshold_ratio); + hypersparse_degree_threshold = std::max( + static_cast(minor_comm_size * detail::hypersparse_threshold_ratio), size_t{1}); } - auto num_segments_per_vertex_partition = - detail::num_sparse_segments_per_vertex_partition + - (hypersparse_degree_threshold > 0 ? size_t{2} : size_t{1}); // last is 0-degree segment - rmm::device_uvector d_thresholds(num_segments_per_vertex_partition - 1, - handle.get_stream()); - auto h_thresholds = - hypersparse_degree_threshold > 0 - ? std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - static_cast(hypersparse_degree_threshold), - std::min(static_cast(hypersparse_degree_threshold), edge_t{1})} - : std::vector{static_cast(mid_degree_threshold), - static_cast(low_degree_threshold), - edge_t{1}}; - raft::update_device( - d_thresholds.data(), h_thresholds.data(), h_thresholds.size(), handle.get_stream()); - - rmm::device_uvector d_segment_offsets(num_segments_per_vertex_partition + 1, - handle.get_stream()); - auto vertex_count = static_cast(sorted_local_vertices.size()); - d_segment_offsets.set_element_to_zero_async(0, handle.get_stream()); - d_segment_offsets.set_element( - num_segments_per_vertex_partition, vertex_count, handle.get_stream()); + std::vector h_segment_offsets{}; + std::optional> h_hypersparse_degree_offsets{}; + { + auto num_partitions = detail::num_sparse_segments_per_vertex_partition /* high, mid, low */ + + (hypersparse_degree_threshold > 1 + ? 
hypersparse_degree_threshold - size_t{1} + /* one partition per each global degree in the hypersparse region */ + : size_t{0}) + + size_t{1} /* zero */; + rmm::device_uvector d_thresholds(num_partitions - 1, handle.get_stream()); + thrust::tabulate(handle.get_thrust_policy(), + d_thresholds.begin(), + d_thresholds.end(), + [mid_degree_threshold, + low_degree_threshold, + hypersparse_degree_threshold] __device__(size_t i) { + if (i == 0) { + return mid_degree_threshold; // high,mid boundary + } else if (i == 1) { + return low_degree_threshold; // mid, low boundary + } else { + assert(hypersparse_degree_threshold > (i - 2)); + return hypersparse_degree_threshold - (i - 2); + } + }); + rmm::device_uvector d_offsets(num_partitions + 1, handle.get_stream()); + d_offsets.set_element_to_zero_async(0, handle.get_stream()); + auto vertex_count = static_cast(sorted_local_vertices.size()); + d_offsets.set_element(num_partitions, vertex_count, handle.get_stream()); + thrust::upper_bound(handle.get_thrust_policy(), + sorted_local_vertex_degrees.begin(), + sorted_local_vertex_degrees.end(), + d_thresholds.begin(), + d_thresholds.end(), + d_offsets.begin() + 1, + thrust::greater{}); + std::vector h_offsets(d_offsets.size()); + raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); + handle.sync_stream(); + std::cerr << "hypersparse_degree_threshold=" << hypersparse_degree_threshold << std::endl; + raft::print_host_vector("h_offsets", h_offsets.data(), h_offsets.size(), std::cerr); + + auto num_segments_per_vertex_partition = + detail::num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold > 1 ? size_t{2} : size_t{1}); // last is 0-degree segment + h_segment_offsets.resize(num_segments_per_vertex_partition + 1); + std::copy(h_offsets.begin(), + h_offsets.begin() + num_sparse_segments_per_vertex_partition + 1, + h_segment_offsets.begin()); + *(h_segment_offsets.rbegin()) = *(h_offsets.rbegin()); + if (hypersparse_degree_threshold > 1) { + *(h_segment_offsets.rbegin() + 1) = *(h_offsets.rbegin() + 1); + + h_hypersparse_degree_offsets = std::vector(hypersparse_degree_threshold); + std::copy(h_offsets.begin() + num_sparse_segments_per_vertex_partition, + h_offsets.begin() + num_sparse_segments_per_vertex_partition + + (hypersparse_degree_threshold - 1), + (*h_hypersparse_degree_offsets).begin()); + auto shift = (*h_hypersparse_degree_offsets)[0]; + std::transform((*h_hypersparse_degree_offsets).begin(), + (*h_hypersparse_degree_offsets).end(), + (*h_hypersparse_degree_offsets).begin(), + [shift](auto offset) { return offset - shift; }); + *((*h_hypersparse_degree_offsets).rbegin()) = *(h_offsets.rbegin() + 1); + raft::print_host_vector("hypersparse_degree_offsets", + (*h_hypersparse_degree_offsets).data(), + (*h_hypersparse_degree_offsets).size(), + std::cerr); + } + raft::print_host_vector( + "h_segment_offsets", h_segment_offsets.data(), h_segment_offsets.size(), std::cerr); + } - thrust::upper_bound(handle.get_thrust_policy(), - sorted_local_vertex_degrees.begin(), - sorted_local_vertex_degrees.end(), - d_thresholds.begin(), - d_thresholds.end(), - d_segment_offsets.begin() + 1, - thrust::greater{}); - - std::vector h_segment_offsets(d_segment_offsets.size()); - raft::update_host(h_segment_offsets.data(), - d_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - handle.sync_stream(); #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "compute_renumber_map 4" << std::endl; #endif - return std::make_tuple( - 
std::move(sorted_local_vertices), h_segment_offsets, *locally_unused_vertex_id); + return std::make_tuple(std::move(sorted_local_vertices), + h_segment_offsets, + h_hypersparse_degree_offsets, + *locally_unused_vertex_id); } template @@ -930,32 +929,28 @@ void expensive_check_edgelist( } template -std::vector aggregate_segment_offsets(raft::handle_t const& handle, - std::vector const& segment_offsets) +std::vector aggregate_offset_vectors(raft::handle_t const& handle, + std::vector const& offsets) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - rmm::device_uvector d_segment_offsets(segment_offsets.size(), handle.get_stream()); - raft::update_device( - d_segment_offsets.data(), segment_offsets.data(), segment_offsets.size(), handle.get_stream()); - rmm::device_uvector d_aggregate_segment_offsets( - minor_comm_size * d_segment_offsets.size(), handle.get_stream()); - minor_comm.allgather(d_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_segment_offsets.size(), - handle.get_stream()); - - std::vector h_aggregate_segment_offsets(d_aggregate_segment_offsets.size(), - vertex_t{0}); - raft::update_host(h_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.data(), - d_aggregate_segment_offsets.size(), + rmm::device_uvector d_offsets(offsets.size(), handle.get_stream()); + raft::update_device(d_offsets.data(), offsets.data(), offsets.size(), handle.get_stream()); + rmm::device_uvector d_aggregate_offset_vectors(minor_comm_size * d_offsets.size(), + handle.get_stream()); + minor_comm.allgather( + d_offsets.data(), d_aggregate_offset_vectors.data(), d_offsets.size(), handle.get_stream()); + + std::vector h_aggregate_offset_vectors(d_aggregate_offset_vectors.size(), vertex_t{0}); + raft::update_host(h_aggregate_offset_vectors.data(), + d_aggregate_offset_vectors.data(), + d_aggregate_offset_vectors.size(), handle.get_stream()); handle.sync_stream(); // this is necessary as h_aggregate_offsets can be used right after return. - return h_aggregate_segment_offsets; + return h_aggregate_offset_vectors; } } // namespace detail @@ -1038,7 +1033,10 @@ renumber_edgelist( RAFT_CUDA_TRY(cudaDeviceSynchronize()); std::cerr << "renumber_edgelist 0" << std::endl; #endif - auto [renumber_map_labels, vertex_partition_segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + vertex_partition_segment_offsets, + vertex_partition_hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map(handle, std::move(local_vertices), edgelist_const_majors, @@ -1243,12 +1241,20 @@ renumber_edgelist( std::cerr << "renumber_edgelist 4" << std::endl; #endif auto edge_partition_segment_offsets = - detail::aggregate_segment_offsets(handle, vertex_partition_segment_offsets); + detail::aggregate_offset_vectors(handle, vertex_partition_segment_offsets); + auto edge_partition_hypersparse_degree_offsets = + vertex_partition_hypersparse_degree_offsets + ? 
std::make_optional( + detail::aggregate_offset_vectors(handle, *vertex_partition_hypersparse_degree_offsets)) + : std::nullopt; return std::make_tuple( std::move(renumber_map_labels), - renumber_meta_t{ - number_of_vertices, number_of_edges, partition, edge_partition_segment_offsets}); + renumber_meta_t{number_of_vertices, + number_of_edges, + partition, + edge_partition_segment_offsets, + edge_partition_hypersparse_degree_offsets}); } template @@ -1276,7 +1282,10 @@ renumber_edgelist(raft::handle_t const& handle, std::nullopt); } - auto [renumber_map_labels, segment_offsets, locally_unused_vertex_id] = + auto [renumber_map_labels, + segment_offsets, + hypersparse_degree_offsets, + locally_unused_vertex_id] = detail::compute_renumber_map( handle, std::move(vertices), @@ -1297,8 +1306,9 @@ renumber_edgelist(raft::handle_t const& handle, renumber_map_view.find( edgelist_minors, edgelist_minors + num_edgelist_edges, edgelist_minors, handle.get_stream()); - return std::make_tuple(std::move(renumber_map_labels), - renumber_meta_t{segment_offsets}); + return std::make_tuple( + std::move(renumber_map_labels), + renumber_meta_t{segment_offsets, hypersparse_degree_offsets}); } } // namespace cugraph From 6474f841c9f02ac27b26d7a1cdec6a7092d4c589 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 2 Oct 2024 12:15:54 -0700 Subject: [PATCH 095/126] update key list retrieval code --- .../prims/detail/per_v_transform_reduce_e.cuh | 699 +++++++++++------- cpp/src/prims/vertex_frontier.cuh | 2 +- 2 files changed, 420 insertions(+), 281 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 2aab5cf3200..739b79a0add 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -34,6 +34,7 @@ #include #include #include +#include #include #include @@ -1509,6 +1510,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } // 3. filter input keys & update key_segment_offsets +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time1 = std::chrono::steady_clock::now(); +#endif auto edge_mask_view = graph_view.edge_mask_view(); @@ -1629,6 +1634,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } /* 4. compute subgroup_size (used to compute priority in device_gatherv) */ +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto time2 = std::chrono::steady_clock::now(); +#endif [[maybe_unused]] std::conditional_t>, @@ -1655,15 +1664,15 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // 5. collect local_key_list_sizes & local_key_list_range_firsts & local_key_list_range_lasts & - // key_segment_offsets + // 5. 
collect local_key_list_sizes & local_v_list_range_firsts & local_v_list_range_lasts & + // key_segment_offset_vectors std::conditional_t, std::byte /* dummy */> local_key_list_sizes{}; std::conditional_t, std::byte /* dummy */> - local_key_list_range_firsts{}; + local_v_list_range_firsts{}; std::conditional_t, std::byte /* dummy */> - local_key_list_range_lasts{}; + local_v_list_range_lasts{}; std::conditional_t>>, std::byte /* dummy */> @@ -1743,8 +1752,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, handle.sync_stream(); local_key_list_sizes = std::vector(minor_comm_size); if constexpr (try_bitmap) { - local_key_list_range_firsts = std::vector(minor_comm_size); - local_key_list_range_lasts = std::vector(minor_comm_size); + local_v_list_range_firsts = std::vector(minor_comm_size); + local_v_list_range_lasts = std::vector(minor_comm_size); } if (key_segment_offsets) { key_segment_offset_vectors = std::vector>{}; @@ -1753,9 +1762,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, for (int i = 0; i < minor_comm_size; ++i) { local_key_list_sizes[i] = h_aggregate_tmps[i * num_scalars]; if constexpr (try_bitmap) { - local_key_list_range_firsts[i] = + local_v_list_range_firsts[i] = static_cast(h_aggregate_tmps[i * num_scalars + 1]); - local_key_list_range_lasts[i] = + local_v_list_range_lasts[i] = static_cast(h_aggregate_tmps[i * num_scalars + 2]); } if (key_segment_offsets) { @@ -1793,7 +1802,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (sizeof(vertex_t) > sizeof(uint32_t)) { vertex_t local_v_list_max_range_size{0}; for (int i = 0; i < minor_comm_size; ++i) { - auto range_size = local_key_list_range_lasts[i] - local_key_list_range_firsts[i]; + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; local_v_list_max_range_size = std::max(range_size, local_v_list_max_range_size); } if (local_v_list_max_range_size <= @@ -1806,7 +1815,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, double avg_fill_ratio{0.0}; for (int i = 0; i < minor_comm_size; ++i) { auto num_keys = static_cast(local_key_list_sizes[i]); - auto range_size = local_key_list_range_lasts[i] - local_key_list_range_firsts[i]; + auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; avg_fill_ratio += (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; } @@ -1816,12 +1825,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, 2.0 /* tuning parameter (consider that we need to reprodce vertex list from bitmap)*/ / static_cast((v_compressible ? 
sizeof(uint32_t) : sizeof(vertex_t)) * 8); if (avg_fill_ratio > threshold_ratio) { - v_list_bitmap = - compute_vertex_list_bitmap_info(sorted_unique_key_first, - sorted_unique_nzd_key_last, - local_key_list_range_firsts[minor_comm_rank], - local_key_list_range_lasts[minor_comm_rank], - handle.get_stream()); + v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, + sorted_unique_nzd_key_last, + local_v_list_range_firsts[minor_comm_rank], + local_v_list_range_lasts[minor_comm_rank], + handle.get_stream()); } else if (v_compressible) { rmm::device_uvector tmps(local_key_list_sizes[minor_comm_rank], handle.get_stream()); @@ -1830,7 +1838,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, sorted_unique_nzd_key_last, tmps.begin(), cuda::proclaim_return_type( - [range_first = local_key_list_range_firsts[minor_comm_rank]] __device__( + [range_first = local_v_list_range_firsts[minor_comm_rank]] __device__( auto v) { return static_cast(v - range_first); })); compressed_v_list = std::move(tmps); } @@ -1930,7 +1938,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time1 = std::chrono::steady_clock::now(); + auto time3 = std::chrono::steady_clock::now(); #endif for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -1945,31 +1953,46 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::vector>, std::byte /* dummy */> edge_partition_key_buffers{}; + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in the + // hypersparse region if constexpr (GraphViewType::is_multi_gpu && use_input_key) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); + auto const minor_comm_rank = minor_comm.get_rank(); edge_partition_key_buffers.reserve(loop_count); - std::optional>> edge_partition_tmp_buffers{ - std::nullopt}; - if (v_list_bitmap || compressed_v_list) { - edge_partition_tmp_buffers = std::vector>{}; - (*edge_partition_tmp_buffers).reserve(loop_count); + + std::conditional_t, + std::optional>>, + std::byte /* dummy */> + edge_partition_tmp_bcast_buffers{std::nullopt}; + if constexpr (std::is_same_v) { + if (v_list_bitmap || compressed_v_list) { + edge_partition_tmp_bcast_buffers = std::vector>{}; + (*edge_partition_tmp_bcast_buffers).reserve(loop_count); + } } for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - if (v_list_bitmap || compressed_v_list) { - (*edge_partition_tmp_buffers) - .emplace_back(v_list_bitmap - ? packed_bool_size(local_key_list_range_lasts[partition_idx] - - local_key_list_range_firsts[partition_idx]) - : local_key_list_sizes[partition_idx], - handle.get_stream()); - } else { - edge_partition_key_buffers.emplace_back(local_key_list_sizes[partition_idx], - handle.get_stream()); + auto partition_idx = i + j; + bool use_tmp_bcast_buffer = false; + if constexpr (std::is_same_v) { + if (edge_partition_tmp_bcast_buffers) { + (*edge_partition_tmp_bcast_buffers) + .emplace_back(v_list_bitmap + ? 
packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]) + : local_key_list_sizes[partition_idx], + handle.get_stream()); + use_tmp_bcast_buffer = true; + } + } + if (!use_tmp_bcast_buffer) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_key_list_sizes[partition_idx], handle.get_stream())); } } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -1979,22 +2002,28 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, device_group_start(minor_comm); for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - if (v_list_bitmap) { - device_bcast(minor_comm, - (*v_list_bitmap).data(), - get_dataframe_buffer_begin((*edge_partition_tmp_buffers)[j]), - size_dataframe_buffer((*edge_partition_tmp_buffers)[j]), - static_cast(partition_idx), - handle.get_stream()); - } else if (compressed_v_list) { - device_bcast(minor_comm, - (*compressed_v_list).data(), - get_dataframe_buffer_begin((*edge_partition_tmp_buffers)[j]), - size_dataframe_buffer((*edge_partition_tmp_buffers)[j]), - static_cast(partition_idx), - handle.get_stream()); - } else { + auto partition_idx = i + j; + bool use_tmp_bcast_buffer = false; + if constexpr (std::is_same_v) { + if (v_list_bitmap) { + device_bcast(minor_comm, + (*v_list_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_tmp_bcast_buffers)[j]), + size_dataframe_buffer((*edge_partition_tmp_bcast_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + use_tmp_bcast_buffer = true; + } else if (compressed_v_list) { + device_bcast(minor_comm, + (*compressed_v_list).data(), + get_dataframe_buffer_begin((*edge_partition_tmp_bcast_buffers)[j]), + size_dataframe_buffer((*edge_partition_tmp_bcast_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + use_tmp_bcast_buffer = true; + } + } + if (!use_tmp_bcast_buffer) { device_bcast(minor_comm, sorted_unique_key_first, get_dataframe_buffer_begin(edge_partition_key_buffers[j]), @@ -2009,212 +2038,340 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, subtime2 = std::chrono::steady_clock::now(); #endif - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - bool process_local_edges{true}; - if constexpr (filter_input_key) { - process_local_edges = (static_cast(partition_idx) != minor_comm_rank); - } + if constexpr (std::is_same_v) { + if (edge_partition_tmp_bcast_buffers) { + // copy data (in the sparse region first if filter_input_key is true) to + // edge_partition_key_buffers[j] - if (process_local_edges) { - if (v_list_bitmap || compressed_v_list) { - rmm::device_uvector rx_vertices(local_key_list_sizes[partition_idx], - loop_stream); - auto const& rx_tmps = (*edge_partition_tmp_buffers)[j]; - if (v_list_bitmap) { - rmm::device_scalar dummy(size_t{0}, loop_stream); - retrieve_vertex_list_from_bitmap( - raft::device_span(rx_tmps.data(), rx_tmps.size()), - rx_vertices.begin(), - raft::device_span(dummy.data(), size_t{1}), - local_key_list_range_firsts[partition_idx], - local_key_list_range_lasts[partition_idx], - loop_stream); - } else { - thrust::transform( - rmm::exec_policy_nosync(loop_stream), - rx_tmps.begin(), - rx_tmps.end(), - rx_vertices.begin(), - cuda::proclaim_return_type( - [range_first = local_key_list_range_firsts[partition_idx]] __device__( - uint32_t v_offset) { return static_cast(range_first + v_offset); })); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + bool process_local_edges{true}; + if constexpr (filter_input_key) { + process_local_edges = (static_cast(partition_idx) != minor_comm_rank); } - edge_partition_key_buffers.push_back(std::move(rx_vertices)); - } - } else { - if (v_list_bitmap || compressed_v_list) { - edge_partition_key_buffers.emplace_back(0, loop_stream); - } else { - resize_dataframe_buffer(edge_partition_key_buffers[j], 0, loop_stream); - shrink_to_fit_dataframe_buffer(edge_partition_key_buffers[j], loop_stream); + + auto keys = allocate_dataframe_buffer( + process_local_edges ? 
local_key_list_sizes[partition_idx] : size_t{0}, loop_stream); + if (process_local_edges) { + auto range_first = local_v_list_range_firsts[partition_idx]; + if (v_list_bitmap) { + auto range_last = local_v_list_range_lasts[partition_idx]; + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { // skip copying the hypersparse segment + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + range_first = + std::min(range_first, *(edge_partition.major_hypersparse_first())); + range_last = std::min(range_last, *(edge_partition.major_hypersparse_first())); + } + } + auto const& rx_tmps = (*edge_partition_tmp_bcast_buffers)[j]; + rmm::device_scalar dummy(size_t{0}, loop_stream); + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_tmps.data(), rx_tmps.size()), + get_dataframe_buffer_begin(keys), + raft::device_span(dummy.data(), size_t{1}), + range_first, + range_last, + loop_stream); + } else { + assert(compressed_v_list); + auto const& rx_tmps = (*edge_partition_tmp_bcast_buffers)[j]; + auto input_first = rx_tmps.begin(); + auto input_last = rx_tmps.end(); + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { // skip copying the hypersparse segment + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + input_last = input_first + key_segment_offsets[3]; + } + } + thrust::transform( + rmm::exec_policy_nosync(loop_stream), + input_first, + input_last, + get_dataframe_buffer_begin(keys), + cuda::proclaim_return_type([range_first] __device__(uint32_t v_offset) { + return static_cast(range_first + v_offset); + })); + } + } + edge_partition_key_buffers.push_back(std::move(keys)); } } } - } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto subtime3 = std::chrono::steady_clock::now(); -#endif - std::conditional_t>>, - std::byte /* dummy */> - edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in the - // hypersparse region - if constexpr (filter_input_key) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - if (graph_view.use_dcs() && (minor_comm_size > 1)) { - // FIXME: we can pre-compute this & store in graph_t - std::vector> edge_partition_hypersparse_bitmap_vectors{}; - edge_partition_hypersparse_bitmap_vectors.reserve(loop_count); - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto const& segment_offsets = - graph_view.local_edge_partition_segment_offsets(partition_idx); - - auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); - rmm::device_uvector bitmap( - process_local_edges ? 
packed_bool_size((*segment_offsets)[4] - (*segment_offsets)[3]) - : size_t{0}, - loop_stream); - if (process_local_edges) { - thrust::fill(rmm::exec_policy_nosync(loop_stream), - bitmap.begin(), - bitmap.end(), - packed_bool_empty_mask()); - thrust::for_each( - rmm::exec_policy_nosync(loop_stream), - *(edge_partition.dcs_nzd_vertices()), - *(edge_partition.dcs_nzd_vertices()) + *(edge_partition.dcs_nzd_vertex_count()), - [bitmap = raft::device_span(bitmap.data(), bitmap.size()), - major_hypersparse_first = - *(edge_partition.major_hypersparse_first())] __device__(auto major) { - auto major_offset = major - major_hypersparse_first; - cuda::atomic_ref word( - bitmap[packed_bool_offset(major_offset)]); - word.fetch_or(packed_bool_mask(major_offset), cuda::std::memory_order_relaxed); - }); + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + edge_partition_hypersparse_key_offset_vectors = + std::vector>{}; + (*edge_partition_hypersparse_key_offset_vectors).reserve(loop_count); + + std::vector> edge_partition_count_scalars{}; + edge_partition_count_scalars.reserve(loop_count); + + std::optional>> + edge_partition_tmp_key_buffers{}; + bool direct_copy = false; /// directly copy to edge_partition_key_buffers[] + if constexpr (std::is_same_v) { + if (edge_partition_tmp_bcast_buffers) { direct_copy = true; } + } + if (!direct_copy) { // copy the hypersparse keys to a temporary key buffer first + edge_partition_tmp_key_buffers = std::vector>{}; + (*edge_partition_tmp_key_buffers).reserve(loop_count); } - edge_partition_hypersparse_bitmap_vectors.push_back(std::move(bitmap)); - } - edge_partition_hypersparse_key_offset_vectors = std::vector>{}; - (*edge_partition_hypersparse_key_offset_vectors).reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); - std::vector> edge_partition_tmp_key_buffers{}; - edge_partition_tmp_key_buffers.reserve(loop_count); - std::vector> edge_partition_tmp_count_scalars{}; - edge_partition_tmp_count_scalars.reserve(loop_count); - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); + auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); + + auto& keys = edge_partition_key_buffers[j]; + auto offsets = rmm::device_uvector( + process_local_edges ? (key_segment_offsets[4] - key_segment_offsets[3]) : vertex_t{0}, + loop_stream); + rmm::device_scalar count(size_t{0}, loop_stream); + auto tmp_keys = allocate_dataframe_buffer( + edge_partition_tmp_key_buffers ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); - auto edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); - auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - - auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); - - auto keys = allocate_dataframe_buffer( - process_local_edges ? (key_segment_offsets[4] - key_segment_offsets[3]) : size_t{0}, - loop_stream); - rmm::device_uvector offsets( - process_local_edges ? 
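// Host-side sketch, under the same assumptions as the device code above: the hypersparse
// segment stores only the majors that have at least one local edge (dcs_nzd_vertices), so a
// packed-bool bitmap over [major_hypersparse_first, ...) lets the later filtering steps test
// in O(1) whether a requested key has any local edges and drop the zero-degree ones.
#include <cstdint>
#include <vector>

using vertex_t = int64_t;

std::vector<uint32_t> build_segment_bitmap(std::vector<vertex_t> const& dcs_nzd_vertices,
                                           vertex_t major_hypersparse_first,
                                           vertex_t major_range_last)
{
  size_t num_words = (static_cast<size_t>(major_range_last - major_hypersparse_first) + 31) / 32;
  std::vector<uint32_t> bitmap(num_words, 0);
  for (auto major : dcs_nzd_vertices) {
    auto bit = static_cast<size_t>(major - major_hypersparse_first);
    bitmap[bit / 32] |= (uint32_t{1} << (bit % 32));
  }
  return bitmap;
}

bool has_local_edges(std::vector<uint32_t> const& segment_bitmap,
                     vertex_t major,
                     vertex_t major_hypersparse_first)
{
  auto bit = static_cast<size_t>(major - major_hypersparse_first);
  return (segment_bitmap[bit / 32] & (uint32_t{1} << (bit % 32))) != 0;
}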
(key_segment_offsets[4] - key_segment_offsets[3]) : size_t{0}, - loop_stream); - rmm::device_scalar count(size_t{0}, loop_stream); - if (process_local_edges) { - auto input_first = - thrust::make_zip_iterator(edge_partition_key_first + key_segment_offsets[3], - thrust::make_counting_iterator(key_segment_offsets[3])); - auto flag_first = thrust::make_transform_iterator( - edge_partition_key_first + key_segment_offsets[3], - cuda::proclaim_return_type( - [bitmap = - raft::device_span(edge_partition_hypersparse_bitmap_vectors[j].data(), - edge_partition_hypersparse_bitmap_vectors[j].size()), + if (process_local_edges) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + // FIXME: we can pre-compute this & store in graph_t + rmm::device_uvector segment_bitmap( + packed_bool_size((*segment_offsets)[4] - (*segment_offsets)[3]), loop_stream); + thrust::fill(rmm::exec_policy_nosync(loop_stream), + segment_bitmap.begin(), + segment_bitmap.end(), + packed_bool_empty_mask()); + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + *(edge_partition.dcs_nzd_vertices()), + *(edge_partition.dcs_nzd_vertices()) + *(edge_partition.dcs_nzd_vertex_count()), + [bitmap = raft::device_span(segment_bitmap.data(), segment_bitmap.size()), major_hypersparse_first = - *(edge_partition.major_hypersparse_first())] __device__(key_t key) { - auto major = thrust_tuple_get_or_identity(key); + *(edge_partition.major_hypersparse_first())] __device__(auto major) { auto major_offset = major - major_hypersparse_first; - return ((bitmap[packed_bool_offset(major_offset)] & - packed_bool_mask(major_offset)) != packed_bool_empty_mask()); - })); - copy_if_nosync( - input_first, - input_first + (key_segment_offsets[4] - key_segment_offsets[3]), - flag_first, - thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), offsets.begin()), - raft::device_span(count.data(), size_t{1}), - loop_stream); - } - edge_partition_tmp_key_buffers.push_back(std::move(keys)); - (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); - edge_partition_tmp_count_scalars.push_back(std::move(count)); - } + cuda::atomic_ref word( + bitmap[packed_bool_offset(major_offset)]); + word.fetch_or(packed_bool_mask(major_offset), cuda::std::memory_order_relaxed); + }); + + auto range_offset_first = std::min( + (edge_partition.major_range_first() + (*segment_offsets)[3] > + local_v_list_range_firsts[partition_idx]) + ? ((edge_partition.major_range_first() + (*segment_offsets)[3]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - local_v_list_range_firsts[partition_idx]); + + if constexpr (std::is_same_v) { + if (edge_partition_tmp_bcast_buffers) { + auto const& rx_tmps = (*edge_partition_tmp_bcast_buffers)[j]; + if (v_list_bitmap) { + auto range_offset_last = + std::min(((edge_partition.major_range_first() + (*segment_offsets)[4]) > + local_v_list_range_firsts[partition_idx]) + ? 
((edge_partition.major_range_first() + (*segment_offsets)[4]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + auto count_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(packed_bool_offset(range_offset_first)), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_tmps.data(), rx_tmps.size()), + range_offset_first] __device__(size_t i) { + auto word = range_bitmap[i]; + if (i == packed_bool_offset(range_offset_first)) { + word &= ~packed_bool_partial_mask( + range_offset_first % + packed_bools_per_word()); // exclude the bits in the sparse region + } + return static_cast(__popc(word)); + })); + rmm::device_uvector count_displacements( + rx_tmps.size() - packed_bool_offset(range_offset_first), loop_stream); + thrust::exclusive_scan(rmm::exec_policy_nosync(loop_stream), + count_first, + count_first + count_displacements.size(), + count_displacements.begin()); + auto offset_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(range_offset_first), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_tmps.data(), rx_tmps.size()), + count_displacements = raft::device_span( + count_displacements.data(), count_displacements.size()), + range_offset_first, + start_offset = key_segment_offsets[3]] __device__(auto range_offset) { + auto word = range_bitmap[packed_bool_offset(range_offset)]; + if (packed_bool_offset(range_offset) == + packed_bool_offset(range_offset_first)) { + word &= ~packed_bool_partial_mask( + range_offset_first % + packed_bools_per_word()); // exclude the bits in the sparse region + } + return static_cast( + start_offset + + count_displacements[packed_bool_offset(range_offset) - + packed_bool_offset(range_offset_first)] + + __popc(word & packed_bool_partial_mask(range_offset % + packed_bools_per_word()))); + })); + auto input_pair_first = thrust::make_zip_iterator( + thrust::make_counting_iterator(local_v_list_range_firsts[partition_idx] + + range_offset_first), + offset_first); + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (range_offset_last - range_offset_first), + thrust::make_transform_iterator( + thrust::make_counting_iterator(range_offset_first), + cuda::proclaim_return_type( + [range_bitmap = + raft::device_span(rx_tmps.data(), rx_tmps.size()), + segment_bitmap = raft::device_span( + segment_bitmap.data(), segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + major_hypersparse_first = + *(edge_partition + .major_hypersparse_first())] __device__(auto range_offset) { + auto segment_offset = + (range_first + range_offset) - major_hypersparse_first; + return ((range_bitmap[packed_bool_offset(range_offset)] & + packed_bool_mask(range_offset)) != packed_bool_empty_mask()) && + ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })), + thrust::make_zip_iterator( + get_dataframe_buffer_begin(keys) + key_segment_offsets[3], offsets.begin()), + raft::device_span(count.data(), size_t{1}), + loop_stream); + } else { + assert(compressed_v_list); + auto input_pair_first = thrust::make_zip_iterator( + thrust::make_transform_iterator( + rx_tmps.begin() + key_segment_offsets[3], + cuda::proclaim_return_type( + [range_first = + local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { + return static_cast(range_first + v_offset); + })), + 
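// Host-side sketch of the popcount + exclusive-scan trick used above to recover, for every
// bit set in the broadcast bitmap, its position (key offset) in the sorted key list without
// first materializing the full list: prefix-sum the per-word popcounts, then add the rank of
// the bit within its word. Illustrative only; the device code uses __popc and an exclusive
// scan over count_displacements.
#include <bitset>
#include <cstdint>
#include <numeric>
#include <vector>

// returns, for each set bit (in increasing bit order), start_offset + its rank among set bits
std::vector<size_t> bit_ranks(std::vector<uint32_t> const& bitmap, size_t start_offset)
{
  std::vector<size_t> word_counts(bitmap.size());
  for (size_t w = 0; w < bitmap.size(); ++w) {
    word_counts[w] = std::bitset<32>(bitmap[w]).count();  // __popc(word) on the device
  }
  std::vector<size_t> displacements(bitmap.size());
  std::exclusive_scan(word_counts.begin(), word_counts.end(), displacements.begin(), size_t{0});

  std::vector<size_t> ranks;
  for (size_t w = 0; w < bitmap.size(); ++w) {
    size_t rank_in_word = 0;
    for (uint32_t b = 0; b < 32; ++b) {
      if (bitmap[w] & (uint32_t{1} << b)) {
        ranks.push_back(start_offset + displacements[w] + rank_in_word);
        ++rank_in_word;
      }
    }
  }
  return ranks;
}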
thrust::make_counting_iterator(static_cast(key_segment_offsets[3]))); + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + thrust::make_transform_iterator( + rx_tmps.begin() + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span( + segment_bitmap.data(), segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + major_hypersparse_first = *( + edge_partition.major_hypersparse_first())] __device__(auto v_offset) { + auto segment_offset = + (range_first + v_offset) - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })), + thrust::make_zip_iterator( + get_dataframe_buffer_begin(keys) + key_segment_offsets[3], offsets.begin()), + raft::device_span(count.data(), size_t{1}), + loop_stream); + } + } + } + if (edge_partition_tmp_key_buffers) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + thrust::make_transform_iterator( + get_dataframe_buffer_begin(keys) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(auto key) { + auto segment_offset = + thrust_tuple_get_or_identity(key) - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })), + thrust::make_zip_iterator(get_dataframe_buffer_begin(tmp_keys), offsets.begin()), + raft::device_span(count.data(), size_t{1}), + loop_stream); + } + } - std::vector h_counts(loop_count); - for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - h_counts[j] = edge_partition_tmp_count_scalars[j].value(loop_stream); - } + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + edge_partition_count_scalars.push_back(std::move(count)); + if (edge_partition_tmp_key_buffers) { + (*edge_partition_tmp_key_buffers).push_back(std::move(tmp_keys)); + } + } + if (edge_partition_tmp_bcast_buffers) { (*edge_partition_tmp_bcast_buffers).clear(); } - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices + std::vector h_counts(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); + h_counts[j] = edge_partition_count_scalars[j].value(loop_stream); + } - auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); - if (process_local_edges) { - auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); - resize_dataframe_buffer(edge_partition_tmp_key_buffers[j], h_counts[j], loop_stream); - shrink_to_fit_dataframe_buffer( - edge_partition_tmp_key_buffers[j], - loop_stream); // FIXME: we can skip this to cut execution time - resize_dataframe_buffer( - (*edge_partition_hypersparse_key_offset_vectors)[j], h_counts[j], loop_stream); - shrink_to_fit_dataframe_buffer( - (*edge_partition_hypersparse_key_offset_vectors)[j], - loop_stream); // FIXME: we can skip this to cut execution time - auto keys = - allocate_dataframe_buffer(key_segment_offsets[3] + h_counts[j], loop_stream); - thrust::copy( - rmm::exec_policy_nosync(loop_stream), - get_dataframe_buffer_begin(edge_partition_key_buffers[j]), - get_dataframe_buffer_begin(edge_partition_key_buffers[j]) + key_segment_offsets[3], - get_dataframe_buffer_begin(keys)); - thrust::copy(rmm::exec_policy_nosync(loop_stream), - get_dataframe_buffer_begin(edge_partition_tmp_key_buffers[j]), - get_dataframe_buffer_end(edge_partition_tmp_key_buffers[j]), - get_dataframe_buffer_begin(keys) + key_segment_offsets[3]); - edge_partition_key_buffers[j] = std::move(keys); + auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); + if (process_local_edges) { + auto& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + if (edge_partition_tmp_key_buffers) { + auto const& tmp_keys = (*edge_partition_tmp_key_buffers)[j]; + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(tmp_keys), + get_dataframe_buffer_begin(tmp_keys) + h_counts[j], + get_dataframe_buffer_begin(edge_partition_key_buffers[j]) + + key_segment_offsets[3]); + } + resize_dataframe_buffer( + edge_partition_key_buffers[j], key_segment_offsets[3] + h_counts[j], loop_stream); + (*edge_partition_hypersparse_key_offset_vectors)[j].resize(h_counts[j], loop_stream); + // skip shrink_to_fit to cut execution time + } } } } } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + auto subtime3 = std::chrono::steady_clock::now(); + auto subtime4 = std::chrono::steady_clock::now(); +#endif std::conditional_t>, @@ -2249,7 +2406,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime4 = std::chrono::steady_clock::now(); + auto subtime5 = std::chrono::steady_clock::now(); #endif for (size_t j = 0; j < loop_count; ++j) { @@ -2390,9 +2547,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_stream_pool_indices); } } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime5 = std::chrono::steady_clock::now(); -#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete auto subtime6 = std::chrono::steady_clock::now(); @@ -2494,6 +2648,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_priorities.push_back(std::move(priorities)); } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime7 = std::chrono::steady_clock::now(); +#endif device_group_start(minor_comm); for (size_t j = 0; j < loop_count; ++j) { @@ -2520,6 +2677,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } device_group_end(minor_comm); if 
(stream_pool_indices) { handle.sync_stream(); } +#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete + auto subtime8 = std::chrono::steady_clock::now(); +#endif std::vector< std::variant, std::optional>>> @@ -2607,14 +2767,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime7 = std::chrono::steady_clock::now(); -#endif #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto subtime8 = std::chrono::steady_clock::now(); - auto subtime9 = std::chrono::steady_clock::now(); - auto subtime10 = std::chrono::steady_clock::now(); + auto subtime9 = std::chrono::steady_clock::now(); #endif std::vector copy_sizes(loop_count); @@ -2664,20 +2819,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_copy_sizes.push_back(std::move(copy_size)); } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - subtime9 = std::chrono::steady_clock::now(); -#endif for (size_t j = 0; j < loop_count; ++j) { auto loop_stream = stream_pool_indices ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); copy_sizes[j] = edge_partition_copy_sizes[j].value(loop_stream); } + } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - subtime10 = std::chrono::steady_clock::now(); + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + auto subtime10 = std::chrono::steady_clock::now(); #endif - } // FIXME: combine count & copy_if??? std::vector> edge_partition_values{}; @@ -2745,12 +2897,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_values.push_back(std::move(values)); } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime11 = std::chrono::steady_clock::now(); -#endif if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime12 = std::chrono::steady_clock::now(); + auto subtime11 = std::chrono::steady_clock::now(); #endif std::optional> rx_sizes{}; @@ -2793,12 +2942,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, handle.get_stream()); } } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime13 = std::chrono::steady_clock::now(); -#endif #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete handle.sync_stream(); - auto subtime14 = std::chrono::steady_clock::now(); + auto subtime12 = std::chrono::steady_clock::now(); #endif device_group_start(minor_comm); @@ -2827,16 +2973,13 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } device_group_end(minor_comm); -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime15 = std::chrono::steady_clock::now(); -#endif handle.sync_stream(); // this is required before edge_partition_values.clear(); edge_partition_values.clear(); if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } // to ensure that memory is freed #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime16 = std::chrono::steady_clock::now(); + auto subtime13 = std::chrono::steady_clock::now(); #endif if (rx_values && (size_dataframe_buffer(*rx_values) > 0)) { @@ -2858,8 +3001,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, get_dataframe_buffer_end(*rx_values), rx_offsets.begin(), tmp_vertex_value_output_first); - } - else { + } else { rmm::device_uvector rx_offsets(selected_ranks.size(), 
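// Host-side sketch of the final write-back on the root rank: the values gathered from the
// winning ranks arrive together with the key offsets they belong to, and are scattered into
// the output at those offsets (this mirrors the scatter of *rx_values into
// tmp_vertex_value_output_first via rx_offsets in the surrounding code). Illustrative only.
#include <cstddef>
#include <vector>

template <typename value_t>
void scatter_rx_values(std::vector<value_t> const& rx_values,
                       std::vector<size_t> const& rx_offsets,  // same length as rx_values
                       std::vector<value_t>& vertex_value_output)
{
  for (size_t i = 0; i < rx_values.size(); ++i) {
    vertex_value_output[rx_offsets[i]] = rx_values[i];
  }
}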
handle.get_stream()); thrust::sequence( handle.get_thrust_policy(), rx_offsets.begin(), rx_offsets.end(), size_t{0}); @@ -2878,7 +3020,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } handle.sync_stream(); #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime17 = std::chrono::steady_clock::now(); + auto subtime14 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; std::chrono::duration subdur1 = subtime2 - subtime1; std::chrono::duration subdur2 = subtime3 - subtime2; @@ -2893,24 +3035,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration subdur11 = subtime12 - subtime11; std::chrono::duration subdur12 = subtime13 - subtime12; std::chrono::duration subdur13 = subtime14 - subtime13; - std::chrono::duration subdur14 = subtime15 - subtime14; - std::chrono::duration subdur15 = subtime16 - subtime15; - std::chrono::duration subdur16 = subtime17 - subtime16; std::cerr << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," - << subdur11.count() << "," << subdur12.count() << "," << subdur13.count() << "," - << subdur14.count() << "," << subdur15.count() << "," << subdur16.count() << ")" + << subdur11.count() << "," << subdur12.count() << "," << subdur13.count() << ")" << std::endl; #endif } else { - // FIXME: better place this inside device_group_start() & device_group_end(); + device_group_start(minor_comm); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); device_reduce(minor_comm, get_dataframe_buffer_begin(major_output_buffers[j]), @@ -2918,15 +3053,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, size_dataframe_buffer(major_output_buffers[j]), ReduceOp::compatible_raft_comms_op, static_cast(partition_idx), - loop_stream); + handle.get_stream()); } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + device_group_end(minor_comm); + if (stream_pool_indices) { handle.sync_stream(); } } } } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time2 = std::chrono::steady_clock::now(); + auto time4 = std::chrono::steady_clock::now(); #endif // 10. 
communication @@ -3019,13 +3155,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } #if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time3 = std::chrono::steady_clock::now(); + auto time5 = std::chrono::steady_clock::now(); std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; + std::chrono::duration dur3 = time4 - time3; + std::chrono::duration dur4 = time5 - time4; std::cerr << "\t\t" - << "detail::per_v (prep, ep, comm) took (" << dur0.count() << "," << dur1.count() << "," - << dur2.count() << ")" << std::endl; + << "detail::per_v (pre, filter, post, ep, comm) took (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() + << ")" << std::endl; #endif } diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index 80ab3e80852..6da9b843095 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -207,7 +207,7 @@ void retrieve_vertex_list_from_bitmap( { using vertex_t = typename thrust::iterator_traits::value_type; - assert((comm.get_rank() != root) || (bitmap.size() == packed_bool_size(vertex_range_last - vertex_ragne_first))); + assert((comm.get_rank() != root) || (bitmap.size() >= packed_bool_size(vertex_range_last - vertex_ragne_first))); detail::copy_if_nosync( thrust::make_counting_iterator(vertex_range_first), thrust::make_counting_iterator(vertex_range_last), From 547fb9b3c58c540fc1669ee8c46eef32292559e5 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 5 Oct 2024 00:13:49 -0700 Subject: [PATCH 096/126] reduce allreduce size --- cpp/include/cugraph/graph_view.hpp | 91 +- .../prims/detail/per_v_transform_reduce_e.cuh | 959 +++++++++++++----- 2 files changed, 746 insertions(+), 304 deletions(-) diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index e026d7a48bc..feb64c0aa8e 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -165,7 +165,12 @@ class partition_t { return vertex_partition_range_last(partition_idx) - vertex_partition_range_first(partition_idx); } - size_t number_of_local_edge_partitions() const { return minor_comm_size_; } + size_t number_of_local_edge_partitions() const { return static_cast(minor_comm_size_); } + size_t coinciding_local_edge_partition_idx() const + { + return static_cast(minor_comm_rank_); + } // the major range of coinciding_local_edge_partition_idx()'th local edge partition coincides + // with the local vertex partition range // major: source of the edge partition (if not transposed) or destination of the edge partition // (if transposed). 
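// A small sketch of the relationship the new coinciding_local_edge_partition_idx() helper
// encodes, assuming the usual 2D edge partitioning: each GPU holds minor_comm_size local edge
// partitions, and the one indexed by minor_comm_rank has a major range identical to the GPU's
// local vertex partition range, so vertex-partition-level queries (e.g. segment offsets) can
// simply delegate to that edge partition. Hypothetical, simplified types for illustration.
#include <cstddef>
#include <vector>

struct toy_partition {
  int minor_comm_rank{};
  int minor_comm_size{};
  size_t number_of_local_edge_partitions() const { return static_cast<size_t>(minor_comm_size); }
  size_t coinciding_local_edge_partition_idx() const { return static_cast<size_t>(minor_comm_rank); }
};

template <typename T>
T const& local_vertex_partition_property(toy_partition const& p,
                                         std::vector<T> const& per_edge_partition_properties)
{
  // per_edge_partition_properties[i] belongs to the i'th local edge partition
  return per_edge_partition_properties[p.coinciding_local_edge_partition_idx()];
}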
@@ -559,6 +564,12 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_segment_offsets(partition_idx); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx) const { @@ -570,6 +581,12 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + auto partition_idx = partition_.coinciding_local_edge_partition_idx(); + return local_edge_partition_hypersparse_degree_offsets(partition_idx); + } + std::optional> local_edge_partition_hypersparse_degree_offsets( size_t partition_idx) const { @@ -927,6 +944,11 @@ class graph_view_t> local_vertex_partition_segment_offsets() const + { + return local_edge_partition_segment_offsets(size_t{0}); + } + std::optional> local_edge_partition_segment_offsets( size_t partition_idx = 0) const { @@ -934,32 +956,37 @@ class graph_view_t> local_vertex_partition_hypersparse_degree_offsets() const + { + return local_edge_partition_hypersparse_degree_offsets(size_t{0}); + } + std::optional> local_edge_partition_hypersparse_degree_offsets( size_t partition_idx = 0) const { - assert(partition_idx == 0); - return hypersparse_degree_offsets_; + assert(partition_idx == 0); + return hypersparse_degree_offsets_; } vertex_partition_view_t local_vertex_partition_view() const { - return vertex_partition_view_t(this->number_of_vertices()); + return vertex_partition_view_t(this->number_of_vertices()); } edge_partition_view_t local_edge_partition_view( size_t partition_idx = 0) const { - assert(partition_idx == 0); // there is only one edge partition in single-GPU - return edge_partition_view_t( - offsets_, indices_, this->number_of_vertices()); + assert(partition_idx == 0); // there is only one edge partition in single-GPU + return edge_partition_view_t( + offsets_, indices_, this->number_of_vertices()); } // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge // masking) edge_t number_of_edges() const { - CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); - return this->number_of_edges_; + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); + return this->number_of_edges_; } edge_t compute_number_of_edges(raft::handle_t const& handle) const; @@ -987,92 +1014,96 @@ class graph_view_t>> local_sorted_unique_edge_srcs() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_srcs(size_t partition_idx = 0) const { - assert(partition_idx == 0); - return std::nullopt; + assert(partition_idx == 0); + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_src_chunk_start_offsets() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_src_chunk_start_offsets(size_t partition_idx = 0) const { - assert(partition_idx == 0); - return std::nullopt; + assert(partition_idx == 0); + return std::nullopt; } - std::optional local_sorted_unique_edge_src_chunk_size() const { return std::nullopt; } + std::optional local_sorted_unique_edge_src_chunk_size() const { + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_src_vertex_partition_offsets() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dsts() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dsts(size_t partition_idx = 0) const { - 
assert(partition_idx == 0); - return std::nullopt; + assert(partition_idx == 0); + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dst_chunk_start_offsets() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dst_chunk_start_offsets(size_t partition_idx = 0) const { - assert(partition_idx == 0); - return std::nullopt; + assert(partition_idx == 0); + return std::nullopt; } - std::optional local_sorted_unique_edge_dst_chunk_size() const { return std::nullopt; } + std::optional local_sorted_unique_edge_dst_chunk_size() const { + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dst_vertex_partition_offsets() const { - return std::nullopt; + return std::nullopt; } void attach_edge_mask(edge_property_view_t edge_mask_view) { - edge_mask_view_ = edge_mask_view; + edge_mask_view_ = edge_mask_view; } - void clear_edge_mask() { edge_mask_view_ = std::nullopt; } + void clear_edge_mask() { + edge_mask_view_ = std::nullopt; } - bool has_edge_mask() const { return edge_mask_view_.has_value(); } + bool has_edge_mask() const { + return edge_mask_view_.has_value(); } std::optional> edge_mask_view() const { - return edge_mask_view_; + return edge_mask_view_; } private: @@ -1084,6 +1115,6 @@ class graph_view_t> hypersparse_degree_offsets_{std::nullopt}; std::optional> edge_mask_view_{std::nullopt}; -}; + }; } // namespace cugraph diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 739b79a0add..ca03765a191 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -68,10 +68,8 @@ namespace cugraph { namespace detail { -// FIXME: on A6000 we got better performance with 128, need to tune on H100 (possibly due to wasting -// less computing power on processing high degree vertices, we may use different values for -// different kernels for exhaustive tuning) -int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +int32_t constexpr per_v_transform_reduce_e_kernel_block_size = 512; +int32_t constexpr per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size = 128; template struct iterator_value_type_or_default_t; @@ -721,7 +719,9 @@ __global__ static void per_v_transform_reduce_e_high_degree( using BlockReduce = cub::BlockReduce< std::conditional_t>, int32_t, e_op_result_t>, - per_v_transform_reduce_e_kernel_block_size>; + std::is_same_v> + ? per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : per_v_transform_reduce_e_kernel_block_size>; [[maybe_unused]] __shared__ std::conditional_t temp_storage; @@ -777,16 +777,17 @@ __global__ static void per_v_transform_reduce_e_high_degree( ? 
init : identity_element; // init == identity_element for reduce_op::any if constexpr (std::is_same_v>) { - first_valid_thread_id = per_v_transform_reduce_e_kernel_block_size; + first_valid_thread_id = per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; } } if (edge_partition_e_mask) { if constexpr (update_major && std::is_same_v>) { auto rounded_up_local_degree = - ((static_cast(local_degree) + (per_v_transform_reduce_e_kernel_block_size - 1)) / - per_v_transform_reduce_e_kernel_block_size) * - per_v_transform_reduce_e_kernel_block_size; + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { thrust::optional e_op_result{thrust::nullopt}; if ((i < static_cast(local_degree)) && @@ -795,13 +796,18 @@ __global__ static void per_v_transform_reduce_e_high_degree( } first_valid_thread_id = BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : per_v_transform_reduce_e_kernel_block_size, + .Reduce(e_op_result + ? threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, cub::Min()); if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } __syncthreads(); first_valid_thread_id = output_thread_id; if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } - if (first_valid_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } } } else { for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { @@ -823,9 +829,10 @@ __global__ static void per_v_transform_reduce_e_high_degree( } else { if constexpr (update_major && std::is_same_v>) { auto rounded_up_local_degree = - ((static_cast(local_degree) + (per_v_transform_reduce_e_kernel_block_size - 1)) / - per_v_transform_reduce_e_kernel_block_size) * - per_v_transform_reduce_e_kernel_block_size; + ((static_cast(local_degree) + + (per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size - 1)) / + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) * + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size; for (size_t i = threadIdx.x; i < rounded_up_local_degree; i += blockDim.x) { thrust::optional e_op_result{thrust::nullopt}; if ((i < static_cast(local_degree)) && call_pred_op(i)) { @@ -833,13 +840,18 @@ __global__ static void per_v_transform_reduce_e_high_degree( } first_valid_thread_id = BlockReduce(temp_storage) - .Reduce(e_op_result ? threadIdx.x : per_v_transform_reduce_e_kernel_block_size, + .Reduce(e_op_result + ? 
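// Host-side sketch of the reduce_op::any selection performed by the block above: rather than
// combining values, the block only needs the value from the lowest-indexed thread that
// produced one, which is why the kernel reduces thread indices with cub::Min() and a
// separate, smaller block-size constant (128) is introduced for this path. Illustrative only.
#include <cstddef>
#include <optional>
#include <vector>

template <typename value_t>
std::optional<value_t> reduce_any(std::vector<std::optional<value_t>> const& per_thread_results)
{
  size_t first_valid = per_thread_results.size();  // plays the role of the block-wide Min()
  for (size_t tid = 0; tid < per_thread_results.size(); ++tid) {
    if (per_thread_results[tid]) {
      first_valid = tid;
      break;
    }
  }
  return (first_valid < per_thread_results.size()) ? per_thread_results[first_valid]
                                                   : std::nullopt;
}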
threadIdx.x + : per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size, cub::Min()); if (threadIdx.x == 0) { output_thread_id = first_valid_thread_id; } __syncthreads(); first_valid_thread_id = output_thread_id; if (threadIdx.x == first_valid_thread_id) { reduced_e_op_result = *e_op_result; } - if (first_valid_thread_id != per_v_transform_reduce_e_kernel_block_size) { break; } + if (first_valid_thread_id != + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) { + break; + } } } else { for (edge_t i = threadIdx.x; i < local_degree; i += blockDim.x) { @@ -862,7 +874,8 @@ __global__ static void per_v_transform_reduce_e_high_degree( if constexpr (update_major) { if constexpr (std::is_same_v>) { - if (threadIdx.x == ((first_valid_thread_id == per_v_transform_reduce_e_kernel_block_size) + if (threadIdx.x == ((first_valid_thread_id == + per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size) ? 0 : first_valid_thread_id)) { *(result_value_output + idx) = reduced_e_op_result; @@ -977,8 +990,8 @@ rmm::device_uvector compute_priorities( auto const comm_size = comm.get_size(); // For each vertex, select a comm_rank among the GPUs with a value other than init (if there are - // more than one, the GPU with (comm_rank == root) has the highest priority, the GPUs in the same - // DGX node should be the next) + // more than one, the GPU with (comm_rank == root) has the highest priority, GPUs in the same DGX + // node should be the next) rmm::device_uvector priorities(range_size, stream_view); @@ -1296,9 +1309,12 @@ void per_v_transform_reduce_e_edge_partition( edge_partition_stream_pool_indices ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]) : handle.get_stream(); - raft::grid_1d_block_t update_grid((*key_segment_offsets)[1], - detail::per_v_transform_reduce_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); + raft::grid_1d_block_t update_grid( + (*key_segment_offsets)[1], + std::is_same_v> + ? detail::per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size + : detail::per_v_transform_reduce_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); segment_key_iterator_t segment_key_first{}; if constexpr (use_input_key) { segment_key_first = edge_partition_key_first; @@ -1359,7 +1375,7 @@ void per_v_transform_reduce_e_edge_partition( } } -#define PER_V_PERFORMANCE_MEASUREMENT 1 +#define PER_V_PERFORMANCE_MEASUREMENT 1 // FIXME: delete performance logging code template > local_vertex_partition_segment_offsets{std::nullopt}; - { - size_t partition_idx = 0; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - partition_idx = static_cast(minor_comm_rank); - } - local_vertex_partition_segment_offsets = - graph_view.local_edge_partition_segment_offsets(partition_idx); - } + auto local_vertex_partition_segment_offsets = graph_view.local_vertex_partition_segment_offsets(); std::conditional_t>, std::byte /* dummy */> key_segment_offsets{}; @@ -1510,7 +1516,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } // 3. filter input keys & update key_segment_offsets -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete +#if PER_V_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time1 = std::chrono::steady_clock::now(); #endif @@ -1634,7 +1640,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } /* 4. 
compute subgroup_size (used to compute priority in device_gatherv) */ -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete +#if PER_V_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time2 = std::chrono::steady_clock::now(); #endif @@ -1673,13 +1679,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, local_v_list_range_firsts{}; std::conditional_t, std::byte /* dummy */> local_v_list_range_lasts{}; + std::conditional_t>, + std::optional>, + std::byte /* dummy */> + local_key_list_deg1_sizes{}; // if global degree is 1, any valid local value should be selected std::conditional_t>>, std::byte /* dummy */> key_segment_offset_vectors{}; if constexpr (use_input_key) { if constexpr (GraphViewType::is_multi_gpu) { - // FIMXE: refactor this code (create host_scalar_array_allgather) auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); @@ -1688,16 +1697,28 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (try_bitmap) { num_scalars += 2; // local_key_list_range_first, local_key_list_range_last } + if (filter_input_key && graph_view.use_dcs()) { + num_scalars += 1; // local_key_list_degree_1_size + } if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, handle.get_stream()); + auto hypersparse_degree_offsets = + graph_view.local_vertex_partition_hypersparse_degree_offsets(); thrust::tabulate( handle.get_thrust_policy(), d_aggregate_tmps.begin() + minor_comm_rank * num_scalars, - d_aggregate_tmps.begin() + minor_comm_rank * num_scalars + (try_bitmap ? 3 : 1), + d_aggregate_tmps.begin() + minor_comm_rank * num_scalars + (try_bitmap ? 3 : 1) + + (filter_input_key && graph_view.use_dcs() ? 1 : 0), [sorted_unique_key_first, v_list_size = static_cast( thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), + deg1_v_first = (filter_input_key && graph_view.use_dcs()) + ? 
thrust::make_optional(graph_view.local_vertex_partition_range_first() + + (*local_vertex_partition_segment_offsets)[3] + + *((*hypersparse_degree_offsets).rbegin() + 1)) + : thrust::nullopt, vertex_partition_range_first = graph_view.local_vertex_partition_range_first()] __device__(size_t i) { if constexpr (try_bitmap) { @@ -1712,8 +1733,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } assert(static_cast(static_cast(first)) == first); return static_cast(first); - } else { - assert(i == 2); + } + if (i == 2) { vertex_t last{}; if (v_list_size > 0) { last = *(sorted_unique_key_first + (v_list_size - 1)) + 1; @@ -1722,15 +1743,51 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } assert(static_cast(static_cast(last)) == last); return static_cast(last); + } else { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + deg1_v_first))); + } else { + assert(false); + return size_t{0}; + } } } else { - assert(i == 0); - return v_list_size; + if (i == 0) { + return v_list_size; + } else { + if (deg1_v_first) { + auto sorted_unique_v_first = thrust::make_transform_iterator( + sorted_unique_key_first, + cuda::proclaim_return_type([] __device__(auto key) { + return thrust_tuple_get_or_identity(key); + })); + return v_list_size - static_cast(thrust::distance( + sorted_unique_v_first, + thrust::lower_bound(thrust::seq, + sorted_unique_v_first, + sorted_unique_v_first + v_list_size, + deg1_v_first))); + } else { + assert(false); + return size_t{0}; + } + } } }); if (key_segment_offsets) { raft::update_device( - d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 3 : 1)), + d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 3 : 1) + + (filter_input_key && graph_view.use_dcs() ? 1 : 0)), (*key_segment_offsets).data(), (*key_segment_offsets).size(), handle.get_stream()); @@ -1755,6 +1812,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, local_v_list_range_firsts = std::vector(minor_comm_size); local_v_list_range_lasts = std::vector(minor_comm_size); } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + local_key_list_deg1_sizes = std::vector(minor_comm_size); + } + } if (key_segment_offsets) { key_segment_offset_vectors = std::vector>{}; (*key_segment_offset_vectors).reserve(minor_comm_size); @@ -1767,10 +1829,18 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, local_v_list_range_lasts[i] = static_cast(h_aggregate_tmps[i * num_scalars + 2]); } + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { + (*local_key_list_deg1_sizes)[i] = + static_cast(h_aggregate_tmps[i * num_scalars + (try_bitmap ? 3 : 1)]); + } + } if (key_segment_offsets) { (*key_segment_offset_vectors) - .emplace_back(h_aggregate_tmps.begin() + i * num_scalars + (try_bitmap ? 3 : 1), + .emplace_back(h_aggregate_tmps.begin() + i * num_scalars + (try_bitmap ? 3 : 1) + + ((filter_input_key && graph_view.use_dcs()) ? 1 : 0), h_aggregate_tmps.begin() + i * num_scalars + (try_bitmap ? 3 : 1) + + ((filter_input_key && graph_view.use_dcs()) ? 
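// Host-side sketch of how the degree-1 tail is sized above, assuming (as the
// hypersparse_degree_offsets query suggests) that vertices in the hypersparse region are
// grouped by degree so that deg1_v_first marks where the global-degree-1 vertices start: a
// single lower_bound on the sorted key list then yields the count of degree-1 keys (the
// device code does the same with thrust::lower_bound inside the tabulate lambda).
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <vector>

using vertex_t = int64_t;

size_t count_degree1_keys(std::vector<vertex_t> const& sorted_unique_keys, vertex_t deg1_v_first)
{
  auto it = std::lower_bound(sorted_unique_keys.begin(), sorted_unique_keys.end(), deg1_v_first);
  return static_cast<size_t>(sorted_unique_keys.end() - it);
}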
1 : 0) + (*key_segment_offsets).size()); } } @@ -1850,11 +1920,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - - size_t tmp_buffer_size_per_loop{0}; // FIXME: need to review this logic + size_t tmp_buffer_size_per_loop{0}; if constexpr (update_major) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + size_t key_size{0}; if constexpr (use_input_key) { if constexpr (std::is_arithmetic_v) { @@ -1869,25 +1939,39 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } else { value_size = sum_thrust_tuple_element_sizes(); } + size_t approx_tmp_size{0}; + if constexpr (filter_input_key) { + // use tmeporary buffers to store non-zero local degree key offsets in the hypersparse + // regioon, priorities, selected ranks (or) flags (non-root), and selected values (and + // key offsets for the selected values that are in the hypersparse region and have the + // global degree of 1) + approx_tmp_size = static_cast( + static_cast(sizeof(size_t)) * 0.25 + + static_cast(value_size) / + static_cast(minor_comm_size) /* only one value will be selected */); + } size_t aggregate_major_range_size{}; if constexpr (use_input_key) { aggregate_major_range_size = - host_scalar_allreduce(comm, - static_cast(thrust::distance(sorted_unique_key_first, - sorted_unique_nzd_key_last)), - raft::comms::op_t::SUM, - handle.get_stream()); + std::reduce(local_key_list_sizes.begin(), local_key_list_sizes.end()); } else { - aggregate_major_range_size = graph_view.number_of_vertices(); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + if constexpr (GraphViewType::is_storage_transposed) { + aggregate_major_range_size += graph_view.local_edge_partition_dst_range_size(i); + } else { + aggregate_major_range_size += graph_view.local_edge_partition_src_range_size(i); + } + } } tmp_buffer_size_per_loop = - (aggregate_major_range_size / comm_size) * (key_size + value_size); + (aggregate_major_range_size / graph_view.number_of_local_edge_partitions()) * + (key_size + value_size + approx_tmp_size); } stream_pool_indices = init_stream_pool_indices( static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * - 0.05), + 0.1), tmp_buffer_size_per_loop, graph_view.number_of_local_edge_partitions(), max_segments, @@ -1936,12 +2020,12 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // 9. 
proces local edge partitions -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete +#if PER_V_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time3 = std::chrono::steady_clock::now(); #endif for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); i += num_concurrent_loops) { -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete +#if PER_V_PERFORMANCE_MEASUREMENT auto subtime0 = std::chrono::steady_clock::now(); auto subtime1 = std::chrono::steady_clock::now(); auto subtime2 = std::chrono::steady_clock::now(); @@ -1958,6 +2042,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::byte /* dummy */> edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in the // hypersparse region + std::conditional_t>, std::byte /* dummy */> + edge_partition_deg1_hypersparse_key_offset_counts{}; if constexpr (GraphViewType::is_multi_gpu && use_input_key) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); @@ -1995,7 +2081,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, local_key_list_sizes[partition_idx], handle.get_stream())); } } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete +#if PER_V_PERFORMANCE_MEASUREMENT handle.sync_stream(); subtime1 = std::chrono::steady_clock::now(); #endif @@ -2034,7 +2120,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } device_group_end(minor_comm); if (stream_pool_indices) { handle.sync_stream(); } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete +#if PER_V_PERFORMANCE_MEASUREMENT subtime2 = std::chrono::steady_clock::now(); #endif @@ -2110,6 +2196,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_hypersparse_key_offset_vectors = std::vector>{}; (*edge_partition_hypersparse_key_offset_vectors).reserve(loop_count); + edge_partition_deg1_hypersparse_key_offset_counts = std::vector(loop_count); std::vector> edge_partition_count_scalars{}; edge_partition_count_scalars.reserve(loop_count); @@ -2362,23 +2449,32 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_key_buffers[j], key_segment_offsets[3] + h_counts[j], loop_stream); (*edge_partition_hypersparse_key_offset_vectors)[j].resize(h_counts[j], loop_stream); // skip shrink_to_fit to cut execution time + (*edge_partition_deg1_hypersparse_key_offset_counts)[j] = + size_dataframe_buffer((*edge_partition_hypersparse_key_offset_vectors)[j]) - + static_cast(thrust::distance( + get_dataframe_buffer_begin((*edge_partition_hypersparse_key_offset_vectors)[j]), + thrust::lower_bound( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin((*edge_partition_hypersparse_key_offset_vectors)[j]), + get_dataframe_buffer_end((*edge_partition_hypersparse_key_offset_vectors)[j]), + local_key_list_sizes[partition_idx] - + (*local_key_list_deg1_sizes)[partition_idx]))); } } } } } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete +#if PER_V_PERFORMANCE_MEASUREMENT if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } auto subtime3 = std::chrono::steady_clock::now(); - auto subtime4 = std::chrono::steady_clock::now(); #endif std::conditional_t>, std::byte /* dummy */> - major_output_buffers{}; + edge_partition_major_output_buffers{}; if constexpr (GraphViewType::is_multi_gpu && update_major) { - major_output_buffers.reserve(loop_count); + edge_partition_major_output_buffers.reserve(loop_count); } for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + 
j; @@ -2401,12 +2497,13 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ : edge_partition.major_range_size(); } - major_output_buffers.push_back(allocate_dataframe_buffer(buffer_size, loop_stream)); + edge_partition_major_output_buffers.push_back( + allocate_dataframe_buffer(buffer_size, loop_stream)); } } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime5 = std::chrono::steady_clock::now(); +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime4 = std::chrono::steady_clock::now(); #endif for (size_t j = 0; j < loop_count; ++j) { @@ -2509,7 +2606,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, output_buffer{}; if constexpr (GraphViewType::is_multi_gpu) { if constexpr (update_major) { - output_buffer = get_dataframe_buffer_begin(major_output_buffers[j]); + output_buffer = get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]); } else { output_buffer = edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); @@ -2548,8 +2645,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime6 = std::chrono::steady_clock::now(); +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime5 = std::chrono::steady_clock::now(); #endif if constexpr (GraphViewType::is_multi_gpu && update_major) { @@ -2570,6 +2667,21 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, rmm::device_uvector>> edge_partition_priorities{}; edge_partition_priorities.reserve(loop_count); + + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_hypersparse_non_deg1_key_offset_spans{}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + edge_partition_hypersparse_non_deg1_key_offset_spans = + std::vector>(loop_count); + } + } + + std::vector edge_partition_allreduce_sizes(loop_count); + std::vector edge_partition_contiguous_sizes(loop_count); + for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; auto loop_stream = stream_pool_indices @@ -2581,27 +2693,49 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } } - auto const& output_buffer = major_output_buffers[j]; - std::optional> hypersparse_key_offsets{std::nullopt}; + std::optional> hypersparse_non_deg1_key_offsets{ + std::nullopt}; if constexpr (filter_input_key) { if (edge_partition_hypersparse_key_offset_vectors) { - hypersparse_key_offsets = raft::device_span( + hypersparse_non_deg1_key_offsets = raft::device_span( (*edge_partition_hypersparse_key_offset_vectors)[j].data(), - (*edge_partition_hypersparse_key_offset_vectors)[j].size()); + (*edge_partition_hypersparse_key_offset_vectors)[j].size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? 
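// Host-side sketch of the buffer bookkeeping assembled below, in line with the patch's goal
// of shrinking the allreduce: per edge partition the (filtered) key list is laid out as
// [sparse-segment keys | hypersparse keys with degree >= 2 | hypersparse keys with global
// degree 1], and only the first two parts take part in the priority allreduce, since a
// degree-1 key can have a valid value on at most one rank. Names are illustrative.
#include <cstddef>

struct partition_sizes {
  size_t contiguous_size;  // keys in the non-hypersparse (sparse) segments
  size_t allreduce_size;   // keys that participate in the priority allreduce
};

partition_sizes compute_sizes(size_t key_list_size,
                              size_t sparse_segment_key_count,  // key_segment_offsets[3]
                              size_t degree1_key_count)         // degree-1 tail of the list
{
  return partition_sizes{sparse_segment_key_count, key_list_size - degree1_key_count};
}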
(*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j] = + *hypersparse_non_deg1_key_offsets; } } - size_t range_size{0}; + auto const& output_buffer = edge_partition_major_output_buffers[j]; + + size_t allreduce_size{}; + size_t contiguous_size{}; if constexpr (filter_input_key) { - range_size = local_key_list_sizes[partition_idx]; + allreduce_size = local_key_list_sizes[partition_idx]; + if (local_key_list_deg1_sizes) { + allreduce_size -= (*local_key_list_deg1_sizes)[partition_idx]; + } + if (key_segment_offset_vectors) { + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + contiguous_size = key_segment_offsets[3]; + } else { + contiguous_size = local_key_list_sizes[partition_idx]; + } } else { - range_size = size_dataframe_buffer(output_buffer); + static_assert(!use_input_key); + auto hypersparse_degree_offsets = + graph_view.local_edge_partition_hypersparse_degree_offsets(partition_idx); + allreduce_size = size_dataframe_buffer(output_buffer); + if (hypersparse_degree_offsets) { + allreduce_size -= *((*hypersparse_degree_offsets).rbegin()) - + *((*hypersparse_degree_offsets).rbegin() + 1); + } + contiguous_size = size_dtaframe_buffer(output_buffer); } - - auto contiguous_size = - hypersparse_key_offsets - ? (size_dataframe_buffer(output_buffer) - (*hypersparse_key_offsets).size()) - : range_size; + edge_partition_allreduce_sizes[j] = allreduce_size; + edge_partition_contiguous_sizes[j] = contiguous_size; std::variant, rmm::device_uvector, @@ -2611,9 +2745,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, priorities = compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), - hypersparse_key_offsets, + hypersparse_non_deg1_key_offsets, contiguous_size, - range_size, + allreduce_size, static_cast(partition_idx), subgroup_size, init, @@ -2624,9 +2758,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, priorities = compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), - hypersparse_key_offsets, + hypersparse_non_deg1_key_offsets, contiguous_size, - range_size, + allreduce_size, static_cast(partition_idx), subgroup_size, init, @@ -2636,9 +2770,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, priorities = compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), - hypersparse_key_offsets, + hypersparse_non_deg1_key_offsets, contiguous_size, - range_size, + allreduce_size, static_cast(partition_idx), subgroup_size, init, @@ -2648,8 +2782,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_priorities.push_back(std::move(priorities)); } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime7 = std::chrono::steady_clock::now(); +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime6 = std::chrono::steady_clock::now(); #endif device_group_start(minor_comm); @@ -2677,8 +2811,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } device_group_end(minor_comm); if (stream_pool_indices) { handle.sync_stream(); } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime8 = std::chrono::steady_clock::now(); +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime7 = std::chrono::steady_clock::now(); #endif std::vector< @@ -2696,27 +2830,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges 
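// Host-side sketch of the priority scheme the compute_priorities /
// compute_selected_ranks_from_priorities pair implements (simplified; the real code also
// favors ranks in the same subgroup via subgroup_size and widens the priority type to
// uint16_t/uint32_t for larger communicators): every rank holding a non-init value for a key
// proposes a priority, the root's proposal being the smallest, an element-wise min reduction
// picks the winner, and each rank then keeps only the keys it won. Illustrative names only.
#include <cstdint>
#include <limits>
#include <vector>

using priority_t = uint8_t;  // assumes comm_size fits in uint8_t

priority_t rank_to_priority(int rank, int root, int comm_size, bool has_valid_value)
{
  if (!has_valid_value) { return std::numeric_limits<priority_t>::max(); }
  return static_cast<priority_t>((rank - root + comm_size) % comm_size);  // root -> 0 (highest)
}

int priority_to_rank(priority_t priority, int root, int comm_size)
{
  return (static_cast<int>(priority) + root) % comm_size;
}

// after an element-wise min reduction of the per-key priorities across ranks
// (keys whose reduced priority is still the max had no valid value on any rank):
std::vector<int> select_winning_ranks(std::vector<priority_t> const& reduced_priorities,
                                      int root,
                                      int comm_size)
{
  std::vector<int> winners(reduced_priorities.size());
  for (size_t i = 0; i < winners.size(); ++i) {
    winners[i] = priority_to_rank(reduced_priorities[i], root, comm_size);
  }
  return winners;
}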
= false; } } - auto const& output_buffer = major_output_buffers[j]; - std::optional> hypersparse_key_offsets{std::nullopt}; + auto const& output_buffer = edge_partition_major_output_buffers[j]; + std::optional> hypersparse_non_deg1_key_offsets{ + std::nullopt}; if constexpr (filter_input_key) { if (edge_partition_hypersparse_key_offset_vectors) { - hypersparse_key_offsets = raft::device_span( - (*edge_partition_hypersparse_key_offset_vectors)[j].data(), - (*edge_partition_hypersparse_key_offset_vectors)[j].size()); + hypersparse_non_deg1_key_offsets = + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; } } - size_t range_size{0}; - if constexpr (filter_input_key) { - range_size = local_key_list_sizes[partition_idx]; - } else { - range_size = size_dataframe_buffer(output_buffer); - } - - auto contiguous_size = - hypersparse_key_offsets - ? (size_dataframe_buffer(output_buffer) - (*hypersparse_key_offsets).size()) - : range_size; + auto contiguous_size = edge_partition_contiguous_sizes[j]; auto& priorities = edge_partition_priorities[j]; std::variant, std::optional>> @@ -2726,56 +2850,156 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, minor_comm, raft::device_span(std::get<0>(priorities).data(), std::get<0>(priorities).size()), - hypersparse_key_offsets, + hypersparse_non_deg1_key_offsets, contiguous_size, static_cast(partition_idx), subgroup_size, process_local_edges ? false : true /* ignore_local_values */, loop_stream); + std::get<0>(priorities).resize(0, loop_stream); + std::get<0>(priorities).shrink_to_fit(loop_stream); } else if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint16_t selected_ranks_or_flags = compute_selected_ranks_from_priorities( minor_comm, raft::device_span(std::get<1>(priorities).data(), std::get<1>(priorities).size()), - hypersparse_key_offsets, + hypersparse_non_deg1_key_offsets, contiguous_size, static_cast(partition_idx), subgroup_size, process_local_edges ? false : true /* ignore_local_values */, loop_stream); + std::get<1>(priorities).resize(0, loop_stream); + std::get<1>(priorities).shrink_to_fit(loop_stream); } else { // priority_t == uint32_t selected_ranks_or_flags = compute_selected_ranks_from_priorities( minor_comm, raft::device_span(std::get<2>(priorities).data(), std::get<2>(priorities).size()), - hypersparse_key_offsets, + hypersparse_non_deg1_key_offsets, contiguous_size, static_cast(partition_idx), subgroup_size, process_local_edges ? false : true /* ignore_local_values */, loop_stream); + std::get<2>(priorities).resize(0, loop_stream); + std::get<2>(priorities).shrink_to_fit(loop_stream); } edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); + } +#if PER_V_PERFORMANCE_MEASUREMENT + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + auto subtime8 = std::chrono::steady_clock::now(); +#endif + + std::vector> edge_partition_values{}; + edge_partition_values.reserve(loop_count); + std::vector> edge_partition_count_scalars{}; + edge_partition_count_scalars.reserve(loop_count); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? 
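
The compute_priorities / compute_selected_ranks_from_priorities branches above differ only in the integer width used for the per-element priority: the narrowest unsigned type that can hold the minor communicator size is chosen so the allreduce moves as few bytes as possible. A minimal standalone sketch of that width selection, assuming plain std:: containers instead of rmm device buffers and a hypothetical make_priority_buffer helper (not the library code):

  #include <cstddef>
  #include <cstdint>
  #include <limits>
  #include <variant>
  #include <vector>

  using priority_buffer_t =
    std::variant<std::vector<uint8_t>, std::vector<uint16_t>, std::vector<uint32_t>>;

  // Pick the narrowest priority type wide enough for the communicator size, mirroring the
  // uint8_t / uint16_t / uint32_t branches above.
  inline priority_buffer_t make_priority_buffer(int comm_size, std::size_t n)
  {
    if (comm_size <= static_cast<int>(std::numeric_limits<uint8_t>::max())) {
      return std::vector<uint8_t>(n);
    } else if (comm_size <= static_cast<int>(std::numeric_limits<uint16_t>::max())) {
      return std::vector<uint16_t>(n);
    } else {
      return std::vector<uint32_t>(n);
    }
  }

When the minor communicator size fits in eight bits, each element of the priority allreduce costs a single byte.
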
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + bool process_local_edges = true; if constexpr (filter_input_key) { - if (edge_partition_hypersparse_key_offset_vectors) { - resize_dataframe_buffer( - (*edge_partition_hypersparse_key_offset_vectors)[j], 0, loop_stream); - shrink_to_fit_dataframe_buffer((*edge_partition_hypersparse_key_offset_vectors)[j], - loop_stream); + if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } + } + + auto& output_buffer = edge_partition_major_output_buffers[j]; + + auto values = + allocate_dataframe_buffer(size_dataframe_buffer(output_buffer), loop_stream); + rmm::device_scalar count(size_t{0}, loop_stream); + if (minor_comm_rank == static_cast(partition_idx)) { + if (process_local_edges) { + assert(!use_input_key); + assert(edge_partition_selected_ranks_or_flags[j].index() == 0); + auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + selected_ranks.begin(), + cuda::proclaim_return_type( + [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + get_dataframe_buffer_begin(values), + raft::device_span(count.data(), size_t{1}), + loop_stream); } + } else { + assert(edge_partition_selected_ranks_or_flags[j].index() == 1); + auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); + size_t input_end_offset{}; + if constexpr (filter_input_key) { + input_end_offset = + edge_partition_contiguous_sizes[j] + + (edge_partition_hypersparse_non_deg1_key_offset_spans + ? (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j].size() + : size_t{0}); + } else { + input_end_offset = edge_partition_allreduce_sizes[j]; + } + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + input_end_offset, + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span( + (*keep_flags).data(), (*keep_flags).size())] __device__(size_t offset) { + auto word = keep_flags[packed_bool_offset(offset)]; + return ((word & packed_bool_mask(offset)) != packed_bool_empty_mask()); + })), + get_dataframe_buffer_begin(values), + raft::device_span(count.data(), size_t{1}), + loop_stream); + (*keep_flags).resize(0, loop_stream); + (*keep_flags).shrink_to_fit(loop_stream); } + + edge_partition_values.push_back(std::move(values)); + edge_partition_count_scalars.push_back(std::move(count)); } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto subtime9 = std::chrono::steady_clock::now(); -#endif std::vector copy_sizes(loop_count); - { - std::vector> edge_partition_copy_sizes{}; - edge_partition_copy_sizes.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + copy_sizes[j] = edge_partition_count_scalars[j].value(loop_stream); + } + + std::optional>, + std::vector>>> + edge_partition_deg1_hypersparse_output_offset_vectors{}; + + if (graph_view.use_dcs()) { + size_t max_output_range_size{0}; + if constexpr (filter_input_key) { + max_output_range_size = std::reduce(local_key_list_sizes.begin(), + local_key_list_sizes.end(), + size_t{0}, + [](auto l, auto r) { return std::max(l, r); }); + } else { + for (size_t j = 0; j < loop_count; ++j) { + auto& output_buffer = edge_partition_major_output_buffers[j]; + max_output_range_size = + std::max(size_dataframe_buffer(output_buffer), max_output_range_size); + } + } + if (max_output_range_size < static_cast(std::numeric_limits::max())) { + edge_partition_deg1_hypersparse_output_offset_vectors = + std::vector>{}; + std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); + } else { + edge_partition_deg1_hypersparse_output_offset_vectors = + std::vector>{}; + std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); + } for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; @@ -2790,161 +3014,230 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - rmm::device_scalar copy_size(size_t{0}, loop_stream); - if (minor_comm_rank == static_cast(partition_idx)) { - if (process_local_edges) { - assert(edge_partition_selected_ranks_or_flags[j].index() == 0); - auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); - count_nosync(selected_ranks.begin(), - selected_ranks.end(), - raft::device_span(copy_size.data(), size_t{1}), - minor_comm_rank, - loop_stream); - } + auto& output_buffer = edge_partition_major_output_buffers[j]; + auto& values = edge_partition_values[j]; + auto& count = edge_partition_count_scalars[j]; + + size_t output_offset_buf_size{0}; + if constexpr (filter_input_key) { + output_offset_buf_size = (*edge_partition_deg1_hypersparse_key_offset_counts)[j]; } else { - assert(edge_partition_selected_ranks_or_flags[j].index() == 1); - auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); - if (keep_flags) { - auto count_first = thrust::make_transform_iterator( - (*keep_flags).begin(), - cuda::proclaim_return_type( - [] __device__(uint32_t word) { return static_cast(__popc(word)); })); - sum_nosync(count_first, - count_first + (*keep_flags).size(), - raft::device_span(copy_size.data(), size_t{1}), - loop_stream); - } + assert(!use_input_key); + output_offset_buf_size = + size_dataframe_buffer(output_buffer) - edge_partition_allreduce_sizes[j]; } - edge_partition_copy_sizes.push_back(std::move(copy_size)); - } - - for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - copy_sizes[j] = edge_partition_copy_sizes[j].value(loop_stream); - } - } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } - auto subtime10 = std::chrono::steady_clock::now(); -#endif - - // FIXME: combine count & copy_if??? - std::vector> edge_partition_values{}; - edge_partition_values.reserve(loop_count); - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); + std::variant, rmm::device_uvector> + output_offsets = rmm::device_uvector(0, loop_stream); + if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { + std::get<0>(output_offsets).resize(output_offset_buf_size, loop_stream); + } else { + output_offsets = rmm::device_uvector(output_offset_buf_size, loop_stream); + } - auto& output_buffer = major_output_buffers[j]; - auto copy_size = copy_sizes[j]; + size_t input_start_offset{}; + if constexpr (filter_input_key) { + input_start_offset = + edge_partition_contiguous_sizes[j] + + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j].size(); + } else { + static_assert(!use_input_key); + input_start_offset = edge_partition_allreduce_sizes[j]; + } + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + cuda::proclaim_return_type( + [init] __device__(auto val) { return val != init; })); - auto values = allocate_dataframe_buffer(0, loop_stream); - if (minor_comm_rank == static_cast(partition_idx)) { - if (copy_size > 0) { - if constexpr (filter_input_key) { - assert(false); // should not be reached + if constexpr (filter_input_key) { + assert(static_cast(partition_idx) != minor_comm_rank); + auto& hypersparse_key_offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (output_offsets.index() == 0) { + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + thrust::make_transform_iterator( + hypersparse_key_offsets.begin() + + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j].size(), + typecast_t{})); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(count.data(), size_t{1}), + loop_stream); } else { - assert(edge_partition_selected_ranks_or_flags[j].index() == 0); - auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); - resize_dataframe_buffer(values, copy_size, loop_stream); - rmm::device_scalar dummy(size_t{0}, - loop_stream); // we already know the count + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + hypersparse_key_offsets.begin() + + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j].size()); copy_if_nosync( - get_dataframe_buffer_begin(output_buffer), - get_dataframe_buffer_end(output_buffer), - thrust::make_transform_iterator( - selected_ranks.begin(), - cuda::proclaim_return_type( - [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), - get_dataframe_buffer_begin(values), - raft::device_span(dummy.data(), size_t{1}), + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(count.data(), size_t{1}), + loop_stream); + } + hypersparse_key_offsets.resize(0, loop_stream); + hypersparse_key_offsets.shrink_to_fit(loop_stream); + } else { + static_assert(!use_input_key); + assert(process_local_edges); + if (output_offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + 
thrust::make_counting_iterator(uint32_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(count.data(), size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(size_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(count.data(), size_t{1}), loop_stream); } } - } else { - if (copy_size > 0) { - assert(edge_partition_selected_ranks_or_flags[j].index() == 1); - auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); - resize_dataframe_buffer(values, copy_size, loop_stream); - rmm::device_scalar dummy(size_t{0}, - loop_stream); // we already know the count - copy_if_nosync( - get_dataframe_buffer_begin(output_buffer), - get_dataframe_buffer_end(output_buffer), - thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [keep_flags = raft::device_span( - (*keep_flags).data(), (*keep_flags).size())] __device__(size_t offset) { - auto word = keep_flags[packed_bool_offset(offset)]; - return ((word & packed_bool_mask(offset)) != packed_bool_empty_mask()); - })), - get_dataframe_buffer_begin(values), - raft::device_span(dummy.data(), size_t{1}), - loop_stream); - (*keep_flags).resize(0, loop_stream); - (*keep_flags).shrink_to_fit(loop_stream); + + if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { + std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors) + .push_back(std::move(std::get<0>(output_offsets))); + } else { + assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); + std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors) + .push_back(std::move(std::get<1>(output_offsets))); } + + resize_dataframe_buffer(output_buffer, 0, loop_stream); + shrink_to_fit_dataframe_buffer(output_buffer, loop_stream); } - resize_dataframe_buffer(output_buffer, 0, loop_stream); - shrink_to_fit_dataframe_buffer(output_buffer, loop_stream); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + auto deg1_copy_size = edge_partition_count_scalars[j].value(loop_stream); + copy_sizes[j] += deg1_copy_size; + if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { + std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j].resize( + deg1_copy_size, loop_stream); + } else { + assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); + std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j].resize( + deg1_copy_size, loop_stream); + } + // skip shrink_to_fit() to cut execution time + } + } - edge_partition_values.push_back(std::move(values)); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + resize_dataframe_buffer(edge_partition_values[j], copy_sizes[j], loop_stream); + // skip shrink_to_fit() to cut execution time } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime11 = std::chrono::steady_clock::now(); +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime9 = std::chrono::steady_clock::now(); #endif - std::optional> rx_sizes{}; - std::optional> rx_displs{}; + std::optional> rx_value_sizes{}; + std::optional> rx_value_displs{}; std::optional> rx_values{}; + + std::optional> rx_offset_sizes{}; + std::optional> rx_offset_displs{}; + std::optional, rmm::device_uvector>> + rx_offsets{}; { - std::vector h_value_buffer_sizes(loop_count); + auto size_per_rank = loop_count * (graph_view.use_dcs() ? 2 : 1); + rmm::device_uvector d_aggregate_buffer_sizes(minor_comm_size * size_per_rank, + handle.get_stream()); + std::vector h_buffer_sizes(size_per_rank); for (size_t j = 0; j < loop_count; ++j) { - h_value_buffer_sizes[j] = size_dataframe_buffer(edge_partition_values[j]); + h_buffer_sizes[j] = size_dataframe_buffer(edge_partition_values[j]); + if (graph_view.use_dcs()) { + if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { + h_buffer_sizes[loop_count + j] = + std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j].size(); + } else { + assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); + h_buffer_sizes[loop_count + j] = + std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j].size(); + } + } } - rmm::device_uvector d_aggregate_value_buffer_sizes(minor_comm_size * loop_count, - handle.get_stream()); - raft::update_device(d_aggregate_value_buffer_sizes.data() + minor_comm_rank * loop_count, - h_value_buffer_sizes.data(), - h_value_buffer_sizes.size(), + raft::update_device(d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + h_buffer_sizes.data(), + h_buffer_sizes.size(), handle.get_stream()); device_allgather(minor_comm, - d_aggregate_value_buffer_sizes.data() + minor_comm_rank * loop_count, - d_aggregate_value_buffer_sizes.data(), - loop_count, + d_aggregate_buffer_sizes.data() + minor_comm_rank * size_per_rank, + d_aggregate_buffer_sizes.data(), + size_per_rank, handle.get_stream()); if (static_cast(minor_comm_rank / num_concurrent_loops) == (i / num_concurrent_loops)) { - std::vector h_aggregate_value_buffer_sizes( - d_aggregate_value_buffer_sizes.size()); - raft::update_host(h_aggregate_value_buffer_sizes.data(), - d_aggregate_value_buffer_sizes.data(), - d_aggregate_value_buffer_sizes.size(), + std::vector h_aggregate_buffer_sizes(d_aggregate_buffer_sizes.size()); + raft::update_host(h_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.data(), + d_aggregate_buffer_sizes.size(), handle.get_stream()); handle.sync_stream(); - auto j = static_cast(minor_comm_rank % num_concurrent_loops); - rx_sizes = std::vector(minor_comm_size); - rx_displs = std::vector(minor_comm_size); + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + rx_value_sizes = std::vector(minor_comm_size); + rx_value_displs = std::vector(minor_comm_size); + if (graph_view.use_dcs()) { + rx_offset_sizes = std::vector(minor_comm_size); + rx_offset_displs = std::vector(minor_comm_size); + } for (int k = 0; k < minor_comm_size; ++k) { - (*rx_sizes)[k] = h_aggregate_value_buffer_sizes[k * loop_count + j]; + (*rx_value_sizes)[k] = 
h_aggregate_buffer_sizes[k * size_per_rank + j]; + if (graph_view.use_dcs()) { + (*rx_offset_sizes)[k] = + h_aggregate_buffer_sizes[k * size_per_rank + loop_count + j]; + } + } + std::exclusive_scan((*rx_value_sizes).begin(), + (*rx_value_sizes).end(), + (*rx_value_displs).begin(), + size_t{0}); + std::exclusive_scan((*rx_offset_sizes).begin(), + (*rx_offset_sizes).end(), + (*rx_offset_displs).begin(), + size_t{0}); + rx_values = allocate_dataframe_buffer( + (*rx_value_displs).back() + (*rx_value_sizes).back(), handle.get_stream()); + if (graph_view.use_dcs()) { + if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } else { + assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); + rx_offsets = rmm::device_uvector( + (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); + } } - std::exclusive_scan( - (*rx_sizes).begin(), (*rx_sizes).end(), (*rx_displs).begin(), size_t{0}); - rx_values = allocate_dataframe_buffer((*rx_displs).back() + (*rx_sizes).back(), - handle.get_stream()); } } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete +#if PER_V_PERFORMANCE_MEASUREMENT handle.sync_stream(); - auto subtime12 = std::chrono::steady_clock::now(); + auto subtime10 = std::chrono::steady_clock::now(); #endif device_group_start(minor_comm); @@ -2957,8 +3250,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, get_dataframe_buffer_begin(values), get_dataframe_buffer_begin(*rx_values), values.size(), - *rx_sizes, - *rx_displs, + *rx_value_sizes, + *rx_value_displs, static_cast(partition_idx), handle.get_stream()); } else { @@ -2973,54 +3266,176 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } device_group_end(minor_comm); + if (graph_view.use_dcs()) { + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto& values = edge_partition_values[j]; + + if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { + auto& offsets = + std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + offsets.data(), + std::get<0>(*rx_offsets).data(), + offsets.size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + offsets.data(), + static_cast(nullptr), + offsets.size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } else { + assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); + auto& offsets = + std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (minor_comm_rank == static_cast(partition_idx)) { + device_gatherv(minor_comm, + offsets.data(), + std::get<1>(*rx_offsets).data(), + offsets.size(), + *rx_offset_sizes, + *rx_offset_displs, + static_cast(partition_idx), + handle.get_stream()); + } else { + device_gatherv(minor_comm, + offsets.data(), + static_cast(nullptr), + offsets.size(), + std::vector{}, + std::vector{}, + static_cast(partition_idx), + handle.get_stream()); + } + } + } + device_group_end(minor_comm); + } handle.sync_stream(); // this is required before edge_partition_values.clear(); edge_partition_values.clear(); if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); - } // to ensure that memory is freed -#if 
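
The size exchange above follows the usual variable-length gather recipe: allgather every rank's send counts, turn counts into starting displacements with an exclusive prefix sum, and size the receive buffer from the last displacement plus the last count. A small host-only sketch of the displacement step, with made-up sizes (not taken from the patch):

  #include <cstddef>
  #include <numeric>
  #include <vector>

  int main()
  {
    std::vector<std::size_t> rx_sizes{3, 0, 5, 2};  // hypothetical per-rank receive counts
    std::vector<std::size_t> rx_displs(rx_sizes.size());
    std::exclusive_scan(rx_sizes.begin(), rx_sizes.end(), rx_displs.begin(), std::size_t{0});
    // rx_displs == {0, 3, 3, 8}; the receive buffer needs rx_displs.back() + rx_sizes.back() == 10
    // elements, which is how rx_values (and rx_offsets) are sized above.
    return 0;
  }
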
PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime13 = std::chrono::steady_clock::now(); + } // to ensure that memory is freed +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime11 = std::chrono::steady_clock::now(); #endif if (rx_values && (size_dataframe_buffer(*rx_values) > 0)) { - auto j = static_cast(minor_comm_rank % num_concurrent_loops); + auto j = static_cast(minor_comm_rank % num_concurrent_loops); + auto partition_idx = i + j; + auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + auto old_size = selected_ranks.size(); + size_t output_range_size{}; + if constexpr (filter_input_key) { + output_range_size = local_key_list_sizes[partition_idx]; + } else { + auto const& segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + output_range_size = + segment_offsets + ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : graph_view.local_vertex_partition_range_size(); + } + selected_ranks.resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + selected_ranks.begin() + old_size, + selected_ranks.end(), + minor_comm_size); + if (rx_offsets) { + rmm::device_uvector lasts((*rx_offset_displs).size(), handle.get_stream()); + raft::update_device(lasts.data(), + (*rx_offset_displs).data() + 1, + (*rx_offset_displs).size() - 1, + handle.get_stream()); + auto num_elements = (*rx_offset_displs).back() + (*rx_offset_sizes).back(); + lasts.set_element_async(lasts.size() - 1, num_elements, handle.get_stream()); + + if ((*rx_offsets).index() == 0) { + auto& offsets = std::get<0>(*rx_offsets); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + selected_ranks.data(), selected_ranks.size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } else { + assert((*rx_offsets).index() == 1); + auto& offsets = std::get<1>(*rx_offsets); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + selected_ranks.data(), selected_ranks.size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + offsets.resize(0, handle.get_stream()); + offsets.shrink_to_fit(handle.get_stream()); + } + } + // FIXME: we may use 8 bit ranks to further cut sort time if (selected_ranks.size() <= std::numeric_limits::max()) { - rmm::device_uvector rx_offsets(selected_ranks.size(), handle.get_stream()); + rmm::device_uvector rx_positions(selected_ranks.size(), handle.get_stream()); thrust::sequence( - handle.get_thrust_policy(), rx_offsets.begin(), rx_offsets.end(), uint32_t{0}); + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), uint32_t{0}); thrust::stable_sort_by_key(handle.get_thrust_policy(), 
selected_ranks.begin(), selected_ranks.end(), - rx_offsets.begin()); + rx_positions.begin()); // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value - rx_offsets.resize((*rx_displs).back() + (*rx_sizes).back(), handle.get_stream()); + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); thrust::scatter(handle.get_thrust_policy(), get_dataframe_buffer_begin(*rx_values), get_dataframe_buffer_end(*rx_values), - rx_offsets.begin(), + rx_positions.begin(), tmp_vertex_value_output_first); } else { - rmm::device_uvector rx_offsets(selected_ranks.size(), handle.get_stream()); + rmm::device_uvector rx_positions(selected_ranks.size(), handle.get_stream()); thrust::sequence( - handle.get_thrust_policy(), rx_offsets.begin(), rx_offsets.end(), size_t{0}); + handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), size_t{0}); thrust::stable_sort_by_key(handle.get_thrust_policy(), selected_ranks.begin(), selected_ranks.end(), - rx_offsets.begin()); + rx_positions.begin()); // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value - rx_offsets.resize((*rx_displs).back() + (*rx_sizes).back(), handle.get_stream()); + rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), + handle.get_stream()); thrust::scatter(handle.get_thrust_policy(), get_dataframe_buffer_begin(*rx_values), get_dataframe_buffer_end(*rx_values), - rx_offsets.begin(), + rx_positions.begin(), tmp_vertex_value_output_first); } } handle.sync_stream(); -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete - auto subtime14 = std::chrono::steady_clock::now(); +#if PER_V_PERFORMANCE_MEASUREMENT + auto subtime12 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; std::chrono::duration subdur1 = subtime2 - subtime1; std::chrono::duration subdur2 = subtime3 - subtime2; @@ -3033,14 +3448,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::chrono::duration subdur9 = subtime10 - subtime9; std::chrono::duration subdur10 = subtime11 - subtime10; std::chrono::duration subdur11 = subtime12 - subtime11; - std::chrono::duration subdur12 = subtime13 - subtime12; - std::chrono::duration subdur13 = subtime14 - subtime13; std::cerr << "sub (per_v) took (" << subdur0.count() << "," << subdur1.count() << "," << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," - << subdur11.count() << "," << subdur12.count() << "," << subdur13.count() << ")" - << std::endl; + << subdur11.count() << ")" << std::endl; #endif } else { device_group_start(minor_comm); @@ -3048,9 +3460,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto partition_idx = i + j; device_reduce(minor_comm, - get_dataframe_buffer_begin(major_output_buffers[j]), + get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]), tmp_vertex_value_output_first, - size_dataframe_buffer(major_output_buffers[j]), + size_dataframe_buffer(edge_partition_major_output_buffers[j]), ReduceOp::compatible_raft_comms_op, static_cast(partition_idx), handle.get_stream()); @@ -3060,7 +3472,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } } -#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete +#if PER_V_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time4 = std::chrono::steady_clock::now(); #endif @@ 
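
The stable_sort_by_key / scatter pair above relies on device_gatherv concatenating the incoming values in rank order: stable-sorting the output positions by their selected rank reproduces exactly that order, so the i-th gathered value belongs at the i-th sorted position. A host-side sketch of the same idea, assuming std:: algorithms in place of thrust and made-up data:

  #include <algorithm>
  #include <cstddef>
  #include <numeric>
  #include <vector>

  int main()
  {
    std::vector<int> selected_ranks{1, 0, 2, 0, 1};  // winning rank per output position
    std::vector<std::size_t> positions(selected_ranks.size());
    std::iota(positions.begin(), positions.end(), std::size_t{0});
    std::stable_sort(positions.begin(), positions.end(), [&](std::size_t l, std::size_t r) {
      return selected_ranks[l] < selected_ranks[r];
    });
    // positions == {1, 3, 0, 4, 2}: rank 0's outputs first, then rank 1's, then rank 2's,
    // matching the order in which rx_values was gathered.
    std::vector<double> rx_values{10., 20., 30., 40., 50.};
    std::vector<double> output(positions.size());
    for (std::size_t i = 0; i < rx_values.size(); ++i) { output[positions[i]] = rx_values[i]; }
    // output == {30, 10, 50, 20, 40}
    return 0;
  }

Positions whose selected rank equals minor_comm_size (no GPU contributed a non-init value) sort to the end and are discarded by resizing rx_positions down to the total received size, as in the code above.
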
-3153,7 +3565,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
 }
 }
 }
-#if PER_V_PERFORMANCE_MEASUREMENT // FIXME: delete
+#if PER_V_PERFORMANCE_MEASUREMENT
 RAFT_CUDA_TRY(cudaDeviceSynchronize());
 auto time5 = std::chrono::steady_clock::now();
 std::chrono::duration dur0 = time1 - time0;
@@ -3161,10 +3573,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle,
 std::chrono::duration dur2 = time3 - time2;
 std::chrono::duration dur3 = time4 - time3;
 std::chrono::duration dur4 = time5 - time4;
- std::cerr << "\t\t"
- << "detail::per_v (pre, filter, post, ep, comm) took (" << dur0.count() << ","
+ std::cerr << "detail::per_v (pre, filter, post, ep, comm) took (" << dur0.count() << ","
 << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count()
- << ")" << std::endl;
+ << ") num_concurrent_loops=" << num_concurrent_loops << std::endl;
 #endif
 }

From 969a943289fe40a2af8278c71723df8155274af2 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Tue, 8 Oct 2024 15:28:58 -0700
Subject: [PATCH 097/126] performance-optimize shuffle&global-reduce in
 transform_reduce_v_frontier_outgoing_e

---
 .../cugraph/utilities/shuffle_comm.cuh | 136 ++++
 .../cugraph/utilities/thrust_tuple_utils.hpp | 26 +
 ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 663 ++++++++++++++++--
 3 files changed, 759 insertions(+), 66 deletions(-)

diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh
index 303bb5694cf..39a8ed1a7b7 100644
--- a/cpp/include/cugraph/utilities/shuffle_comm.cuh
+++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh
@@ -871,6 +871,142 @@ auto shuffle_values(raft::comms::comms_t const& comm,
 return std::make_tuple(std::move(rx_value_buffer), rx_counts);
 }

+// Add gaps in the receive buffer to enforce that the sent data offset and the received data offset
+// have the same alignment for every rank.
This is faster assuming that @p alignment ensures cache +// line alignment in both send & receive buffer (tested with NCCL 2.23.4) +template +auto shuffle_values( + raft::comms::comms_t const& comm, + TxValueIterator tx_value_first, + std::vector const& tx_value_counts, + size_t alignment, // # elements + std::optional::value_type> fill_value, + rmm::cuda_stream_view stream_view) +{ + using value_t = typename thrust::iterator_traits::value_type; + + auto const comm_size = comm.get_size(); + + std::vector tx_value_displacements(tx_value_counts.size()); + std::exclusive_scan( + tx_value_counts.begin(), tx_value_counts.end(), tx_value_displacements.begin(), size_t{0}); + + std::vector tx_unaligned_counts(comm_size); + std::vector tx_displacements(comm_size); + std::vector tx_aligned_counts(comm_size); + std::vector tx_aligned_displacements(comm_size); + std::vector rx_unaligned_counts(comm_size); + std::vector rx_displacements(comm_size); + std::vector rx_aligned_counts(comm_size); + std::vector rx_aligned_displacements(comm_size); + std::vector tx_ranks(comm_size); + std::iota(tx_ranks.begin(), tx_ranks.end(), int{0}); + auto rx_ranks = tx_ranks; + for (size_t i = 0; i < tx_value_counts.size(); ++i) { + tx_unaligned_counts[i] = 0; + if (tx_value_displacements[i] % alignment != 0) { + tx_unaligned_counts[i] = + std::min(alignment - (tx_value_displacements[i] % alignment), tx_value_counts[i]); + } + tx_displacements[i] = tx_value_displacements[i]; + tx_aligned_counts[i] = tx_value_counts[i] - tx_unaligned_counts[i]; + tx_aligned_displacements[i] = tx_value_displacements[i] + tx_unaligned_counts[i]; + } + + rmm::device_uvector d_tx_unaligned_counts(tx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_tx_aligned_counts(tx_aligned_counts.size(), stream_view); + rmm::device_uvector d_rx_unaligned_counts(rx_unaligned_counts.size(), stream_view); + rmm::device_uvector d_rx_aligned_counts(rx_aligned_counts.size(), stream_view); + raft::update_device(d_tx_unaligned_counts.data(), + tx_unaligned_counts.data(), + tx_unaligned_counts.size(), + stream_view); + raft::update_device( + d_tx_aligned_counts.data(), tx_aligned_counts.data(), tx_aligned_counts.size(), stream_view); + std::vector tx_counts(comm_size, size_t{1}); + std::vector tx_offsets(comm_size); + std::iota(tx_offsets.begin(), tx_offsets.end(), size_t{0}); + auto rx_counts = tx_counts; + auto rx_offsets = tx_offsets; + cugraph::device_multicast_sendrecv(comm, + d_tx_unaligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_unaligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + cugraph::device_multicast_sendrecv(comm, + d_tx_aligned_counts.data(), + tx_counts, + tx_offsets, + tx_ranks, + d_rx_aligned_counts.data(), + rx_counts, + rx_offsets, + rx_ranks, + stream_view); + raft::update_host(rx_unaligned_counts.data(), + d_rx_unaligned_counts.data(), + d_rx_unaligned_counts.size(), + stream_view); + raft::update_host( + rx_aligned_counts.data(), d_rx_aligned_counts.data(), d_rx_aligned_counts.size(), stream_view); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + size_t offset{0}; + for (size_t i = 0; i < rx_counts.size(); ++i) { + auto target_alignment = (alignment - rx_unaligned_counts[i]) % alignment; + auto cur_alignment = offset % alignment; + if (target_alignment >= cur_alignment) { + offset += target_alignment - cur_alignment; + } else { + offset += (target_alignment + alignment) - cur_alignment; + } + rx_displacements[i] = offset; + rx_aligned_displacements[i] = rx_displacements[i] 
+ rx_unaligned_counts[i]; + offset = rx_aligned_displacements[i] + rx_aligned_counts[i]; + } + + auto rx_values = allocate_dataframe_buffer( + rx_aligned_displacements.back() + rx_aligned_counts.back(), stream_view); + if (fill_value) { + thrust::fill(rmm::exec_policy_nosync(stream_view), + get_dataframe_buffer_begin(rx_values), + get_dataframe_buffer_end(rx_values), + *fill_value); + } + cugraph::device_multicast_sendrecv(comm, + tx_value_first, + tx_unaligned_counts, + tx_displacements, + tx_ranks, + get_dataframe_buffer_begin(rx_values), + rx_unaligned_counts, + rx_displacements, + rx_ranks, + stream_view); + cugraph::device_multicast_sendrecv(comm, + tx_value_first, + tx_aligned_counts, + tx_aligned_displacements, + tx_ranks, + get_dataframe_buffer_begin(rx_values), + rx_aligned_counts, + rx_aligned_displacements, + rx_ranks, + stream_view); + + return std::make_tuple(std::move(rx_values), + tx_unaligned_counts, + tx_aligned_counts, + tx_displacements, + rx_unaligned_counts, + rx_aligned_counts, + rx_displacements); +} + // this uses less memory than calling shuffle_values then sort & unique but requires comm.get_size() // - 1 communication steps template diff --git a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp index 2c36ed33359..29b9d132ef8 100644 --- a/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp +++ b/cpp/include/cugraph/utilities/thrust_tuple_utils.hpp @@ -64,6 +64,18 @@ size_t sum_thrust_tuple_element_sizes(std::index_sequence) return (... + sizeof(typename thrust::tuple_element::type)); } +template +size_t min_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::min(sizeof(typename thrust::tuple_element::type)...); +} + +template +size_t max_thrust_tuple_element_sizes(std::index_sequence) +{ + return std::max(sizeof(typename thrust::tuple_element::type)...); +} + template auto thrust_tuple_to_std_tuple(TupleType tup, std::index_sequence) { @@ -181,6 +193,20 @@ constexpr size_t sum_thrust_tuple_element_sizes() std::make_index_sequence::value>()); } +template +constexpr size_t min_thrust_tuple_element_sizes() +{ + return detail::min_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + +template +constexpr size_t max_thrust_tuple_element_sizes() +{ + return detail::max_thrust_tuple_element_sizes( + std::make_index_sequence::value>()); +} + template auto thrust_tuple_to_std_tuple(TupleType tup) { diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 29d9549eb7f..67ceb2a482f 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -109,14 +110,151 @@ struct transform_reduce_v_frontier_call_e_op_t { } }; -template -auto sort_and_reduce_buffer_elements( +template +struct update_keep_flag_t { + using input_key_t = + typename thrust::iterator_traits::value_type; // uint32_t (compressed) or + // key_t (i.e. 
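
The displacement loop above is the heart of this aligned shuffle_values overload: each incoming range is shifted so that its unaligned head ends exactly on an alignment boundary, possibly leaving a small gap before it, so the bulk of every send and receive starts cache-line aligned. A standalone sketch of that computation with made-up counts, assuming an alignment of 32 elements (a 128-byte cache line of 4-byte elements); this is an illustration, not the library routine:

  #include <cstddef>
  #include <vector>

  int main()
  {
    std::size_t alignment = 32;  // elements per cache line (assumed)
    std::vector<std::size_t> rx_unaligned_counts{5, 0, 17};  // hypothetical per-rank head sizes
    std::vector<std::size_t> rx_aligned_counts{64, 32, 96};
    std::vector<std::size_t> rx_displacements(rx_unaligned_counts.size());
    std::vector<std::size_t> rx_aligned_displacements(rx_unaligned_counts.size());
    std::size_t offset{0};
    for (std::size_t i = 0; i < rx_unaligned_counts.size(); ++i) {
      auto target_alignment = (alignment - rx_unaligned_counts[i]) % alignment;
      auto cur_alignment    = offset % alignment;
      offset += (target_alignment >= cur_alignment)
                  ? (target_alignment - cur_alignment)
                  : (target_alignment + alignment - cur_alignment);
      rx_displacements[i]         = offset;
      rx_aligned_displacements[i] = rx_displacements[i] + rx_unaligned_counts[i];
      offset = rx_aligned_displacements[i] + rx_aligned_counts[i];
    }
    // rx_displacements == {27, 96, 143}: 27 + 5, 96 + 0, and 143 + 17 are all multiples of 32,
    // so every aligned segment starts on a cache-line boundary; the gaps between ranges are
    // what an optional fill_value initializes.
    return 0;
  }

The element alignment itself is derived in the caller as cache_line_size / min_element_size; with the 128-byte line and 4-byte minimum element size assumed here, that is the 32 elements used above.
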
vertex_t) + + raft::device_span bitmap{}; + raft::device_span keep_flags{}; + key_t v_range_first{}; + InputKeyIterator input_key_first{}; + thrust::optional invalid_input_key{}; + + __device__ void operator()(size_t i) const + { + auto v = *(input_key_first + i); + if (invalid_input_key && (v == *invalid_input_key)) { + return; // just discard + } + input_key_t v_offset{}; + if constexpr ((sizeof(key_t) == 8) && std::is_same_v) { + v_offset = v; + } else { + v_offset = v - v_range_first; + } + cuda::atomic_ref bitmap_word( + bitmap[packed_bool_offset(v_offset)]); + auto old = bitmap_word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + if ((old & packed_bool_mask(v_offset)) == packed_bool_empty_mask()) { + cuda::atomic_ref keep_flag_word( + keep_flags[packed_bool_offset(i)]); + keep_flag_word.fetch_or(packed_bool_mask(i), cuda::std::memory_order_relaxed); + } + } +}; + +template +std::tuple, optional_dataframe_buffer_type_t> +sort_and_reduce_buffer_elements( raft::handle_t const& handle, - decltype(allocate_dataframe_buffer(0, rmm::cuda_stream_view{}))&& key_buffer, - decltype(allocate_optional_dataframe_buffer(0, - rmm::cuda_stream_view{}))&& payload_buffer, - ReduceOp reduce_op) + dataframe_buffer_type_t&& key_buffer, + optional_dataframe_buffer_type_t&& payload_buffer, + ReduceOp reduce_op, + std::conditional_t, std::tuple, std::byte /* dummy */> + vertex_range, + std::optional invalid_key /* drop (key, (payload)) pairs with invalid key */) { + constexpr bool compressed = + std::is_integral_v && (sizeof(key_t) == 8) && + std::is_same_v; // we currently compress only when key_t is an integral + // type (i.e. vertex_t) + static_assert(compressed || std::is_same_v); + + if constexpr (std::is_integral_v && + (std::is_same_v || + std::is_same_v>)) { // try to use + // bitmap for + // filtering + key_t range_size = std::get<1>(vertex_range) - std::get<0>(vertex_range); + if (static_cast(size_dataframe_buffer(key_buffer)) >= + static_cast(range_size) * + 0.125 /* tuning parameter */) { // use bitmap for filtering + rmm::device_uvector bitmap(packed_bool_size(range_size), handle.get_stream()); + rmm::device_uvector keep_flags(packed_bool_size(size_dataframe_buffer(key_buffer)), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + thrust::fill( + handle.get_thrust_policy(), keep_flags.begin(), keep_flags.end(), packed_bool_empty_mask()); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(size_dataframe_buffer(key_buffer)), + update_keep_flag_t{ + raft::device_span(bitmap.data(), bitmap.size()), + raft::device_span(keep_flags.data(), keep_flags.size()), + std::get<0>(vertex_range), + get_dataframe_buffer_begin(key_buffer), + to_thrust_optional(invalid_key)}); + auto stencil_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [keep_flags = raft::device_span(keep_flags.data(), + keep_flags.size())] __device__(size_t i) { + return (keep_flags[packed_bool_offset(i)] & packed_bool_mask(i)) != + packed_bool_empty_mask(); + })); + if constexpr (std::is_same_v) { + resize_dataframe_buffer( + key_buffer, + thrust::distance(get_dataframe_buffer_begin(key_buffer), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + 
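
update_keep_flag_t above deduplicates (and drops invalid) keys before the sort by setting one bit per vertex offset in a shared bitmap with an atomic fetch_or: only the thread that flips a bit from 0 to 1 marks its element as kept. A sequential host sketch of the same filtering rule (no atomics needed single-threaded; bitmap_filter is a hypothetical name, not a cuGraph function):

  #include <cstddef>
  #include <cstdint>
  #include <optional>
  #include <vector>

  std::vector<uint32_t> bitmap_filter(std::vector<uint32_t> const& v_offsets,
                                      uint32_t range_size,
                                      std::optional<uint32_t> invalid_offset)
  {
    std::vector<uint32_t> bitmap((range_size + 31) / 32, 0);  // one bit per vertex offset
    std::vector<uint32_t> kept{};
    for (auto v_offset : v_offsets) {
      if (invalid_offset && (v_offset == *invalid_offset)) { continue; }  // just discard
      auto& word = bitmap[v_offset / 32];
      auto  mask = uint32_t{1} << (v_offset % 32);
      if ((word & mask) == 0) {  // first occurrence within this buffer
        word |= mask;
        kept.push_back(v_offset);
      }
    }
    return kept;
  }

Per the tuning parameter above, this bitmap path is taken only when the key buffer holds at least roughly one eighth of the destination vertex range, since an O(range_size) bitmap is wasted work for very sparse frontiers.
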
shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + thrust::sort(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer)); + } else { + static_assert(std::is_same_v>); + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + stencil_first, + is_not_equal_t{true})), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); + thrust::sort_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + } + + if constexpr (compressed) { + rmm::device_uvector output_key_buffer(key_buffer.size(), handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + key_buffer.begin(), + key_buffer.end(), + output_key_buffer.begin(), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(uint32_t v_offset) { + return static_cast(v_first + v_offset); + })); + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); + } else { + return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + } + } + } + if constexpr (std::is_same_v) { thrust::sort(handle.get_thrust_policy(), get_dataframe_buffer_begin(key_buffer), @@ -128,28 +266,155 @@ auto sort_and_reduce_buffer_elements( get_optional_dataframe_buffer_begin(payload_buffer)); } + auto output_key_buffer = allocate_dataframe_buffer(0, handle.get_stream()); if constexpr (std::is_same_v) { - auto it = thrust::unique(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer)); - resize_dataframe_buffer( - key_buffer, - static_cast(thrust::distance(get_dataframe_buffer_begin(key_buffer), it)), - handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + if constexpr (compressed) { + resize_dataframe_buffer( + output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + get_dataframe_buffer_begin(output_key_buffer), + thrust::copy_if(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + get_dataframe_buffer_begin(output_key_buffer), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + } else { + resize_dataframe_buffer( + key_buffer, + thrust::distance( + get_dataframe_buffer_begin(key_buffer), + 
thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { + return false; + } + }))), + handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); } else if constexpr (std::is_same_v>) { - auto it = thrust::unique_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer)); - resize_dataframe_buffer(key_buffer, - static_cast(thrust::distance( - get_dataframe_buffer_begin(key_buffer), thrust::get<0>(it))), - handle.get_stream()); - resize_dataframe_buffer(payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + if constexpr (compressed) { + resize_dataframe_buffer( + output_key_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + auto tmp_payload_buffer = allocate_dataframe_buffer( + size_dataframe_buffer(payload_buffer), handle.get_stream()); + auto input_pair_first = + thrust::make_zip_iterator(input_key_first, get_dataframe_buffer_begin(payload_buffer)); + auto output_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_key_buffer), + get_dataframe_buffer_begin(tmp_payload_buffer)); + resize_dataframe_buffer( + output_key_buffer, + thrust::distance( + output_pair_first, + thrust::copy_if(handle.get_thrust_policy(), + input_pair_first, + input_pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + output_pair_first, + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return false; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return false; + } else { + return true; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + tmp_payload_buffer, size_dataframe_buffer(output_key_buffer), handle.get_stream()); + payload_buffer = std::move(tmp_payload_buffer); + } else { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance( + pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [key_first = get_dataframe_buffer_begin(key_buffer), + invalid_key = to_thrust_optional(invalid_key)] __device__(size_t i) { + auto key = *(key_first + i); + if (invalid_key && (key == *invalid_key)) { + return true; + } else if ((i != 0) && (key == *(key_first + (i - 1)))) { + return true; + } else { 
+ return false; + } + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + output_key_buffer = std::move(key_buffer); + } + shrink_to_fit_dataframe_buffer(output_key_buffer, handle.get_stream()); shrink_to_fit_dataframe_buffer(payload_buffer, handle.get_stream()); } else { + if (invalid_key) { + auto pair_first = thrust::make_zip_iterator(get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer)); + resize_dataframe_buffer( + key_buffer, + thrust::distance(pair_first, + thrust::remove_if(handle.get_thrust_policy(), + pair_first, + pair_first + size_dataframe_buffer(key_buffer), + cuda::proclaim_return_type( + [invalid_key = *invalid_key] __device__(auto kv) { + auto key = thrust::get<0>(kv); + return key == invalid_key; + }))), + handle.get_stream()); + resize_dataframe_buffer( + payload_buffer, size_dataframe_buffer(key_buffer), handle.get_stream()); + } auto num_uniques = thrust::count_if(handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), @@ -161,20 +426,37 @@ auto sort_and_reduce_buffer_elements( auto new_payload_buffer = allocate_dataframe_buffer(num_uniques, handle.get_stream()); - thrust::reduce_by_key(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - get_optional_dataframe_buffer_begin(payload_buffer), - get_dataframe_buffer_begin(new_key_buffer), - get_dataframe_buffer_begin(new_payload_buffer), - thrust::equal_to(), - reduce_op); - - key_buffer = std::move(new_key_buffer); - payload_buffer = std::move(new_payload_buffer); + if constexpr (compressed) { + auto input_key_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(key_buffer), + cuda::proclaim_return_type( + [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + return static_cast(v_first + v_offset); + })); + thrust::reduce_by_key(handle.get_thrust_policy(), + input_key_first, + input_key_first + size_dataframe_buffer(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } else { + thrust::reduce_by_key(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + get_optional_dataframe_buffer_begin(payload_buffer), + get_dataframe_buffer_begin(new_key_buffer), + get_dataframe_buffer_begin(new_payload_buffer), + thrust::equal_to(), + reduce_op); + } + + output_key_buffer = std::move(new_key_buffer); + payload_buffer = std::move(new_payload_buffer); } - return std::make_tuple(std::move(key_buffer), std::move(payload_buffer)); + return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); } #if 1 // FIXME: delete @@ -219,7 +501,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, // 1. 
fill the buffer -#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time0 = std::chrono::steady_clock::now(); #endif @@ -241,28 +523,41 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, edge_value_input, e_op_wrapper, do_expensive_check); -#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time1 = std::chrono::steady_clock::now(); + auto time1 = std::chrono::steady_clock::now(); auto size_before_lreduce = size_dataframe_buffer(key_buffer); #endif // 2. reduce the buffer + std:: + conditional_t, std::tuple, std::byte /* dummy */> + vertex_range{}; + if constexpr (std::is_integral_v) { + vertex_range = std::make_tuple(graph_view.local_edge_partition_dst_range_first(), + graph_view.local_edge_partition_dst_range_last()); + } std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); -#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + std::nullopt); +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time2 = std::chrono::steady_clock::now(); - auto time3 = std::chrono::steady_clock::now(); - auto time4 = std::chrono::steady_clock::now(); - auto size_after_lreduce = size_dataframe_buffer(key_buffer); + auto time2 = std::chrono::steady_clock::now(); + auto time3 = std::chrono::steady_clock::now(); + auto time4 = std::chrono::steady_clock::now(); + auto size_after_lreduce = size_dataframe_buffer(key_buffer); auto size_before_greduce = size_after_lreduce; #endif + bool aligned_path = false; // FIXME: delete + double fill_ratio = 0.0; // FIXME: delete if constexpr (GraphViewType::is_multi_gpu) { // FIXME: this step is unnecessary if major_comm_size== 1 - auto& comm = handle.get_comms(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); @@ -270,15 +565,41 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); + constexpr bool try_compression = (sizeof(vertex_t) == 8) && std::is_same_v; + + std::conditional_t max_vertex_partition_size{ + 0}; + std::conditional_t, std::byte /* dummy */> + h_vertex_firsts{}; + if constexpr (try_compression) { h_vertex_firsts = std::vector(major_comm_size); } std::vector h_vertex_lasts(major_comm_size); for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { auto vertex_partition_id = detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + if constexpr (try_compression) { + max_vertex_partition_size = std::max( + graph_view.vertex_partition_range_size(vertex_partition_id), max_vertex_partition_size); + h_vertex_firsts[i] = graph_view.vertex_partition_range_first(vertex_partition_id); + } h_vertex_lasts[i] = graph_view.vertex_partition_range_last(vertex_partition_id); } + std::conditional_t>, + std::byte /* dummy */> + d_vertex_firsts{}; rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), 
handle.get_stream()); + if constexpr (try_compression) { + if (max_vertex_partition_size <= std::numeric_limits::max()) { + d_vertex_firsts = + rmm::device_uvector(h_vertex_firsts.size(), handle.get_stream()); + raft::update_device((*d_vertex_firsts).data(), + h_vertex_firsts.data(), + h_vertex_firsts.size(), + handle.get_stream()); + } + } raft::update_device( d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), @@ -292,6 +613,32 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, d_vertex_lasts.begin(), d_vertex_lasts.end(), d_tx_buffer_last_boundaries.begin()); + std::conditional_t>, + std::byte /* dummy */> + compressed_v_buffer{}; + if constexpr (try_compression) { + if (d_vertex_firsts) { + compressed_v_buffer = + rmm::device_uvector(size_dataframe_buffer(key_buffer), handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + (*compressed_v_buffer).begin(), + cuda::proclaim_return_type( + [firsts = raft::device_span((*d_vertex_firsts).data(), + (*d_vertex_firsts).size()), + lasts = raft::device_span( + d_vertex_lasts.data(), d_vertex_lasts.size())] __device__(auto v) { + auto major_comm_rank = thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), v)); + return static_cast(v - firsts[major_comm_rank]); + })); + resize_dataframe_buffer(key_buffer, 0, handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + } + } std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); raft::update_host(h_tx_buffer_last_boundaries.data(), d_tx_buffer_last_boundaries.data(), @@ -302,42 +649,226 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); -#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); time3 = std::chrono::steady_clock::now(); #endif - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, std::ignore) = shuffle_values( - major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); - key_buffer = std::move(rx_key_buffer); - + constexpr size_t cache_line_size = 128; + size_t min_element_size{cache_line_size}; + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + min_element_size = std::min(sizeof(uint32_t), min_element_size); + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } if constexpr (!std::is_same_v) { - auto rx_payload_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_payload_buffer, std::ignore) = shuffle_values( - major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); - payload_buffer = std::move(rx_payload_buffer); + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(payload_t), min_element_size); + } else { + 
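
When the largest vertex partition fits in 32 bits, the transform above replaces each 64-bit destination vertex with its 32-bit offset from the partition start of the rank it is being sent to, roughly halving the key traffic; the receiving side adds its own vertex partition start back when decompressing. A tiny round-trip sketch with a made-up partition start (illustrative only):

  #include <cassert>
  #include <cstdint>

  int main()
  {
    int64_t partition_first = 3'000'000'000;  // hypothetical vertex partition range first
    int64_t v               = 3'000'123'456;
    uint32_t v_offset       = static_cast<uint32_t>(v - partition_first);  // what gets shuffled
    int64_t decompressed    = partition_first + static_cast<int64_t>(v_offset);
    assert(decompressed == v);
    return 0;
  }
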
static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = std::min(min_thrust_tuple_element_sizes(), min_element_size); + } } -#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete + auto alignment = cache_line_size / min_element_size; + std::optional, key_t>> + invalid_key{std::nullopt}; + + size_t local_key_buffer_size{}; + if constexpr (try_compression) { + if (compressed_v_buffer) { + local_key_buffer_size = size_dataframe_buffer(*compressed_v_buffer); + } else { + local_key_buffer_size = size_dataframe_buffer(key_buffer); + } + } else { + local_key_buffer_size = size_dataframe_buffer(key_buffer); + } + auto avg_key_buffer_size = + host_scalar_allreduce( + major_comm, local_key_buffer_size, raft::comms::op_t::SUM, handle.get_stream()) / + major_comm_size; + if (avg_key_buffer_size >= alignment * size_t{128} /* 128 tuning parameter */) { + aligned_path = true; // FIXME: delete + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + invalid_key = std::numeric_limits::max(); + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = invalid_vertex_id_v; + } + } else { + invalid_key = key_t{}; + thrust::get<0>(*invalid_key) = invalid_vertex_id_v; + } + + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + alignment, + std::make_optional(std::get<1>(*invalid_key)), + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + std::make_optional(std::get<0>(*invalid_key)), + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + invalid_key, + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(payload_buffer), + tx_counts, + alignment, + std::nullopt, + handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } else { + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, std::ignore) = + shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, 
std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } + } +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); - time4 = std::chrono::steady_clock::now(); + time4 = std::chrono::steady_clock::now(); size_before_greduce = size_dataframe_buffer(key_buffer); #endif - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, std::move(key_buffer), std::move(payload_buffer), reduce_op); + if constexpr (std::is_integral_v) { + vertex_range = std::make_tuple(graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last()); + fill_ratio = static_cast(size_dataframe_buffer(key_buffer)) / + static_cast(std::get<1>(vertex_range) - + std::get<0>(vertex_range)); // FIXME: delete + } + if constexpr (try_compression) { + if (compressed_v_buffer) { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(*compressed_v_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key ? std::make_optional(std::get<1>(*invalid_key)) : std::nullopt); + } else { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key ? 
std::make_optional(std::get<0>(*invalid_key)) : std::nullopt); + } + } else { + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + vertex_range, + invalid_key); + } } -#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT // FIXME: delete +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time5 = std::chrono::steady_clock::now(); - auto size_after_greduce = size_dataframe_buffer(key_buffer); + auto size_after_greduce = size_dataframe_buffer(key_buffer); std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; std::chrono::duration dur3 = time4 - time3; std::chrono::duration dur4 = time5 - time4; - std::cerr << "\tprim (fill,lreduce,g-prep,g-shuffle,g-s&r) took (" << dur0.count() << "," << dur1.count() << "," - << dur2.count() << "," << dur3.count() << "," << dur4.count() << ") l_size=(" << size_before_lreduce << "," << size_after_lreduce << ") g_size=(" << size_before_greduce << "," << size_after_greduce << ")" << std::endl; + std::cerr << "\tprim (fill,lreduce,g-prep,g-shuffle,g-s&r) took (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() + << ") l_size=(" << size_before_lreduce << "," << size_after_lreduce << ") g_size=(" + << size_before_greduce << "," << size_after_greduce << ")" + << " aligned_path=" << aligned_path << " fill_ratio=" << fill_ratio << std::endl; #endif if constexpr (!std::is_same_v) { From 498e54e7235c149bff0970254b57d873876cf181 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 9 Oct 2024 12:52:55 -0700 Subject: [PATCH 098/126] pre-compute dcs range bitmap --- .../cugraph/edge_partition_device_view.cuh | 12 + cpp/include/cugraph/edge_partition_view.hpp | 8 + cpp/include/cugraph/graph.hpp | 13 + cpp/include/cugraph/graph_view.hpp | 69 ++--- .../prims/detail/per_v_transform_reduce_e.cuh | 6 +- cpp/src/structure/graph_impl.cuh | 248 ++++++++++-------- cpp/src/structure/graph_view_impl.cuh | 3 + 7 files changed, 222 insertions(+), 137 deletions(-) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 6c1ede94a5b..ff6d13fd523 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -204,6 +204,7 @@ class edge_partition_device_view_t view) : detail::edge_partition_device_view_base_t(view.offsets(), view.indices()), dcs_nzd_vertices_(detail::to_thrust_optional(view.dcs_nzd_vertices())), + dcs_nzd_range_bitmap_(detail::to_thrust_optional(view.dcs_nzd_range_bitmap())), major_hypersparse_first_(detail::to_thrust_optional(view.major_hypersparse_first())), major_range_first_(view.major_range_first()), major_range_last_(view.major_range_last()), @@ -515,6 +516,7 @@ class edge_partition_device_view_t> for consistency (see dcs_nzd_range_bitmap()) __host__ __device__ thrust::optional dcs_nzd_vertices() const { return dcs_nzd_vertices_ ? thrust::optional{(*dcs_nzd_vertices_).data()} @@ -528,10 +530,20 @@ class edge_partition_device_view_t> dcs_nzd_range_bitmap() + const + { + return dcs_nzd_range_bitmap_ + ? 
thrust::make_optional>( + (*dcs_nzd_range_bitmap_).data(), (*dcs_nzd_range_bitmap_).size()) + : thrust::nullopt; + } + private: // should be trivially copyable to device thrust::optional> dcs_nzd_vertices_{thrust::nullopt}; + thrust::optional> dcs_nzd_range_bitmap_{thrust::nullopt}; thrust::optional major_hypersparse_first_{thrust::nullopt}; vertex_t major_range_first_{0}; diff --git a/cpp/include/cugraph/edge_partition_view.hpp b/cpp/include/cugraph/edge_partition_view.hpp index 42465273718..f0693f4b1a9 100644 --- a/cpp/include/cugraph/edge_partition_view.hpp +++ b/cpp/include/cugraph/edge_partition_view.hpp @@ -56,6 +56,7 @@ class edge_partition_view_t offsets, raft::device_span indices, std::optional> dcs_nzd_vertices, + std::optional> dcs_nzd_range_bitmap, std::optional major_hypersparse_first, vertex_t major_range_first, vertex_t major_range_last, @@ -64,6 +65,7 @@ class edge_partition_view_t(offsets, indices), dcs_nzd_vertices_(dcs_nzd_vertices), + dcs_nzd_range_bitmap_(dcs_nzd_range_bitmap), major_hypersparse_first_(major_hypersparse_first), major_range_first_(major_range_first), major_range_last_(major_range_last), @@ -78,6 +80,11 @@ class edge_partition_view_t> dcs_nzd_range_bitmap() const + { + return dcs_nzd_range_bitmap_; + } + std::optional major_hypersparse_first() const { return major_hypersparse_first_; } vertex_t major_range_first() const { return major_range_first_; } @@ -90,6 +97,7 @@ class edge_partition_view_t> dcs_nzd_vertices_{std::nullopt}; + std::optional> dcs_nzd_range_bitmap_{std::nullopt}; std::optional major_hypersparse_first_{std::nullopt}; vertex_t major_range_first_{0}; diff --git a/cpp/include/cugraph/graph.hpp b/cpp/include/cugraph/graph.hpp index 2be77f57e40..290f4b3c4db 100644 --- a/cpp/include/cugraph/graph.hpp +++ b/cpp/include/cugraph/graph.hpp @@ -103,6 +103,11 @@ class graph_t>>( (*edge_partition_dcs_nzd_vertices_).size()) : std::nullopt; + auto dcs_nzd_range_bitmaps = + edge_partition_dcs_nzd_range_bitmaps_ + ? 
std::make_optional>>( + (*edge_partition_dcs_nzd_range_bitmaps_).size()) + : std::nullopt; for (size_t i = 0; i < offsets.size(); ++i) { offsets[i] = raft::device_span(edge_partition_offsets_[i].data(), edge_partition_offsets_[i].size()); @@ -113,6 +118,11 @@ class graph_t((*edge_partition_dcs_nzd_vertices_)[i].data(), (*edge_partition_dcs_nzd_vertices_)[i].size()); } + if (dcs_nzd_range_bitmaps) { + (*dcs_nzd_range_bitmaps)[i] = + raft::device_span((*edge_partition_dcs_nzd_range_bitmaps_)[i].data(), + (*edge_partition_dcs_nzd_range_bitmaps_)[i].size()); + } } std::conditional_t{ this->number_of_vertices(), this->number_of_edges(), @@ -227,6 +238,8 @@ class graph_t>> edge_partition_dcs_nzd_vertices_{ std::nullopt}; + std::optional>> edge_partition_dcs_nzd_range_bitmaps_{ + std::nullopt}; partition_t partition_{}; // segment offsets within the vertex partition based on vertex degree diff --git a/cpp/include/cugraph/graph_view.hpp b/cpp/include/cugraph/graph_view.hpp index feb64c0aa8e..6d3da3740bf 100644 --- a/cpp/include/cugraph/graph_view.hpp +++ b/cpp/include/cugraph/graph_view.hpp @@ -392,6 +392,8 @@ class graph_view_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta); std::vector vertex_partition_range_offsets() const @@ -645,6 +647,9 @@ class graph_view_t>> edge_partition_dcs_nzd_vertices_{}; + std::optional>> + edge_partition_dcs_nzd_range_bitmaps_{}; partition_t partition_{}; @@ -964,29 +971,29 @@ class graph_view_t> local_edge_partition_hypersparse_degree_offsets( size_t partition_idx = 0) const { - assert(partition_idx == 0); - return hypersparse_degree_offsets_; + assert(partition_idx == 0); + return hypersparse_degree_offsets_; } vertex_partition_view_t local_vertex_partition_view() const { - return vertex_partition_view_t(this->number_of_vertices()); + return vertex_partition_view_t(this->number_of_vertices()); } edge_partition_view_t local_edge_partition_view( size_t partition_idx = 0) const { - assert(partition_idx == 0); // there is only one edge partition in single-GPU - return edge_partition_view_t( - offsets_, indices_, this->number_of_vertices()); + assert(partition_idx == 0); // there is only one edge partition in single-GPU + return edge_partition_view_t( + offsets_, indices_, this->number_of_vertices()); } // FIXME: deprecated, replaced with compute_number_of_edges (which works with or without edge // masking) edge_t number_of_edges() const { - CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); - return this->number_of_edges_; + CUGRAPH_EXPECTS(!(this->has_edge_mask()), "unimplemented."); + return this->number_of_edges_; } edge_t compute_number_of_edges(raft::handle_t const& handle) const; @@ -1014,96 +1021,92 @@ class graph_view_t>> local_sorted_unique_edge_srcs() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_srcs(size_t partition_idx = 0) const { - assert(partition_idx == 0); - return std::nullopt; + assert(partition_idx == 0); + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_src_chunk_start_offsets() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_src_chunk_start_offsets(size_t partition_idx = 0) const { - assert(partition_idx == 0); - return std::nullopt; + assert(partition_idx == 0); + return std::nullopt; } - std::optional local_sorted_unique_edge_src_chunk_size() const { - 
return std::nullopt; } + std::optional local_sorted_unique_edge_src_chunk_size() const { return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_src_vertex_partition_offsets() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dsts() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dsts(size_t partition_idx = 0) const { - assert(partition_idx == 0); - return std::nullopt; + assert(partition_idx == 0); + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dst_chunk_start_offsets() const { - return std::nullopt; + return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dst_chunk_start_offsets(size_t partition_idx = 0) const { - assert(partition_idx == 0); - return std::nullopt; + assert(partition_idx == 0); + return std::nullopt; } - std::optional local_sorted_unique_edge_dst_chunk_size() const { - return std::nullopt; } + std::optional local_sorted_unique_edge_dst_chunk_size() const { return std::nullopt; } template std::enable_if_t>> local_sorted_unique_edge_dst_vertex_partition_offsets() const { - return std::nullopt; + return std::nullopt; } void attach_edge_mask(edge_property_view_t edge_mask_view) { - edge_mask_view_ = edge_mask_view; + edge_mask_view_ = edge_mask_view; } - void clear_edge_mask() { - edge_mask_view_ = std::nullopt; } + void clear_edge_mask() { edge_mask_view_ = std::nullopt; } - bool has_edge_mask() const { - return edge_mask_view_.has_value(); } + bool has_edge_mask() const { return edge_mask_view_.has_value(); } std::optional> edge_mask_view() const { - return edge_mask_view_; + return edge_mask_view_; } private: @@ -1115,6 +1118,6 @@ class graph_view_t> hypersparse_degree_offsets_{std::nullopt}; std::optional> edge_mask_view_{std::nullopt}; - }; +}; } // namespace cugraph diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index ca03765a191..ffa0917fbe6 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1971,7 +1971,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, stream_pool_indices = init_stream_pool_indices( static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * - 0.1), + 0.2), tmp_buffer_size_per_loop, graph_view.number_of_local_edge_partitions(), max_segments, @@ -2239,6 +2239,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const& segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); +#if 1 + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); +#else // FIXME: we can pre-compute this & store in graph_t rmm::device_uvector segment_bitmap( packed_bool_size((*segment_offsets)[4] - (*segment_offsets)[3]), loop_stream); @@ -2258,6 +2261,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, bitmap[packed_bool_offset(major_offset)]); word.fetch_or(packed_bool_mask(major_offset), cuda::std::memory_order_relaxed); }); +#endif auto range_offset_first = std::min( (edge_partition.major_range_first() + (*segment_offsets)[3] > diff --git a/cpp/src/structure/graph_impl.cuh b/cpp/src/structure/graph_impl.cuh index 4c9166cb01a..c0e9b7f0a54 100644 --- a/cpp/src/structure/graph_impl.cuh +++ b/cpp/src/structure/graph_impl.cuh @@ -146,8 +146,7 @@ update_local_sorted_unique_edge_majors_minors( auto num_segments_per_vertex_partition = 
static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); - auto use_dcs = - num_segments_per_vertex_partition > (detail::num_sparse_segments_per_vertex_partition + 2); + auto use_dcs = edge_partition_dcs_nzd_vertices.has_value(); std::optional>> local_sorted_unique_edge_majors{ std::nullopt}; @@ -165,17 +164,16 @@ update_local_sorted_unique_edge_majors_minors( // majors/minors to support storing edge major/minor properties in (key, value) pairs. // 1. Update local_sorted_unique_edge_minors & local_sorted_unique_edge_minor_offsets - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "update_local_sorted_unique_edge_majors_minors 1" << std::endl; - { + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { auto [minor_range_first, minor_range_last] = meta.partition.local_edge_partition_minor_range(); auto minor_range_size = meta.partition.local_edge_partition_minor_range_size(); - rmm::device_uvector minor_bitmaps( - (minor_range_size + (sizeof(uint32_t) * 8 - 1)) / (sizeof(uint32_t) * 8), - handle.get_stream()); - thrust::fill( - handle.get_thrust_policy(), minor_bitmaps.begin(), minor_bitmaps.end(), uint32_t{0}); + rmm::device_uvector minor_bitmaps(packed_bool_size(minor_range_size), + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + minor_bitmaps.begin(), + minor_bitmaps.end(), + packed_bool_empty_mask()); for (size_t i = 0; i < edge_partition_indices.size(); ++i) { thrust::for_each(handle.get_thrust_policy(), edge_partition_indices[i].begin(), @@ -199,7 +197,6 @@ update_local_sorted_unique_edge_majors_minors( << std::endl; if (max_minor_properties_fill_ratio < detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { - std::cerr << "K,V pairs" << std::endl; auto const chunk_size = static_cast(std::min(1.0 / max_minor_properties_fill_ratio, 1024.0)); @@ -287,102 +284,102 @@ update_local_sorted_unique_edge_majors_minors( } // 2. 
Update local_sorted_unique_edge_majors & local_sorted_unique_edge_major_offsets - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "update_local_sorted_unique_edge_majors_minors 2" << std::endl; - - std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - num_local_unique_edge_major_counts[i] = thrust::count_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), - has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); - } - auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), - num_local_unique_edge_major_counts.end()); - vertex_t aggregate_major_range_size{0}; - for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { - aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); - } + if (detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold > 0.0) { + std::vector num_local_unique_edge_major_counts(edge_partition_offsets.size()); + for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { + num_local_unique_edge_major_counts[i] = thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(static_cast(edge_partition_offsets[i].size() - 1)), + has_nzd_t{edge_partition_offsets[i].data(), vertex_t{0}}); + } + auto num_local_unique_edge_majors = std::reduce(num_local_unique_edge_major_counts.begin(), + num_local_unique_edge_major_counts.end()); - auto max_major_properties_fill_ratio = - host_scalar_allreduce(comm, - static_cast(num_local_unique_edge_majors) / - static_cast(aggregate_major_range_size), - raft::comms::op_t::MAX, - handle.get_stream()); + vertex_t aggregate_major_range_size{0}; + for (size_t i = 0; i < meta.partition.number_of_local_edge_partitions(); ++i) { + aggregate_major_range_size += meta.partition.local_edge_partition_major_range_size(i); + } - std::cout << "max_major_properties_fill_ratio=" << max_major_properties_fill_ratio - << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" - << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold - << std::endl; - if (max_major_properties_fill_ratio < - detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { - auto const chunk_size = - static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); - - local_sorted_unique_edge_majors = std::vector>{}; - local_sorted_unique_edge_major_chunk_start_offsets = - std::vector>{}; - - (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); - (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); - for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { - auto [major_range_first, major_range_last] = - meta.partition.local_edge_partition_major_range(i); - auto sparse_range_last = - use_dcs - ? 
(major_range_first + - meta.edge_partition_segment_offsets[num_segments_per_vertex_partition * i + - detail::num_sparse_segments_per_vertex_partition]) - : major_range_last; + auto max_major_properties_fill_ratio = + host_scalar_allreduce(comm, + static_cast(num_local_unique_edge_majors) / + static_cast(aggregate_major_range_size), + raft::comms::op_t::MAX, + handle.get_stream()); - rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], - handle.get_stream()); - CUGRAPH_EXPECTS( - sparse_range_last - major_range_first < std::numeric_limits::max(), - "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), work-around required."); - auto cur_size = thrust::distance( - unique_edge_majors.begin(), - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(major_range_first), - thrust::make_counting_iterator(sparse_range_last), + std::cout << "max_major_properties_fill_ratio=" << max_major_properties_fill_ratio + << " detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold=" + << detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold + << std::endl; + if (max_major_properties_fill_ratio < + detail::edge_partition_src_dst_property_values_kv_pair_fill_ratio_threshold) { + auto const chunk_size = + static_cast(std::min(1.0 / max_major_properties_fill_ratio, 1024.0)); + + local_sorted_unique_edge_majors = std::vector>{}; + local_sorted_unique_edge_major_chunk_start_offsets = + std::vector>{}; + + (*local_sorted_unique_edge_majors).reserve(edge_partition_offsets.size()); + (*local_sorted_unique_edge_major_chunk_start_offsets).reserve(edge_partition_offsets.size()); + for (size_t i = 0; i < edge_partition_offsets.size(); ++i) { + auto [major_range_first, major_range_last] = + meta.partition.local_edge_partition_major_range(i); + auto sparse_range_last = + use_dcs + ? 
(major_range_first + + meta + .edge_partition_segment_offsets[num_segments_per_vertex_partition * i + + detail::num_sparse_segments_per_vertex_partition]) + : major_range_last; + + rmm::device_uvector unique_edge_majors(num_local_unique_edge_major_counts[i], + handle.get_stream()); + CUGRAPH_EXPECTS(sparse_range_last - major_range_first < std::numeric_limits::max(), + "copy_if will fail (https://github.com/NVIDIA/thrust/issues/1302), " + "work-around required."); + auto cur_size = thrust::distance( unique_edge_majors.begin(), - has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); - if (use_dcs) { - thrust::copy(handle.get_thrust_policy(), - (*edge_partition_dcs_nzd_vertices)[i].begin(), - (*edge_partition_dcs_nzd_vertices)[i].end(), - unique_edge_majors.begin() + cur_size); + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(major_range_first), + thrust::make_counting_iterator(sparse_range_last), + unique_edge_majors.begin(), + has_nzd_t{edge_partition_offsets[i].data(), major_range_first})); + if (use_dcs) { + thrust::copy(handle.get_thrust_policy(), + (*edge_partition_dcs_nzd_vertices)[i].begin(), + (*edge_partition_dcs_nzd_vertices)[i].end(), + unique_edge_majors.begin() + cur_size); + } + + auto num_chunks = static_cast( + ((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); + rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, + handle.get_stream()); + + auto chunk_start_vertex_first = + thrust::make_transform_iterator(thrust::make_counting_iterator(vertex_t{0}), + detail::multiply_and_add_t{ + static_cast(chunk_size), major_range_first}); + thrust::lower_bound(handle.get_thrust_policy(), + unique_edge_majors.begin(), + unique_edge_majors.end(), + chunk_start_vertex_first, + chunk_start_vertex_first + num_chunks, + unique_edge_major_chunk_start_offsets.begin()); + unique_edge_major_chunk_start_offsets.set_element( + num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); + + (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); + (*local_sorted_unique_edge_major_chunk_start_offsets) + .push_back(std::move(unique_edge_major_chunk_start_offsets)); } - - auto num_chunks = static_cast( - ((major_range_last - major_range_first) + (chunk_size - size_t{1})) / chunk_size); - rmm::device_uvector unique_edge_major_chunk_start_offsets(num_chunks + size_t{1}, - handle.get_stream()); - - auto chunk_start_vertex_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - detail::multiply_and_add_t{static_cast(chunk_size), major_range_first}); - thrust::lower_bound(handle.get_thrust_policy(), - unique_edge_majors.begin(), - unique_edge_majors.end(), - chunk_start_vertex_first, - chunk_start_vertex_first + num_chunks, - unique_edge_major_chunk_start_offsets.begin()); - unique_edge_major_chunk_start_offsets.set_element( - num_chunks, static_cast(unique_edge_majors.size()), handle.get_stream()); - - (*local_sorted_unique_edge_majors).push_back(std::move(unique_edge_majors)); - (*local_sorted_unique_edge_major_chunk_start_offsets) - .push_back(std::move(unique_edge_major_chunk_start_offsets)); + local_sorted_unique_edge_major_chunk_size = chunk_size; } - local_sorted_unique_edge_major_chunk_size = chunk_size; } - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "update_local_sorted_unique_edge_majors_minors 3" << std::endl; return std::make_tuple(std::move(local_sorted_unique_edge_majors), 
std::move(local_sorted_unique_edge_major_chunk_start_offsets), @@ -393,6 +390,50 @@ update_local_sorted_unique_edge_majors_minors( std::move(local_sorted_unique_edge_minor_vertex_partition_offsets)); } +template +std::enable_if_t>> +compute_edge_partition_dcs_nzd_range_bitmaps( + raft::handle_t const& handle, + graph_meta_t const& meta, + std::vector> const& edge_partition_dcs_nzd_vertices) +{ + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + auto num_segments_per_vertex_partition = + static_cast(meta.edge_partition_segment_offsets.size() / minor_comm_size); + + std::vector> edge_partition_dcs_nzd_range_bitmaps{}; + edge_partition_dcs_nzd_range_bitmaps.reserve(edge_partition_dcs_nzd_vertices.size()); + for (size_t i = 0; i < edge_partition_dcs_nzd_vertices.size(); ++i) { + raft::host_span segment_offsets( + meta.edge_partition_segment_offsets.data() + num_segments_per_vertex_partition * i, + num_segments_per_vertex_partition); + rmm::device_uvector bitmap( + packed_bool_size(segment_offsets[detail::num_sparse_segments_per_vertex_partition + 1] - + segment_offsets[detail::num_sparse_segments_per_vertex_partition]), + handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + auto major_range_first = meta.partition.local_edge_partition_major_range_first(i); + auto major_hypersparse_first = + major_range_first + segment_offsets[detail::num_sparse_segments_per_vertex_partition]; + thrust::for_each(handle.get_thrust_policy(), + edge_partition_dcs_nzd_vertices[i].begin(), + edge_partition_dcs_nzd_vertices[i].end(), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + major_hypersparse_first] __device__(auto major) { + auto offset = major - major_hypersparse_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + }); + edge_partition_dcs_nzd_range_bitmaps.push_back(std::move(bitmap)); + } + + return edge_partition_dcs_nzd_range_bitmaps; +} + } // namespace template @@ -441,8 +482,6 @@ graph_t> const& edge_partition_indices, std::optional>> const& edge_partition_dcs_nzd_vertices, + std::optional>> const& + edge_partition_dcs_nzd_range_bitmaps, graph_view_meta_t meta) : detail::graph_base_t( meta.number_of_vertices, meta.number_of_edges, meta.properties), edge_partition_offsets_(edge_partition_offsets), edge_partition_indices_(edge_partition_indices), edge_partition_dcs_nzd_vertices_(edge_partition_dcs_nzd_vertices), + edge_partition_dcs_nzd_range_bitmaps_(edge_partition_dcs_nzd_range_bitmaps), partition_(meta.partition), edge_partition_segment_offsets_(meta.edge_partition_segment_offsets), edge_partition_hypersparse_degree_offsets_(meta.edge_partition_hypersparse_degree_offsets), From b911cdd1b3fe72aa4be3817b96293319719715a8 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 9 Oct 2024 23:15:47 -0700 Subject: [PATCH 099/126] bug fix --- .../cugraph/utilities/shuffle_comm.cuh | 2 ++ .../detail/extract_transform_v_frontier_e.cuh | 20 +++++++++---------- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 1 - 3 files changed, 11 insertions(+), 12 deletions(-) diff --git a/cpp/include/cugraph/utilities/shuffle_comm.cuh b/cpp/include/cugraph/utilities/shuffle_comm.cuh index 39a8ed1a7b7..98fa2cb1706 100644 --- a/cpp/include/cugraph/utilities/shuffle_comm.cuh +++ b/cpp/include/cugraph/utilities/shuffle_comm.cuh @@ -51,6 +51,8 @@ namespace 
cugraph { namespace detail { +constexpr size_t cache_line_size = 128; + template struct compute_group_id_count_pair_t { GroupIdIterator group_id_first{}; diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 8bd1bb299a7..29da5f0d126 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -1195,21 +1195,19 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete +#if EXTRACT_PERFORMANCE_MEASUREMENT auto subtime3 = std::chrono::steady_clock::now(); #endif std::vector tmp_buffer_sizes(loop_count); for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; - // FIXME: tmp_buffer_idx.value() implicitly synchronizes to copy the results to host - tmp_buffer_sizes[j] = tmp_buffer_idx.value(loop_stream); + tmp_buffer_sizes[j] = tmp_buffer_idx.value(loop_stream); } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete +#if EXTRACT_PERFORMANCE_MEASUREMENT if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } auto subtime4 = std::chrono::steady_clock::now(); #endif @@ -1245,8 +1243,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::chrono::duration subdur3 = subtime4 - subtime3; std::chrono::duration subdur4 = subtime5 - subtime4; std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," - << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << ")" - << std::endl; + << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() + << ") loop_count=" << loop_count << std::endl; #endif } #if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -1267,7 +1265,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::vector buffer_sizes(key_buffers.size()); static_assert(!std::is_same_v || !std::is_same_v); for (size_t i = 0; i < key_buffers.size(); ++i) { - if constexpr (!std::is_same_v) { + if constexpr (!std::is_same_v) { buffer_sizes[i] = size_optional_dataframe_buffer(key_buffers[i]); } else { buffer_sizes[i] = size_optional_dataframe_buffer(value_buffers[i]); diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 67ceb2a482f..88b3bd110f8 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -653,7 +653,6 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, RAFT_CUDA_TRY(cudaDeviceSynchronize()); time3 = std::chrono::steady_clock::now(); #endif - constexpr size_t cache_line_size = 128; size_t min_element_size{cache_line_size}; if constexpr (std::is_same_v) { if constexpr (try_compression) { From 468794996f6655f42529f90d05f46bdd9645c893 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 10 Oct 2024 23:49:06 -0700 Subject: [PATCH 100/126] update unrenumber functions --- cpp/include/cugraph/graph_functions.hpp | 68 +++-- .../approx_weighted_matching_impl.cuh | 5 +- 
cpp/src/community/detail/common_methods.cuh | 5 +- cpp/src/community/detail/refine_impl.cuh | 15 +- cpp/src/lookup/lookup_src_dst_impl.cuh | 5 +- ..._v_pair_transform_dst_nbr_intersection.cuh | 95 +++---- ...m_reduce_dst_key_aggregated_outgoing_e.cuh | 5 +- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 4 +- cpp/src/structure/renumber_utils_impl.cuh | 127 ++++++++- .../structure/renumber_utils_mg_v32_e32.cu | 14 + .../structure/renumber_utils_mg_v64_e64.cu | 14 + .../structure/renumber_utils_sg_v32_e32.cu | 14 + .../structure/renumber_utils_sg_v64_e64.cu | 14 + cpp/src/traversal/extract_bfs_paths_impl.cuh | 14 +- cpp/src/utilities/collect_comm.cuh | 245 +++++++++--------- 15 files changed, 421 insertions(+), 223 deletions(-) diff --git a/cpp/include/cugraph/graph_functions.hpp b/cpp/include/cugraph/graph_functions.hpp index 4ec29dec363..b7db152d476 100644 --- a/cpp/include/cugraph/graph_functions.hpp +++ b/cpp/include/cugraph/graph_functions.hpp @@ -246,7 +246,7 @@ void unrenumber_int_vertices(raft::handle_t const& handle, * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -286,7 +286,7 @@ std::enable_if_t unrenumber_local_int_edges( * * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -310,6 +310,38 @@ std::enable_if_t unrenumber_local_int_edges(raft::handle_t con vertex_t num_vertices, bool do_expensive_check = false); +/** + * @brief Unrenumber local internal edge destinations to external vertices based on the provided @p + * renumber_map_labels. + * + * Note cugraph::invalid_id::value remains unchanged. This function requires the input + * edge destination vertices to unrenumber to be sorted and unique. + * + * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. + * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if + * true) as major indices in storing edges using a 2D sparse matrix. + * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) + * or multi-GPU (true). + * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and + * handles to various CUDA libraries) to run graph algorithms. + * @param sorted_unique_edge_dsts Span object storing the pointer to the edge destination internal + * vertices to be unrenumbered and the size of the pointed array. The input edge destination + * internal vertices should be sorted and unique. The input edge destination internal vertices are + * renumbered to external vertices in-place.
+ * @param renumber_map Span object storing pointer to the external vertices corresponding to the + * internal vertices (assigned to this process in multi-GPU) and the size of the array. + * @param vertex_partition_range_lasts Last local internal vertices (exclusive, assigned to each + * process in multi-GPU). + * @param do_expensive_check A flag to run expensive checks for input arguments (if set to `true`). + */ +template +void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_vertices /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check = false); + /** * @brief Renumber local external vertices to internal vertices based on the provided @p * renumber_map_labels. @@ -348,7 +380,7 @@ void renumber_local_ext_vertices(raft::handle_t const& handle, * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam edge_type_t Type of edge types. Needs to be an integral type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -390,7 +422,7 @@ decompress_to_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -423,7 +455,7 @@ symmetrize_edgelist(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -465,7 +497,7 @@ symmetrize_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. 
CUDA stream, communicator, and @@ -507,7 +539,7 @@ transpose_graph( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -551,7 +583,7 @@ transpose_graph_storage( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -627,7 +659,7 @@ void relabel(raft::handle_t const& handle, * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -680,7 +712,7 @@ extract_induced_subgraphs( * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -745,7 +777,7 @@ create_graph_from_edgelist(raft::handle_t const& handle, * @tparam edge_type_t Type of edge type. Needs to be an integral type, currently only int32_t is * supported * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -809,7 +841,7 @@ create_graph_from_edgelist( * @tparam vertex_t Type of vertex identifiers. Needs to be an integral type. * @tparam edge_t Type of edge identifiers. Needs to be an integral type. 
* @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -831,7 +863,7 @@ std::tuple, rmm::device_uvector> get_two * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -858,7 +890,7 @@ rmm::device_uvector compute_in_weight_sums( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -885,7 +917,7 @@ rmm::device_uvector compute_out_weight_sums( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -912,7 +944,7 @@ weight_t compute_max_in_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -939,7 +971,7 @@ weight_t compute_max_out_weight_sum( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. 
* @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and @@ -965,7 +997,7 @@ weight_t compute_total_edge_weight( * @tparam edge_t Type of edge identifiers. Needs to be an integral type. * @tparam weight_t Type of edge weights. Needs to be a floating point type. * @tparam store_transposed Flag indicating whether to use sources (if false) or destinations (if - * true) as major indices in storing edges using a 2D sparse matrix. transposed. + * true) as major indices in storing edges using a 2D sparse matrix. * @tparam multi_gpu Flag indicating whether template instantiation should target single-GPU (false) * or multi-GPU (true). * @param handle RAFT handle object to encapsulate resources (e.g. CUDA stream, communicator, and diff --git a/cpp/src/community/approx_weighted_matching_impl.cuh b/cpp/src/community/approx_weighted_matching_impl.cuh index a0ccfa52ffc..869ed4e7ae6 100644 --- a/cpp/src/community/approx_weighted_matching_impl.cuh +++ b/cpp/src/community/approx_weighted_matching_impl.cuh @@ -243,11 +243,12 @@ std::tuple, weight_t> approximate_weighted_matchin major_comm_size, minor_comm_size}; - candidates_of_candidates = cugraph::collect_values_for_keys(handle, + candidates_of_candidates = cugraph::collect_values_for_keys(comm, target_candidate_map.view(), candidates.begin(), candidates.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { candidates_of_candidates.resize(candidates.size(), handle.get_stream()); diff --git a/cpp/src/community/detail/common_methods.cuh b/cpp/src/community/detail/common_methods.cuh index e17abdb3703..18fb3fdb251 100644 --- a/cpp/src/community/detail/common_methods.cuh +++ b/cpp/src/community/detail/common_methods.cuh @@ -289,11 +289,12 @@ rmm::device_uvector update_clustering_by_delta_modularity( invalid_vertex_id::value, std::numeric_limits::max(), handle.get_stream()); - vertex_cluster_weights_v = cugraph::collect_values_for_keys(handle, + vertex_cluster_weights_v = cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), next_clusters_v.begin(), next_clusters_v.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); src_cluster_weights = edge_src_property_t, weight_t>(handle, diff --git a/cpp/src/community/detail/refine_impl.cuh b/cpp/src/community/detail/refine_impl.cuh index 272e3d71f83..d11e38dbf9d 100644 --- a/cpp/src/community/detail/refine_impl.cuh +++ b/cpp/src/community/detail/refine_impl.cuh @@ -182,11 +182,12 @@ refine_clustering( comm_size, major_comm_size, minor_comm_size}; vertex_louvain_cluster_weights = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, cluster_key_weight_map.view(), louvain_assignment_of_vertices.begin(), louvain_assignment_of_vertices.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { vertex_louvain_cluster_weights.resize(louvain_assignment_of_vertices.size(), @@ -468,11 +469,12 @@ refine_clustering( // comm_size, major_comm_size, minor_comm_size}; louvain_of_leiden_keys_used_in_edge_reduction = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, 
leiden_to_louvain_map.view(), leiden_keys_used_in_edge_reduction.begin(), leiden_keys_used_in_edge_reduction.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { louvain_of_leiden_keys_used_in_edge_reduction.resize( leiden_keys_used_in_edge_reduction.size(), handle.get_stream()); @@ -859,11 +861,12 @@ refine_clustering( // comm_size, major_comm_size, minor_comm_size}; lovain_of_leiden_cluster_keys = - cugraph::collect_values_for_keys(handle, + cugraph::collect_values_for_keys(comm, leiden_to_louvain_map.view(), leiden_keys_to_read_louvain.begin(), leiden_keys_to_read_louvain.end(), - vertex_to_gpu_id_op); + vertex_to_gpu_id_op, + handle.get_stream()); } else { lovain_of_leiden_cluster_keys.resize(leiden_keys_to_read_louvain.size(), handle.get_stream()); diff --git a/cpp/src/lookup/lookup_src_dst_impl.cuh b/cpp/src/lookup/lookup_src_dst_impl.cuh index 1c8c39fd6dd..45bbf870d80 100644 --- a/cpp/src/lookup/lookup_src_dst_impl.cuh +++ b/cpp/src/lookup/lookup_src_dst_impl.cuh @@ -115,12 +115,13 @@ struct lookup_container_t::lookup_con auto const minor_comm_size = minor_comm.get_size(); value_buffer = cugraph::collect_values_for_keys( - handle, + comm, kv_store_object->view(), edge_ids_to_lookup.begin(), edge_ids_to_lookup.end(), cugraph::detail::compute_gpu_id_from_ext_edge_id_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); } else { cugraph::resize_dataframe_buffer( value_buffer, edge_ids_to_lookup.size(), handle.get_stream()); diff --git a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh index ce5e5d3e8cf..f03e8f54fb2 100644 --- a/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh +++ b/cpp/src/prims/per_v_pair_transform_dst_nbr_intersection.cuh @@ -250,11 +250,14 @@ void per_v_pair_transform_dst_nbr_intersection( } auto num_input_pairs = static_cast(thrust::distance(vertex_pair_first, vertex_pair_last)); - std::optional> unique_vertices{std::nullopt}; + std::optional> sorted_unique_vertices{std::nullopt}; std::optional(size_t{0}, rmm::cuda_stream_view{}))> - property_buffer_for_unique_vertices{std::nullopt}; + property_buffer_for_sorted_unique_vertices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { - unique_vertices = rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); + auto& comm = handle.get_comms(); + + sorted_unique_vertices = + rmm::device_uvector(num_input_pairs * 2, handle.get_stream()); auto elem0_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -262,7 +265,7 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem0_first, elem0_first + num_input_pairs, - (*unique_vertices).begin()); + (*sorted_unique_vertices).begin()); auto elem1_first = thrust::make_transform_iterator( vertex_pair_first, cugraph::thrust_tuple_get::value_type, @@ -270,25 +273,25 @@ void per_v_pair_transform_dst_nbr_intersection( thrust::copy(handle.get_thrust_policy(), elem1_first, elem1_first + num_input_pairs, - (*unique_vertices).begin() + num_input_pairs); - thrust::sort(handle.get_thrust_policy(), (*unique_vertices).begin(), (*unique_vertices).end()); - (*unique_vertices) - .resize(thrust::distance((*unique_vertices).begin(), + (*sorted_unique_vertices).begin() + num_input_pairs); + thrust::sort(handle.get_thrust_policy(), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end()); + 
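+    // drop duplicates from the sorted endpoint list, then fetch each unique vertex's property
+    // value from the GPU that owns it; the returned values line up with the sorted unique vertices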
(*sorted_unique_vertices) + .resize(thrust::distance((*sorted_unique_vertices).begin(), thrust::unique(handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end())), + (*sorted_unique_vertices).begin(), + (*sorted_unique_vertices).end())), handle.get_stream()); - std::tie(unique_vertices, property_buffer_for_unique_vertices) = - collect_values_for_unique_int_vertices(handle, - std::move(*unique_vertices), - vertex_value_input_first, - graph_view.vertex_partition_range_lasts()); - thrust::sort_by_key( - handle.get_thrust_policy(), - (*unique_vertices).begin(), - (*unique_vertices).end(), - (*property_buffer_for_unique_vertices).begin()); // necessary for binary search + property_buffer_for_sorted_unique_vertices = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span((*sorted_unique_vertices).data(), + (*sorted_unique_vertices).size()), + vertex_value_input_first, + graph_view.vertex_partition_range_lasts(), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } rmm::device_uvector vertex_pair_indices(num_input_pairs, handle.get_stream()); @@ -412,32 +415,32 @@ void per_v_pair_transform_dst_nbr_intersection( do_expensive_check); } - if (unique_vertices) { - auto vertex_value_input_for_unique_vertices_first = - get_dataframe_buffer_begin(*property_buffer_for_unique_vertices); - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(this_chunk_size), - detail::call_intersection_op_t< - GraphViewType, - decltype(vertex_value_input_for_unique_vertices_first), - typename decltype(r_nbr_intersection_property_values0)::const_pointer, - IntersectionOp, - decltype(chunk_vertex_pair_index_first), - VertexPairIterator, - VertexPairValueOutputIterator>{edge_partition, - thrust::make_optional>( - (*unique_vertices).data(), (*unique_vertices).size()), - vertex_value_input_for_unique_vertices_first, - intersection_op, - intersection_offsets.data(), - intersection_indices.data(), - r_nbr_intersection_property_values0.data(), - r_nbr_intersection_property_values1.data(), - chunk_vertex_pair_index_first, - vertex_pair_first, - vertex_pair_value_output_first}); + if (sorted_unique_vertices) { + auto vertex_value_input_for_sorted_unique_vertices_first = + get_dataframe_buffer_begin(*property_buffer_for_sorted_unique_vertices); + thrust::for_each(handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(this_chunk_size), + detail::call_intersection_op_t< + GraphViewType, + decltype(vertex_value_input_for_sorted_unique_vertices_first), + typename decltype(r_nbr_intersection_property_values0)::const_pointer, + IntersectionOp, + decltype(chunk_vertex_pair_index_first), + VertexPairIterator, + VertexPairValueOutputIterator>{ + edge_partition, + thrust::make_optional>( + (*sorted_unique_vertices).data(), (*sorted_unique_vertices).size()), + vertex_value_input_for_sorted_unique_vertices_first, + intersection_op, + intersection_offsets.data(), + intersection_indices.data(), + r_nbr_intersection_property_values0.data(), + r_nbr_intersection_property_values1.data(), + chunk_vertex_pair_index_first, + vertex_pair_first, + vertex_pair_value_output_first}); } else { thrust::for_each(handle.get_thrust_policy(), thrust::make_counting_iterator(size_t{0}), diff --git a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh index 5a5e9332094..c13816242bc 
100644 --- a/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh +++ b/cpp/src/prims/per_v_transform_reduce_dst_key_aggregated_outgoing_e.cuh @@ -924,11 +924,12 @@ void per_v_transform_reduce_dst_key_aggregated_outgoing_e( auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); std::tie(unique_minor_keys, values_for_unique_keys) = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, kv_store_view, std::move(unique_minor_keys), cugraph::detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); if constexpr (KVStoreViewType::binary_search) { multi_gpu_minor_key_value_map_ptr = diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 88b3bd110f8..80e16c1ffaf 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -813,7 +813,6 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); time4 = std::chrono::steady_clock::now(); - size_before_greduce = size_dataframe_buffer(key_buffer); #endif if constexpr (std::is_integral_v) { @@ -825,6 +824,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, } if constexpr (try_compression) { if (compressed_v_buffer) { + size_before_greduce = size_dataframe_buffer(*compressed_v_buffer); // FIXME: delete std::tie(key_buffer, payload_buffer) = detail::sort_and_reduce_buffer_elements( handle, @@ -834,6 +834,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, vertex_range, invalid_key ? std::make_optional(std::get<1>(*invalid_key)) : std::nullopt); } else { + size_before_greduce = size_dataframe_buffer(key_buffer); // FIXME: delete std::tie(key_buffer, payload_buffer) = detail::sort_and_reduce_buffer_elements( handle, @@ -844,6 +845,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, invalid_key ? 
std::make_optional(std::get<0>(*invalid_key)) : std::nullopt); } } else { + size_before_greduce = size_dataframe_buffer(key_buffer); // FIXME: delete std::tie(key_buffer, payload_buffer) = detail::sort_and_reduce_buffer_elements( handle, diff --git a/cpp/src/structure/renumber_utils_impl.cuh b/cpp/src/structure/renumber_utils_impl.cuh index 3efa58d9632..8f69a3c152d 100644 --- a/cpp/src/structure/renumber_utils_impl.cuh +++ b/cpp/src/structure/renumber_utils_impl.cuh @@ -21,6 +21,7 @@ #include #include +#include #include #include #include @@ -363,7 +364,7 @@ void renumber_ext_vertices(raft::handle_t const& handle, } std::unique_ptr> renumber_map_ptr{nullptr}; - if (multi_gpu) { + if constexpr (multi_gpu) { auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); @@ -402,11 +403,12 @@ void renumber_ext_vertices(raft::handle_t const& handle, rmm::device_uvector int_vertices_for_sorted_unique_ext_vertices(0, handle.get_stream()); auto [unique_ext_vertices, int_vertices_for_unique_ext_vertices] = - collect_values_for_unique_keys(handle, + collect_values_for_unique_keys(comm, local_renumber_map.view(), std::move(sorted_unique_ext_vertices), detail::compute_gpu_id_from_ext_vertex_t{ - comm_size, major_comm_size, minor_comm_size}); + comm_size, major_comm_size, minor_comm_size}, + handle.get_stream()); renumber_map_ptr = std::make_unique>( unique_ext_vertices.begin(), @@ -573,7 +575,6 @@ void unrenumber_int_vertices(raft::handle_t const& handle, auto local_int_vertex_first = vertex_partition_id == 0 ? vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - auto local_int_vertex_last = vertex_partition_range_lasts[vertex_partition_id]; rmm::device_uvector sorted_unique_int_vertices(num_vertices, handle.get_stream()); sorted_unique_int_vertices.resize( @@ -595,16 +596,20 @@ void unrenumber_int_vertices(raft::handle_t const& handle, sorted_unique_int_vertices.end())), handle.get_stream()); - auto [unique_int_vertices, ext_vertices_for_unique_int_vertices] = - collect_values_for_unique_int_vertices(handle, - std::move(sorted_unique_int_vertices), - renumber_map_labels, - vertex_partition_range_lasts); + auto ext_vertices_for_sorted_unique_int_vertices = + collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + renumber_map_labels, + vertex_partition_range_lasts, + local_int_vertex_first, + handle.get_stream()); kv_store_t renumber_map( - unique_int_vertices.begin(), - unique_int_vertices.begin() + unique_int_vertices.size(), - ext_vertices_for_unique_int_vertices.begin(), + sorted_unique_int_vertices.begin(), + sorted_unique_int_vertices.end(), + ext_vertices_for_sorted_unique_int_vertices.begin(), invalid_vertex_id::value, invalid_vertex_id::value, handle.get_stream()); @@ -667,4 +672,102 @@ std::enable_if_t unrenumber_local_int_edges(raft::handle_t con do_expensive_check); } +template +void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check) +{ + if (do_expensive_check) { + CUGRAPH_EXPECTS( + thrust::count_if(handle.get_thrust_policy(), + sorted_unique_edge_dsts.begin(), + sorted_unique_edge_dsts.end(), + [int_vertex_last = vertex_partition_range_lasts.back()] __device__(auto v) { + return v != 
invalid_vertex_id_v && + !is_valid_vertex(int_vertex_last, v); + }) == 0, + "Invalid input arguments: there are out-of-range vertices in sorted_unique_edge_dsts."); + CUGRAPH_EXPECTS( + thrust::is_sorted( + handle.get_thrust_policy(), sorted_unique_edge_dsts.begin(), sorted_unique_edge_dsts.end()), + "Invalid input arguments: the input internal edge destinations are not sorted."); + CUGRAPH_EXPECTS( + static_cast(thrust::count_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(sorted_unique_edge_dsts.size()), + detail::is_first_in_run_t{sorted_unique_edge_dsts.data()})) == + sorted_unique_edge_dsts.size(), + "Invalid input arguments: the input internal edge destinations have duplicates."); + } + + if constexpr (multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_size = major_comm.get_size(); + auto const major_comm_rank = major_comm.get_rank(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto const minor_comm_rank = minor_comm.get_rank(); + + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank); + auto local_int_vertex_first = vertex_partition_id == 0 + ? vertex_t{0} + : vertex_partition_range_lasts[vertex_partition_id - 1]; + + rmm::device_uvector ext_vertices_for_sorted_unique_edge_dsts(0, handle.get_stream()); + if constexpr (store_transposed) { + std::vector minor_comm_vertex_partition_range_lasts(minor_comm_size); + for (int i = 0; i < minor_comm_size; ++i) { + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, major_comm_rank, i); + minor_comm_vertex_partition_range_lasts[i] = + vertex_partition_range_lasts[vertex_partition_id]; + } + ext_vertices_for_sorted_unique_edge_dsts = collect_values_for_sorted_unique_int_vertices( + minor_comm, + raft::device_span(sorted_unique_edge_dsts.data(), + sorted_unique_edge_dsts.size()), + renumber_map.begin(), + minor_comm_vertex_partition_range_lasts, + local_int_vertex_first, + handle.get_stream()); + } else { + std::vector major_comm_vertex_partition_range_lasts(major_comm_size); + for (int i = 0; i < major_comm_size; ++i) { + auto vertex_partition_id = + partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( + major_comm_size, minor_comm_size, i, minor_comm_rank); + major_comm_vertex_partition_range_lasts[i] = + vertex_partition_range_lasts[vertex_partition_id]; + } + ext_vertices_for_sorted_unique_edge_dsts = collect_values_for_sorted_unique_int_vertices( + major_comm, + raft::device_span(sorted_unique_edge_dsts.data(), + sorted_unique_edge_dsts.size()), + renumber_map.begin(), + major_comm_vertex_partition_range_lasts, + local_int_vertex_first, + handle.get_stream()); + } + thrust::copy(handle.get_thrust_policy(), + ext_vertices_for_sorted_unique_edge_dsts.begin(), + ext_vertices_for_sorted_unique_edge_dsts.end(), + sorted_unique_edge_dsts.begin()); + } else { + unrenumber_local_int_vertices(handle, + sorted_unique_edge_dsts.data(), + sorted_unique_edge_dsts.size(), + renumber_map.data(), + vertex_t{0}, + vertex_partition_range_lasts[0], + do_expensive_check); + } +} + } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_mg_v32_e32.cu 
b/cpp/src/structure/renumber_utils_mg_v32_e32.cu index 93b18aeab86..987ad8a64e6 100644 --- a/cpp/src/structure/renumber_utils_mg_v32_e32.cu +++ b/cpp/src/structure/renumber_utils_mg_v32_e32.cu @@ -64,4 +64,18 @@ template void unrenumber_local_int_edges( std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check); +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_mg_v64_e64.cu b/cpp/src/structure/renumber_utils_mg_v64_e64.cu index d528ade2a4c..b5911351cca 100644 --- a/cpp/src/structure/renumber_utils_mg_v64_e64.cu +++ b/cpp/src/structure/renumber_utils_mg_v64_e64.cu @@ -64,4 +64,18 @@ template void unrenumber_local_int_edges( std::optional>> const& edgelist_intra_partition_segment_offsets, bool do_expensive_check); +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_sg_v32_e32.cu b/cpp/src/structure/renumber_utils_sg_v32_e32.cu index c1f4807d4a5..d106ac7ff67 100644 --- a/cpp/src/structure/renumber_utils_sg_v32_e32.cu +++ b/cpp/src/structure/renumber_utils_sg_v32_e32.cu @@ -69,4 +69,18 @@ template void unrenumber_local_int_edges(raft::handle_t co int32_t num_vertices, bool do_expensive_check); +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/structure/renumber_utils_sg_v64_e64.cu b/cpp/src/structure/renumber_utils_sg_v64_e64.cu index 7a6e5d368a9..e8caca1c941 100644 --- a/cpp/src/structure/renumber_utils_sg_v64_e64.cu +++ b/cpp/src/structure/renumber_utils_sg_v64_e64.cu @@ -69,4 +69,18 @@ template void unrenumber_local_int_edges(raft::handle_t co int64_t num_vertices, bool do_expensive_check); +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + +template void unrenumber_sorted_unique_local_int_edge_dsts( + raft::handle_t const& handle, + 
raft::device_span sorted_unique_edge_dsts /* [INOUT] */, + raft::device_span renumber_map, + std::vector const& vertex_partition_range_lasts, + bool do_expensive_check); + } // namespace cugraph diff --git a/cpp/src/traversal/extract_bfs_paths_impl.cuh b/cpp/src/traversal/extract_bfs_paths_impl.cuh index 40030e2e39c..d228460bec3 100644 --- a/cpp/src/traversal/extract_bfs_paths_impl.cuh +++ b/cpp/src/traversal/extract_bfs_paths_impl.cuh @@ -220,11 +220,15 @@ std::tuple, vertex_t> extract_bfs_paths( detail::decrement_position{}); if constexpr (multi_gpu) { - current_frontier = collect_values_for_int_vertices(handle, - current_frontier.begin(), - current_frontier.end(), - predecessors, - h_vertex_partition_range_lasts); + auto& comm = handle.get_comms(); + current_frontier = + collect_values_for_int_vertices(comm, + current_frontier.begin(), + current_frontier.end(), + predecessors, + h_vertex_partition_range_lasts, + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); } else { thrust::transform(handle.get_thrust_policy(), current_frontier.begin(), diff --git a/cpp/src/utilities/collect_comm.cuh b/cpp/src/utilities/collect_comm.cuh index 2197409fe26..dc4267aac57 100644 --- a/cpp/src/utilities/collect_comm.cuh +++ b/cpp/src/utilities/collect_comm.cuh @@ -50,79 +50,73 @@ namespace cugraph { -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template -decltype(allocate_dataframe_buffer(0, - rmm::cuda_stream_view{})) -collect_values_for_keys(raft::handle_t const& handle, - KVStoreViewType kv_store_view, - KeyIterator collect_key_first, - KeyIterator collect_key_last, - KeyToGPUIdOp key_to_gpu_id_op) +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template +dataframe_buffer_type_t collect_values_for_keys( + raft::comms::comms_t const& comm, + KVStoreViewType kv_store_view, + KeyIterator collect_key_first, + KeyIterator collect_key_last, + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; static_assert(std::is_same_v::value_type, key_t>); using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - // 1. 
collect values for the unique keys in [collect_key_first, collect_key_last) rmm::device_uvector unique_keys(thrust::distance(collect_key_first, collect_key_last), - handle.get_stream()); + stream_view); thrust::copy( - handle.get_thrust_policy(), collect_key_first, collect_key_last, unique_keys.begin()); - thrust::sort(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end()); + rmm::exec_policy_nosync(stream_view), collect_key_first, collect_key_last, unique_keys.begin()); + thrust::sort(rmm::exec_policy_nosync(stream_view), unique_keys.begin(), unique_keys.end()); unique_keys.resize( thrust::distance( unique_keys.begin(), - thrust::unique(handle.get_thrust_policy(), unique_keys.begin(), unique_keys.end())), - handle.get_stream()); + thrust::unique(rmm::exec_policy(stream_view), unique_keys.begin(), unique_keys.end())), + stream_view); - auto values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); { - rmm::device_uvector rx_unique_keys(0, handle.get_stream()); + rmm::device_uvector rx_unique_keys(0, stream_view); std::vector rx_value_counts{}; std::tie(rx_unique_keys, rx_value_counts) = groupby_gpu_id_and_shuffle_values( comm, unique_keys.begin(), unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); - std::tie(rx_values_for_unique_keys, std::ignore) = - shuffle_values(comm, - get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + auto rx_values_for_unique_keys = allocate_dataframe_buffer(0, stream_view); + std::tie(rx_values_for_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); values_for_unique_keys = std::move(rx_values_for_unique_keys); } // 2. build a kv_store_t object for the k, v pairs in unique_keys, values_for_unique_keys. 
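// Hedged caller-side sketch (not part of the patch; `edge_id_map` and `id_to_rank_op` are
// placeholder names): after this refactor the collective helper takes the communicator and
// the CUDA stream explicitly instead of a raft::handle_t, so a typical call would look like
//
//   auto values = cugraph::collect_values_for_keys(
//     handle.get_comms(),        // previously: handle
//     edge_id_map.view(),        // a kv_store_t<...>::view()
//     edge_ids.begin(),
//     edge_ids.end(),
//     id_to_rank_op,             // key_to_comm_rank_op; must match the kv_store's owning ranks
//     handle.get_stream());      // stream now passed explicitly
//
// which mirrors the updated call sites in lookup_src_dst_impl.cuh and refine_clustering above.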
- kv_store_t unique_key_value_store( - handle.get_stream()); + kv_store_t unique_key_value_store(stream_view); if constexpr (KVStoreViewType::binary_search) { unique_key_value_store = kv_store_t(std::move(unique_keys), std::move(values_for_unique_keys), kv_store_view.invalid_value(), false, - handle.get_stream()); + stream_view); } else { auto kv_pair_first = thrust::make_zip_iterator( thrust::make_tuple(unique_keys.begin(), get_dataframe_buffer_begin(values_for_unique_keys))); auto valid_kv_pair_last = - thrust::remove_if(handle.get_thrust_policy(), + thrust::remove_if(rmm::exec_policy(stream_view), kv_pair_first, kv_pair_first + unique_keys.size(), [invalid_value = kv_store_view.invalid_value()] __device__(auto pair) { @@ -136,176 +130,173 @@ collect_values_for_keys(raft::handle_t const& handle, get_dataframe_buffer_begin(values_for_unique_keys), kv_store_view.invalid_key(), kv_store_view.invalid_value(), - handle.get_stream()); + stream_view); - unique_keys.resize(0, handle.get_stream()); - resize_dataframe_buffer(values_for_unique_keys, 0, handle.get_stream()); - unique_keys.shrink_to_fit(handle.get_stream()); - shrink_to_fit_dataframe_buffer(values_for_unique_keys, handle.get_stream()); + unique_keys.resize(0, stream_view); + resize_dataframe_buffer(values_for_unique_keys, 0, stream_view); + unique_keys.shrink_to_fit(stream_view); + shrink_to_fit_dataframe_buffer(values_for_unique_keys, stream_view); } auto unique_key_value_store_view = unique_key_value_store.view(); // 3. find values for [collect_key_first, collect_key_last) auto value_buffer = allocate_dataframe_buffer( - thrust::distance(collect_key_first, collect_key_last), handle.get_stream()); - unique_key_value_store_view.find(collect_key_first, - collect_key_last, - get_dataframe_buffer_begin(value_buffer), - handle.get_stream()); + thrust::distance(collect_key_first, collect_key_last), stream_view); + unique_key_value_store_view.find( + collect_key_first, collect_key_last, get_dataframe_buffer_begin(value_buffer), stream_view); return value_buffer; } -// for the keys in kv_store_view, key_to_gpu_id_op(key) should coincide with comm.get_rank() -template +// for the keys in kv_store_view, key_to_comm_rank_op(key) should coincide with comm.get_rank() +template std::tuple, - decltype(allocate_dataframe_buffer( - 0, cudaStream_t{nullptr}))> + dataframe_buffer_type_t> collect_values_for_unique_keys( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, KVStoreViewType kv_store_view, rmm::device_uvector&& collect_unique_keys, - KeyToGPUIdOp key_to_gpu_id_op) + KeyToCommRankOp key_to_comm_rank_op, + rmm::cuda_stream_view stream_view) { using key_t = typename KVStoreViewType::key_type; using value_t = typename KVStoreViewType::value_type; - auto& comm = handle.get_comms(); - - auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, handle.get_stream()); + auto values_for_collect_unique_keys = allocate_dataframe_buffer(0, stream_view); { auto [rx_unique_keys, rx_value_counts] = groupby_gpu_id_and_shuffle_values( comm, collect_unique_keys.begin(), collect_unique_keys.end(), - [key_to_gpu_id_op] __device__(auto val) { return key_to_gpu_id_op(val); }, - handle.get_stream()); + [key_to_comm_rank_op] __device__(auto val) { return key_to_comm_rank_op(val); }, + stream_view); auto values_for_rx_unique_keys = - allocate_dataframe_buffer(rx_unique_keys.size(), handle.get_stream()); + allocate_dataframe_buffer(rx_unique_keys.size(), stream_view); kv_store_view.find(rx_unique_keys.begin(), rx_unique_keys.end(), 
get_dataframe_buffer_begin(values_for_rx_unique_keys), - handle.get_stream()); + stream_view); - std::tie(values_for_collect_unique_keys, std::ignore) = - shuffle_values(comm, - get_dataframe_buffer_begin(values_for_rx_unique_keys), - rx_value_counts, - handle.get_stream()); + std::tie(values_for_collect_unique_keys, std::ignore) = shuffle_values( + comm, get_dataframe_buffer_begin(values_for_rx_unique_keys), rx_value_counts, stream_view); } return std::make_tuple(std::move(collect_unique_keys), std::move(values_for_collect_unique_keys)); } template -std::tuple< - rmm::device_uvector, - decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr}))> -collect_values_for_unique_int_vertices(raft::handle_t const& handle, - rmm::device_uvector&& collect_unique_int_vertices, - ValueIterator local_value_first, - std::vector const& vertex_partition_range_lasts) +dataframe_buffer_type_t::value_type> +collect_values_for_sorted_unique_int_vertices( + raft::comms::comms_t const& comm, + raft::device_span collect_sorted_unique_int_vertices, + ValueIterator local_value_first, + std::vector const& comm_rank_vertex_partition_range_lasts, + vertex_t local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using value_t = typename thrust::iterator_traits::value_type; - auto& comm = handle.get_comms(); - auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_size = major_comm.get_size(); - auto const major_comm_rank = major_comm.get_rank(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - auto const minor_comm_rank = minor_comm.get_rank(); + // 1.find tx_counts - // 1. groupby and shuffle internal vertices + rmm::device_uvector d_range_lasts(comm_rank_vertex_partition_range_lasts.size(), + stream_view); + raft::update_device(d_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.data(), + comm_rank_vertex_partition_range_lasts.size(), + stream_view); - rmm::device_uvector d_vertex_partition_range_lasts(vertex_partition_range_lasts.size(), - handle.get_stream()); - raft::update_device(d_vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.data(), - vertex_partition_range_lasts.size(), - handle.get_stream()); + rmm::device_uvector d_offsets(d_range_lasts.size() - 1, stream_view); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + collect_sorted_unique_int_vertices.begin(), + collect_sorted_unique_int_vertices.end(), + d_range_lasts.begin(), + d_range_lasts.begin() + (d_range_lasts.size() - 1), + d_offsets.begin()); - auto [rx_int_vertices, rx_int_vertex_counts] = groupby_gpu_id_and_shuffle_values( - comm, - collect_unique_int_vertices.begin(), - collect_unique_int_vertices.end(), - detail::compute_gpu_id_from_int_vertex_t{ - raft::device_span(d_vertex_partition_range_lasts.data(), - d_vertex_partition_range_lasts.size()), - major_comm_size, - minor_comm_size}, - handle.get_stream()); - - // 2: Lookup return values - - auto vertex_partition_id = - partition_manager::compute_vertex_partition_id_from_graph_subcomm_ranks( - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank); - auto local_int_vertex_first = - vertex_partition_id == 0 ? 
vertex_t{0} : vertex_partition_range_lasts[vertex_partition_id - 1]; - - auto value_buffer = - allocate_dataframe_buffer(rx_int_vertices.size(), handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + std::vector h_offsets(d_offsets.size() + 2); + raft::update_host(h_offsets.data() + 1, d_offsets.data(), d_offsets.size(), stream_view); + h_offsets[0] = 0; + h_offsets.back() = collect_sorted_unique_int_vertices.size(); + RAFT_CUDA_TRY(cudaStreamSynchronize(stream_view)); + + std::vector tx_counts(comm_rank_vertex_partition_range_lasts.size()); + std::adjacent_difference(h_offsets.begin() + 1, h_offsets.end(), tx_counts.begin()); + + // 2. shuffle sorted unique internal vertices to the owning ranks + + auto [rx_int_vertices, rx_counts] = + shuffle_values(comm, collect_sorted_unique_int_vertices.begin(), tx_counts, stream_view); + + // 3.Lookup return values + + auto value_buffer = allocate_dataframe_buffer(rx_int_vertices.size(), stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), rx_int_vertices.begin(), rx_int_vertices.end(), get_dataframe_buffer_begin(value_buffer), - [local_value_first, local_int_vertex_first] __device__(auto v) { - return local_value_first[v - local_int_vertex_first]; + [local_value_first, local_vertex_partition_range_first] __device__(auto v) { + return local_value_first[v - local_vertex_partition_range_first]; }); + rx_int_vertices.resize(0, stream_view); + rx_int_vertices.shrink_to_fit(stream_view); - // 3: Shuffle results back to original GPU + // 4. Shuffle results back to the original ranks - std::tie(value_buffer, std::ignore) = shuffle_values( - comm, get_dataframe_buffer_begin(value_buffer), rx_int_vertex_counts, handle.get_stream()); + std::tie(value_buffer, std::ignore) = + shuffle_values(comm, get_dataframe_buffer_begin(value_buffer), rx_counts, stream_view); - return std::make_tuple(std::move(collect_unique_int_vertices), std::move(value_buffer)); + return value_buffer; } template -decltype(allocate_dataframe_buffer::value_type>( - 0, cudaStream_t{nullptr})) +dataframe_buffer_type_t::value_type> collect_values_for_int_vertices( - raft::handle_t const& handle, + raft::comms::comms_t const& comm, VertexIterator collect_vertex_first, VertexIterator collect_vertex_last, ValueIterator local_value_first, std::vector::value_type> const& - vertex_partition_range_lasts) + comm_rank_vertex_partition_range_lasts, + typename thrust::iterator_traits::value_type local_vertex_partition_range_first, + rmm::cuda_stream_view stream_view) { using vertex_t = typename thrust::iterator_traits::value_type; using value_t = typename thrust::iterator_traits::value_type; size_t input_size = thrust::distance(collect_vertex_first, collect_vertex_last); - rmm::device_uvector sorted_unique_int_vertices(input_size, handle.get_stream()); + rmm::device_uvector sorted_unique_int_vertices(input_size, stream_view); - raft::copy( - sorted_unique_int_vertices.data(), collect_vertex_first, input_size, handle.get_stream()); + raft::copy(sorted_unique_int_vertices.data(), collect_vertex_first, input_size, stream_view); - thrust::sort(handle.get_thrust_policy(), + thrust::sort(rmm::exec_policy_nosync(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); - auto last = thrust::unique(handle.get_thrust_policy(), + auto last = thrust::unique(rmm::exec_policy(stream_view), sorted_unique_int_vertices.begin(), sorted_unique_int_vertices.end()); sorted_unique_int_vertices.resize(thrust::distance(sorted_unique_int_vertices.begin(), last), - 
handle.get_stream()); - - auto [unique_int_vertices, tmp_value_buffer] = collect_values_for_unique_int_vertices( - handle, std::move(sorted_unique_int_vertices), local_value_first, vertex_partition_range_lasts); + stream_view); - kv_store_t kv_map(std::move(unique_int_vertices), + auto tmp_value_buffer = collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(sorted_unique_int_vertices.data(), + sorted_unique_int_vertices.size()), + local_value_first, + comm_rank_vertex_partition_range_lasts, + local_vertex_partition_range_first, + stream_view); + + kv_store_t kv_map(std::move(sorted_unique_int_vertices), std::move(tmp_value_buffer), invalid_vertex_id::value, false, - handle.get_stream()); + stream_view); auto device_view = detail::kv_binary_search_store_device_view_t(kv_map.view()); - auto value_buffer = allocate_dataframe_buffer(input_size, handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), + auto value_buffer = allocate_dataframe_buffer(input_size, stream_view); + thrust::transform(rmm::exec_policy_nosync(stream_view), collect_vertex_first, collect_vertex_last, get_dataframe_buffer_begin(value_buffer), From 6c91fba3c2b42df8f1c9bdc919ef7eb428ca1ec3 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 11 Oct 2024 18:35:56 -0700 Subject: [PATCH 101/126] merge multiple allreduce opreations to a single allreduce operation --- .../prims/detail/per_v_transform_reduce_e.cuh | 223 +++++++++++------- 1 file changed, 135 insertions(+), 88 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index ffa0917fbe6..faa82476b8c 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -972,14 +972,14 @@ __host__ __device__ int priority_to_rank( } template -rmm::device_uvector compute_priorities( +void compute_priorities( raft::comms::comms_t const& comm, ValueIterator value_first, + raft::device_span priorities, std::optional> hypersparse_key_offsets, // we may not have values for the entire "range_size" if // hypersparse_key_offsets.has_value() is true size_t contiguous_size, - size_t range_size, int root, int subgroup_size /* faster interconnect within a subgroup */, typename thrust::iterator_traits::value_type init, @@ -993,8 +993,6 @@ rmm::device_uvector compute_priorities( // more than one, the GPU with (comm_rank == root) has the highest priority, GPUs in the same DGX // node should be the next) - rmm::device_uvector priorities(range_size, stream_view); - if (ignore_local_values) { thrust::fill(rmm::exec_policy_nosync(stream_view), priorities.begin(), @@ -1034,8 +1032,6 @@ rmm::device_uvector compute_priorities( is_not_equal_t::value_type>{init}); } } - - return priorities; } // return selected ranks if root. @@ -2032,6 +2028,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #endif auto loop_count = std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); +#if PER_V_PERFORMANCE_MEASUREMENT + std::vector bcast_sizes(loop_count); +#endif std::conditional_t>, @@ -2074,7 +2073,19 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, : local_key_list_sizes[partition_idx], handle.get_stream()); use_tmp_bcast_buffer = true; +#if PER_V_PERFORMANCE_MEASUREMENT + bcast_sizes[j] = + (v_list_bitmap ? 
packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]) + : local_key_list_sizes[partition_idx]) * + sizeof(uint32_t); +#endif } +#if PER_V_PERFORMANCE_MEASUREMENT + else { + bcast_sizes[j] = local_key_list_sizes[partition_idx] * sizeof(vertex_t); + } +#endif } if (!use_tmp_bcast_buffer) { edge_partition_key_buffers.push_back(allocate_dataframe_buffer( @@ -2666,12 +2677,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } if constexpr (std::is_same_v>) { - std::vector, - rmm::device_uvector, - rmm::device_uvector>> - edge_partition_priorities{}; - edge_partition_priorities.reserve(loop_count); - std::conditional_t>>, std::byte /* dummy */> @@ -2684,34 +2689,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } std::vector edge_partition_allreduce_sizes(loop_count); + std::vector edge_partition_allreduce_displacements(loop_count); std::vector edge_partition_contiguous_sizes(loop_count); for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - bool process_local_edges = true; - if constexpr (filter_input_key) { - if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } - } - - std::optional> hypersparse_non_deg1_key_offsets{ - std::nullopt}; - if constexpr (filter_input_key) { - if (edge_partition_hypersparse_key_offset_vectors) { - hypersparse_non_deg1_key_offsets = raft::device_span( - (*edge_partition_hypersparse_key_offset_vectors)[j].data(), - (*edge_partition_hypersparse_key_offset_vectors)[j].size() - - (edge_partition_deg1_hypersparse_key_offset_counts - ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] - : size_t{0})); - (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j] = - *hypersparse_non_deg1_key_offsets; - } - } - + auto partition_idx = i + j; auto const& output_buffer = edge_partition_major_output_buffers[j]; size_t allreduce_size{}; @@ -2740,18 +2722,69 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } edge_partition_allreduce_sizes[j] = allreduce_size; edge_partition_contiguous_sizes[j] = contiguous_size; + } + std::exclusive_scan(edge_partition_allreduce_sizes.begin(), + edge_partition_allreduce_sizes.end(), + edge_partition_allreduce_displacements.begin(), + size_t{0}); + std::variant, + rmm::device_uvector, + rmm::device_uvector> + aggregate_priorities = rmm::device_uvector(0, handle.get_stream()); + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities) + .resize(edge_partition_allreduce_displacements.back() + + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } else if (minor_comm_size <= + std::numeric_limits::max()) { // priority == uint16_t + aggregate_priorities = rmm::device_uvector( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } else { // priority == uint32_t + aggregate_priorities = rmm::device_uvector( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); + } + if (stream_pool_indices) { handle.sync_stream(); } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); + + std::optional> hypersparse_non_deg1_key_offsets{ + std::nullopt}; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + hypersparse_non_deg1_key_offsets = raft::device_span( + (*edge_partition_hypersparse_key_offset_vectors)[j].data(), + (*edge_partition_hypersparse_key_offset_vectors)[j].size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j] = + *hypersparse_non_deg1_key_offsets; + } + } + + auto const& output_buffer = edge_partition_major_output_buffers[j]; + + bool process_local_edges = true; + if constexpr (filter_input_key) { + if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } + } - std::variant, - rmm::device_uvector, - rmm::device_uvector> - priorities = rmm::device_uvector(0, loop_stream); if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t - priorities = compute_priorities( + compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), + raft::device_span( + std::get<0>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), hypersparse_non_deg1_key_offsets, - contiguous_size, - allreduce_size, + edge_partition_contiguous_sizes[j], static_cast(partition_idx), subgroup_size, init, @@ -2759,61 +2792,59 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, loop_stream); } else if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint16_t - priorities = compute_priorities( + compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), + raft::device_span( + std::get<1>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), hypersparse_non_deg1_key_offsets, - contiguous_size, - allreduce_size, + edge_partition_contiguous_sizes[j], static_cast(partition_idx), subgroup_size, init, process_local_edges ? false : true /* ignore_local_values */, loop_stream); } else { // priority == uint32_t - priorities = compute_priorities( + compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), + raft::device_span( + std::get<2>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), hypersparse_non_deg1_key_offsets, - contiguous_size, - allreduce_size, + edge_partition_contiguous_sizes[j], static_cast(partition_idx), subgroup_size, init, process_local_edges ? false : true /* ignore_local_values */, loop_stream); } - edge_partition_priorities.push_back(std::move(priorities)); } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT auto subtime6 = std::chrono::steady_clock::now(); #endif - device_group_start(minor_comm); - for (size_t j = 0; j < loop_count; ++j) { - auto& priorities = edge_partition_priorities[j]; - if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t - device_allreduce(minor_comm, - std::get<0>(priorities).data(), - std::get<0>(priorities).data(), - std::get<0>(priorities).size(), - raft::comms::op_t::MIN, - handle.get_stream()); - } else if (minor_comm_size <= - std::numeric_limits::max()) { // priority == uint16_t - CUGRAPH_FAIL( - "unimplemented."); // currently, raft does not support allreduce on uint16_t. 
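// A minimal host-side sketch of the buffer-fusion idea used by the replacement code below
// (plain std::vector stands in for the device buffers; `sizes` is assumed to hold one
// allreduce size per edge partition; std::exclusive_scan requires <numeric>):
//
//   std::vector<size_t> sizes = edge_partition_allreduce_sizes;
//   std::vector<size_t> displs(sizes.size());
//   std::exclusive_scan(sizes.begin(), sizes.end(), displs.begin(), size_t{0});
//   // One aggregate buffer of displs.back() + sizes.back() elements is filled so that
//   // partition j occupies [displs[j], displs[j] + sizes[j]); a single MIN allreduce over
//   // this buffer then replaces the per-partition allreduce loop that was previously
//   // wrapped in device_group_start()/device_group_end().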
- } else { // priority == uint32_t - device_allreduce(minor_comm, - std::get<2>(priorities).data(), - std::get<2>(priorities).data(), - std::get<2>(priorities).size(), - raft::comms::op_t::MIN, - handle.get_stream()); - } + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + device_allreduce(minor_comm, + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).data(), + std::get<0>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); + } else if (minor_comm_size <= + std::numeric_limits::max()) { // priority == uint16_t + CUGRAPH_FAIL( + "unimplemented."); // currently, raft does not support allreduce on uint16_t. + } else { // priority == uint32_t + device_allreduce(minor_comm, + std::get<2>(aggregate_priorities).data(), + std::get<2>(aggregate_priorities).data(), + std::get<2>(aggregate_priorities).size(), + raft::comms::op_t::MIN, + handle.get_stream()); } - device_group_end(minor_comm); if (stream_pool_indices) { handle.sync_stream(); } #if PER_V_PERFORMANCE_MEASUREMENT auto subtime7 = std::chrono::steady_clock::now(); @@ -2846,54 +2877,65 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto contiguous_size = edge_partition_contiguous_sizes[j]; - auto& priorities = edge_partition_priorities[j]; std::variant, std::optional>> selected_ranks_or_flags = rmm::device_uvector(0, loop_stream); if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + auto priorities = raft::device_span( + std::get<0>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); selected_ranks_or_flags = compute_selected_ranks_from_priorities( minor_comm, - raft::device_span(std::get<0>(priorities).data(), - std::get<0>(priorities).size()), + priorities, hypersparse_non_deg1_key_offsets, contiguous_size, static_cast(partition_idx), subgroup_size, process_local_edges ? false : true /* ignore_local_values */, loop_stream); - std::get<0>(priorities).resize(0, loop_stream); - std::get<0>(priorities).shrink_to_fit(loop_stream); } else if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint16_t + auto priorities = raft::device_span( + std::get<1>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); selected_ranks_or_flags = compute_selected_ranks_from_priorities( minor_comm, - raft::device_span(std::get<1>(priorities).data(), - std::get<1>(priorities).size()), + priorities, hypersparse_non_deg1_key_offsets, contiguous_size, static_cast(partition_idx), subgroup_size, process_local_edges ? false : true /* ignore_local_values */, loop_stream); - std::get<1>(priorities).resize(0, loop_stream); - std::get<1>(priorities).shrink_to_fit(loop_stream); } else { // priority_t == uint32_t + auto priorities = raft::device_span( + std::get<2>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]); selected_ranks_or_flags = compute_selected_ranks_from_priorities( minor_comm, - raft::device_span(std::get<2>(priorities).data(), - std::get<2>(priorities).size()), + priorities, hypersparse_non_deg1_key_offsets, contiguous_size, static_cast(partition_idx), subgroup_size, process_local_edges ? 
false : true /* ignore_local_values */, loop_stream); - std::get<2>(priorities).resize(0, loop_stream); - std::get<2>(priorities).shrink_to_fit(loop_stream); } edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); } -#if PER_V_PERFORMANCE_MEASUREMENT if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t + std::get<0>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<0>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } else if (minor_comm_size <= + std::numeric_limits::max()) { // priority == uint16_t + std::get<1>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<1>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } else { + std::get<2>(aggregate_priorities).resize(0, handle.get_stream()); + std::get<2>(aggregate_priorities).shrink_to_fit(handle.get_stream()); + } + if (stream_pool_indices) { handle.sync_stream(); } +#if PER_V_PERFORMANCE_MEASUREMENT auto subtime8 = std::chrono::steady_clock::now(); #endif @@ -3457,6 +3499,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, << subdur5.count() << "," << subdur6.count() << "," << subdur7.count() << "," << subdur8.count() << "," << subdur9.count() << "," << subdur10.count() << "," << subdur11.count() << ")" << std::endl; + raft::print_host_vector("bcast_sizes", bcast_sizes.data(), bcast_sizes.size(), std::cerr); + raft::print_host_vector("edge_partition_allreduce_sizes", + edge_partition_allreduce_sizes.data(), + edge_partition_allreduce_sizes.size(), + std::cerr); #endif } else { device_group_start(minor_comm); From 0bd734f5995442abd7417c5e470c6c6342dd810a Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 12 Oct 2024 21:42:23 -0700 Subject: [PATCH 102/126] update detail::per_v_transform_reudce_e --- .../detail/extract_transform_v_frontier_e.cuh | 18 +- .../prims/detail/per_v_transform_reduce_e.cuh | 2320 +++++++++++------ ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 1 + 3 files changed, 1473 insertions(+), 866 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 29da5f0d126..e451c18f4be 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -475,7 +475,7 @@ __global__ static void extract_transform_v_frontier_e_high_degree( } } -#define EXTRACT_PERFORMANCE_MEASUREMENT 1 +#define EXTRACT_PERFORMANCE_MEASUREMENT 1 // FIXME: delete template (size_t{0}, loop_stream)); } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete +#if EXTRACT_PERFORMANCE_MEASUREMENT auto subtime2 = std::chrono::steady_clock::now(); #endif @@ -1235,7 +1235,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete +#if EXTRACT_PERFORMANCE_MEASUREMENT auto subtime5 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; std::chrono::duration subdur1 = subtime2 - subtime1; @@ -1247,7 +1247,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, << ") loop_count=" << loop_count << std::endl; #endif } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete +#if EXTRACT_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time2 = 
std::chrono::steady_clock::now(); #endif @@ -1298,7 +1298,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } -#if EXTRACT_PERFORMANCE_MEASUREMENT // FIXME: delete +#if EXTRACT_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time3 = std::chrono::steady_clock::now(); std::chrono::duration dur0 = time1 - time0; diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index faa82476b8c..6309e79c42e 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -898,15 +898,13 @@ rank_to_priority(int rank, int comm_size, vertex_t offset /* to evenly distribute traffic */) { - using cast_t = - std::conditional_t, - int16_t, - std::conditional_t, - int32_t, - int64_t>>; // to prevent overflow (assuming that - // comm_size <= - // std::numeric_limits::max()) - if (rank == root) { // no need for communication (priority 0) + static_assert(std::is_same_v || std::is_same_v); + using cast_t = std::conditional_t, + int16_t, + int64_t>; // to prevent overflow (assuming that comm_size <= + // std::numeric_limits::max()) no need + // for communication (priority 0) + if (rank == root) { return priority_t{0}; } else if (rank / subgroup_size == root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in @@ -940,14 +938,11 @@ __host__ __device__ int priority_to_rank( int comm_size, vertex_t offset /* to evenly distribute traffict */) { - using cast_t = - std::conditional_t, - int16_t, - std::conditional_t, - int32_t, - int64_t>>; // to prevent overflow (assuming that - // comm_size <= - // std::numeric_limits::max()) + static_assert(std::is_same_v || std::is_same_v); + using cast_t = std::conditional_t, + int16_t, + int64_t>; // to prevent overflow (assuming that comm_size <= + // std::numeric_limits::max()) if (priority == priority_t{0}) { return root; } else if (priority < static_cast(subgroup_size)) { @@ -976,7 +971,7 @@ void compute_priorities( raft::comms::comms_t const& comm, ValueIterator value_first, raft::device_span priorities, - std::optional> + std::optional, raft::device_span>> hypersparse_key_offsets, // we may not have values for the entire "range_size" if // hypersparse_key_offsets.has_value() is true size_t contiguous_size, @@ -1015,21 +1010,39 @@ void compute_priorities( priorities.begin() + contiguous_size, priorities.end(), std::numeric_limits::max()); - auto priority_first = thrust::make_transform_iterator( - (*hypersparse_key_offsets).begin(), - cuda::proclaim_return_type( - [root, subgroup_size, comm_rank, comm_size] __device__(auto offset) { - return rank_to_priority( - comm_rank, root, subgroup_size, comm_size, static_cast(offset)); - })); - thrust::scatter_if( - rmm::exec_policy_nosync(stream_view), - priority_first, - priority_first + (*hypersparse_key_offsets).size(), - (*hypersparse_key_offsets).begin(), - value_first + contiguous_size, - priorities.begin(), - is_not_equal_t::value_type>{init}); + if ((*hypersparse_key_offsets).index() == 0) { + auto priority_first = thrust::make_transform_iterator( + std::get<0>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(uint32_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<0>(*hypersparse_key_offsets).size(), + 
std::get<0>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } else { + auto priority_first = thrust::make_transform_iterator( + std::get<1>(*hypersparse_key_offsets).begin(), + cuda::proclaim_return_type( + [root, subgroup_size, comm_rank, comm_size] __device__(size_t offset) { + return rank_to_priority( + comm_rank, root, subgroup_size, comm_size, static_cast(offset)); + })); + thrust::scatter_if( + rmm::exec_policy_nosync(stream_view), + priority_first, + priority_first + std::get<1>(*hypersparse_key_offsets).size(), + std::get<1>(*hypersparse_key_offsets).begin(), + value_first + contiguous_size, + priorities.begin(), + is_not_equal_t::value_type>{init}); + } } } } @@ -1038,11 +1051,14 @@ void compute_priorities( // otherwise, it is sufficient to just return bool flags indiciating whether this rank's values are // selected or not. template -std::variant /* root */, std::optional>> +std::variant, + int, + priority_t>> /* root, store selected ranks */, + std::optional> /* store bitmap */> compute_selected_ranks_from_priorities( raft::comms::comms_t const& comm, raft::device_span priorities, - std::optional> + std::optional, raft::device_span>> hypersparse_key_offsets, // we may not have values for the entire "range_size" if // hypersparse_key_offsets.has_value() is true size_t contiguous_size, @@ -1054,8 +1070,10 @@ compute_selected_ranks_from_priorities( auto const comm_rank = comm.get_rank(); auto const comm_size = comm.get_size(); + using rank_t = std::conditional_t, int, priority_t>; + if (comm_rank == root) { - rmm::device_uvector selected_ranks(priorities.size(), stream_view); + rmm::device_uvector selected_ranks(priorities.size(), stream_view); auto offset_priority_pair_first = thrust::make_zip_iterator(thrust::make_counting_iterator(vertex_t{0}), priorities.begin()); thrust::transform(rmm::exec_policy_nosync(stream_view), @@ -1069,7 +1087,7 @@ compute_selected_ranks_from_priorities( ? comm_size : priority_to_rank( priority, root, subgroup_size, comm_size, offset); - return rank; + return static_cast(rank); }); return selected_ranks; } else { @@ -1077,7 +1095,9 @@ compute_selected_ranks_from_priorities( if (!ignore_local_values) { keep_flags = rmm::device_uvector( packed_bool_size(hypersparse_key_offsets - ? (contiguous_size + (*hypersparse_key_offsets).size()) + ? (contiguous_size + ((*hypersparse_key_offsets).index() == 0 + ? std::get<0>(*hypersparse_key_offsets).size() + : std::get<1>(*hypersparse_key_offsets).size())) : contiguous_size), stream_view); thrust::fill(rmm::exec_policy_nosync(stream_view), @@ -1108,32 +1128,63 @@ compute_selected_ranks_from_priorities( } }); if (hypersparse_key_offsets) { - auto pair_first = - thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), - (*hypersparse_key_offsets).begin()); - thrust::for_each( - rmm::exec_policy_nosync(stream_view), - pair_first, - pair_first + (*hypersparse_key_offsets).size(), - [priorities = raft::device_span(priorities.data(), priorities.size()), - keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), - root, - subgroup_size, - comm_rank, - comm_size] __device__(auto pair) { - auto offset = thrust::get<1>(pair); - auto priority = priorities[offset]; - auto rank = (priority == std::numeric_limits::max()) - ? 
comm_size - : priority_to_rank( - priority, root, subgroup_size, comm_size, offset); - if (rank == comm_rank) { - cuda::atomic_ref word( - keep_flags[packed_bool_offset(thrust::get<0>(pair))]); - word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), - cuda::std::memory_order_relaxed); - } - }); + if ((*hypersparse_key_offsets).index() == 0) { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<0>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<0>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } else { + auto pair_first = + thrust::make_zip_iterator(thrust::make_counting_iterator(size_t{contiguous_size}), + std::get<1>(*hypersparse_key_offsets).begin()); + thrust::for_each( + rmm::exec_policy_nosync(stream_view), + pair_first, + pair_first + std::get<1>(*hypersparse_key_offsets).size(), + [priorities = raft::device_span(priorities.data(), priorities.size()), + keep_flags = raft::device_span((*keep_flags).data(), (*keep_flags).size()), + root, + subgroup_size, + comm_rank, + comm_size] __device__(auto pair) { + auto offset = thrust::get<1>(pair); + auto priority = priorities[offset]; + auto rank = + (priority == std::numeric_limits::max()) + ? 
comm_size + : priority_to_rank( + priority, root, subgroup_size, comm_size, static_cast(offset)); + if (rank == comm_rank) { + cuda::atomic_ref word( + keep_flags[packed_bool_offset(thrust::get<0>(pair))]); + word.fetch_or(packed_bool_mask(thrust::get<0>(pair)), + cuda::std::memory_order_relaxed); + } + }); + } } } return keep_flags; @@ -1245,18 +1296,20 @@ void per_v_transform_reduce_e_edge_partition( handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[2]; } - segment_key_iterator_t segment_key_first{}; + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value if constexpr (use_input_key) { segment_key_first = edge_partition_key_first; } else { segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); } - segment_key_first += (*key_segment_offsets)[2]; + *segment_key_first += (*key_segment_offsets)[2]; detail::per_v_transform_reduce_e_low_degree <<>>( edge_partition, - segment_key_first, - segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]), + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1277,18 +1330,20 @@ void per_v_transform_reduce_e_edge_partition( handle.get_device_properties().maxGridSize[0]); auto segment_output_buffer = output_buffer; if constexpr (update_major) { segment_output_buffer += (*key_segment_offsets)[1]; } - segment_key_iterator_t segment_key_first{}; + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value if constexpr (use_input_key) { segment_key_first = edge_partition_key_first; } else { segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); } - segment_key_first += (*key_segment_offsets)[1]; + *segment_key_first += (*key_segment_offsets)[1]; detail::per_v_transform_reduce_e_mid_degree <<>>( edge_partition, - segment_key_first, - segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]), + *segment_key_first, + *segment_key_first + ((*key_segment_offsets)[2] - (*key_segment_offsets)[1]), edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1311,7 +1366,9 @@ void per_v_transform_reduce_e_edge_partition( ? 
detail::per_v_transform_reduce_e_kernel_high_degree_reduce_any_block_size : detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - segment_key_iterator_t segment_key_first{}; + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value if constexpr (use_input_key) { segment_key_first = edge_partition_key_first; } else { @@ -1320,8 +1377,8 @@ void per_v_transform_reduce_e_edge_partition( detail::per_v_transform_reduce_e_high_degree <<>>( edge_partition, - segment_key_first, - segment_key_first + (*key_segment_offsets)[1], + *segment_key_first, + *segment_key_first + (*key_segment_offsets)[1], edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1347,7 +1404,9 @@ void per_v_transform_reduce_e_edge_partition( raft::grid_1d_thread_t update_grid(num_keys, detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); - segment_key_iterator_t segment_key_first{}; + std::optional + segment_key_first{}; // std::optional as thrust::transform_iterator's default constructor + // is a deleted function, segment_key_first should always have a value if constexpr (use_input_key) { segment_key_first = edge_partition_key_first; } else { @@ -1356,8 +1415,8 @@ void per_v_transform_reduce_e_edge_partition( detail::per_v_transform_reduce_e_low_degree <<>>( edge_partition, - segment_key_first, - segment_key_first + num_keys, + *segment_key_first, + *segment_key_first + num_keys, edge_partition_src_value_input, edge_partition_dst_value_input, edge_partition_e_value_input, @@ -1452,9 +1511,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, static_assert(is_arithmetic_or_thrust_tuple_of_arithmetic::value); - constexpr bool try_bitmap = GraphViewType::is_multi_gpu && - !std::is_same_v && - std::is_same_v; + constexpr bool try_bitmap = + GraphViewType::is_multi_gpu && use_input_key && std::is_same_v; [[maybe_unused]] constexpr auto max_segments = detail::num_sparse_segments_per_vertex_partition + size_t{1}; @@ -1852,6 +1910,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // 6. 
compute optional bitmap info & compressed vertex list + bool v_compressible{false}; std:: conditional_t>, std::byte /* dummy */> v_list_bitmap{}; @@ -1864,8 +1923,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (minor_comm_size > 1) { auto const minor_comm_rank = minor_comm.get_rank(); - bool v_compressible{false}; - if constexpr (sizeof(vertex_t) > sizeof(uint32_t)) { + if constexpr (sizeof(vertex_t) == 8) { vertex_t local_v_list_max_range_size{0}; for (int i = 0; i < minor_comm_size; ++i) { auto range_size = local_v_list_range_lasts[i] - local_v_list_range_firsts[i]; @@ -1911,6 +1969,34 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } + bool uint32_key_output_offset = false; + if constexpr (GraphViewType::is_multi_gpu && update_major && + std::is_same_v>) { + size_t max_key_offset_size = std::numeric_limits::max(); + if constexpr (filter_input_key) { + max_key_offset_size = std::reduce( + local_key_list_sizes.begin(), local_key_list_sizes.end(), size_t{0}, [](auto l, auto r) { + return std::max(l, r); + }); + } else { + static_assert(!use_input_key); + for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(i)); + auto const& segment_offsets = graph_view.local_edge_partition_segment_offsets(i); + + auto output_range_size = + segment_offsets ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + + max_key_offset_size = std::max(static_cast(output_range_size), max_key_offset_size); + } + } + uint32_key_output_offset = + (max_key_offset_size <= static_cast(std::numeric_limits::max())); + } + // 7. set-up stream pool std::optional> stream_pool_indices{std::nullopt}; @@ -2010,11 +2096,15 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, decltype(minor_tmp_buffer->mutable_view().value_first())>, void /* dummy */>; + auto counters = allocate_optional_dataframe_buffer< + std::conditional_t>( + num_concurrent_loops, handle.get_stream()); + if constexpr (!GraphViewType::is_multi_gpu || !use_input_key) { if (stream_pool_indices) { handle.sync_stream(); } } - // 9. proces local edge partitions + // 9. 
process local edge partitions #if PER_V_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -2032,17 +2122,24 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::vector bcast_sizes(loop_count); #endif - std::conditional_t>, - std::byte /* dummy */> + std::conditional_t< + GraphViewType::is_multi_gpu && use_input_key, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> edge_partition_key_buffers{}; std::conditional_t>>, + std::optional, rmm::device_uvector>>>, std::byte /* dummy */> - edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in the - // hypersparse region + edge_partition_hypersparse_key_offset_vectors{}; // drop zero local degree keys in th + // hypersparse regione std::conditional_t>, std::byte /* dummy */> edge_partition_deg1_hypersparse_key_offset_counts{}; + std::vector process_local_edges(loop_count, true); + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); @@ -2053,43 +2150,57 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::conditional_t, std::optional>>, std::byte /* dummy */> - edge_partition_tmp_bcast_buffers{std::nullopt}; - if constexpr (std::is_same_v) { - if (v_list_bitmap || compressed_v_list) { - edge_partition_tmp_bcast_buffers = std::vector>{}; - (*edge_partition_tmp_bcast_buffers).reserve(loop_count); + edge_partition_bitmap_buffers{std::nullopt}; + if constexpr (try_bitmap) { + if (v_list_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); } } for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - bool use_tmp_bcast_buffer = false; - if constexpr (std::is_same_v) { - if (edge_partition_tmp_bcast_buffers) { - (*edge_partition_tmp_bcast_buffers) - .emplace_back(v_list_bitmap - ? packed_bool_size(local_v_list_range_lasts[partition_idx] - - local_v_list_range_firsts[partition_idx]) - : local_key_list_sizes[partition_idx], + auto partition_idx = i + j; + + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]), handle.get_stream()); - use_tmp_bcast_buffer = true; + use_bitmap_buffer = true; #if PER_V_PERFORMANCE_MEASUREMENT - bcast_sizes[j] = - (v_list_bitmap ? packed_bool_size(local_v_list_range_lasts[partition_idx] - - local_v_list_range_firsts[partition_idx]) - : local_key_list_sizes[partition_idx]) * - sizeof(uint32_t); + bcast_sizes[j] = packed_bool_size(local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]) * + sizeof(uint32_t); #endif } #if PER_V_PERFORMANCE_MEASUREMENT else { - bcast_sizes[j] = local_key_list_sizes[partition_idx] * sizeof(vertex_t); + bcast_sizes[j] = local_key_list_sizes[partition_idx] * + (v_compressible ? 
sizeof(uint32_t) : sizeof(vertex_t)); } #endif } - if (!use_tmp_bcast_buffer) { - edge_partition_key_buffers.push_back(allocate_dataframe_buffer( - local_key_list_sizes[partition_idx], handle.get_stream())); + if (!use_bitmap_buffer) { + bool allocated{false}; + if constexpr (try_bitmap) { + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_key_list_sizes[partition_idx], handle.get_stream())); + allocated = true; + } + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_key_list_sizes[partition_idx], handle.get_stream())); + } + } + + if constexpr (filter_input_key) { + if (static_cast(partition_idx) == minor_comm_rank) { + process_local_edges[j] = false; + } } } #if PER_V_PERFORMANCE_MEASUREMENT @@ -2099,28 +2210,31 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, device_group_start(minor_comm); for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - bool use_tmp_bcast_buffer = false; - if constexpr (std::is_same_v) { + auto partition_idx = i + j; + if constexpr (try_bitmap) { if (v_list_bitmap) { device_bcast(minor_comm, (*v_list_bitmap).data(), - get_dataframe_buffer_begin((*edge_partition_tmp_bcast_buffers)[j]), - size_dataframe_buffer((*edge_partition_tmp_bcast_buffers)[j]), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), static_cast(partition_idx), handle.get_stream()); - use_tmp_bcast_buffer = true; } else if (compressed_v_list) { device_bcast(minor_comm, (*compressed_v_list).data(), - get_dataframe_buffer_begin((*edge_partition_tmp_bcast_buffers)[j]), - size_dataframe_buffer((*edge_partition_tmp_bcast_buffers)[j]), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } else { + device_bcast(minor_comm, + sorted_unique_key_first, + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + local_key_list_sizes[partition_idx], static_cast(partition_idx), handle.get_stream()); - use_tmp_bcast_buffer = true; } - } - if (!use_tmp_bcast_buffer) { + } else { device_bcast(minor_comm, sorted_unique_key_first, get_dataframe_buffer_begin(edge_partition_key_buffers[j]), @@ -2135,10 +2249,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, subtime2 = std::chrono::steady_clock::now(); #endif - if constexpr (std::is_same_v) { - if (edge_partition_tmp_bcast_buffers) { - // copy data (in the sparse region first if filter_input_key is true) to - // edge_partition_key_buffers[j] + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + // copy keys from temporary bitmap buffers to key buffers (copy only the sparse segments + // if filter_input_key is true) for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; @@ -2146,56 +2260,58 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - bool process_local_edges{true}; - if constexpr (filter_input_key) { - process_local_edges = (static_cast(partition_idx) != minor_comm_rank); + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + } else { + keys = rmm::device_uvector( + process_local_edges[j] ? 
local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); } - auto keys = allocate_dataframe_buffer( - process_local_edges ? local_key_list_sizes[partition_idx] : size_t{0}, loop_stream); - if (process_local_edges) { + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + if (process_local_edges[j]) { auto range_first = local_v_list_range_firsts[partition_idx]; - if (v_list_bitmap) { - auto range_last = local_v_list_range_lasts[partition_idx]; - if constexpr (filter_input_key) { - if (graph_view.use_dcs()) { // skip copying the hypersparse segment - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - range_first = - std::min(range_first, *(edge_partition.major_hypersparse_first())); - range_last = std::min(range_last, *(edge_partition.major_hypersparse_first())); - } + auto range_last = local_v_list_range_lasts[partition_idx]; + if constexpr (filter_input_key) { + if (graph_view.use_dcs()) { // skip copying the hypersparse segment + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + range_last = std::min(range_last, *(edge_partition.major_hypersparse_first())); } - auto const& rx_tmps = (*edge_partition_tmp_bcast_buffers)[j]; - rmm::device_scalar dummy(size_t{0}, loop_stream); - retrieve_vertex_list_from_bitmap( - raft::device_span(rx_tmps.data(), rx_tmps.size()), - get_dataframe_buffer_begin(keys), - raft::device_span(dummy.data(), size_t{1}), - range_first, - range_last, - loop_stream); - } else { - assert(compressed_v_list); - auto const& rx_tmps = (*edge_partition_tmp_bcast_buffers)[j]; - auto input_first = rx_tmps.begin(); - auto input_last = rx_tmps.end(); - if constexpr (filter_input_key) { - if (graph_view.use_dcs()) { // skip copying the hypersparse segment - auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - input_last = input_first + key_segment_offsets[3]; - } + } + if (range_first < range_last) { + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. + // (*key_segment_offset_vectors)[partition_idx][3]) + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span( + counters.data() + j, + size_t{1}), // dummy, we already know the counts (i.e. 
+ // (*key_segment_offset_vectors)[partition_idx][3]) + range_first, + range_last, + loop_stream); } - thrust::transform( - rmm::exec_policy_nosync(loop_stream), - input_first, - input_last, - get_dataframe_buffer_begin(keys), - cuda::proclaim_return_type([range_first] __device__(uint32_t v_offset) { - return static_cast(range_first + v_offset); - })); } + } else { + rx_bitmap.resize(0, loop_stream); + rx_bitmap.shrink_to_fit(loop_stream); } edge_partition_key_buffers.push_back(std::move(keys)); } @@ -2205,22 +2321,91 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (filter_input_key) { if (graph_view.use_dcs()) { edge_partition_hypersparse_key_offset_vectors = - std::vector>{}; + std::vector, rmm::device_uvector>>{}; (*edge_partition_hypersparse_key_offset_vectors).reserve(loop_count); - edge_partition_deg1_hypersparse_key_offset_counts = std::vector(loop_count); + edge_partition_deg1_hypersparse_key_offset_counts = std::vector(loop_count, 0); + + std::conditional_t, + rmm::device_uvector>>, + std::vector>>>, + std::byte /* dummy */> + edge_partition_new_key_buffers{}; + bool allocate_new_key_buffer{true}; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { allocate_new_key_buffer = false; } + } + if (allocate_new_key_buffer) { // allocate new key buffers and copy the sparse segment + // keys to the new key buffers + if constexpr (try_bitmap) { + edge_partition_new_key_buffers = std::vector< + std::variant, rmm::device_uvector>>{}; + } else { + edge_partition_new_key_buffers = std::vector>{}; + } + (*edge_partition_new_key_buffers).reserve(loop_count); - std::vector> edge_partition_count_scalars{}; - edge_partition_count_scalars.reserve(loop_count); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); - std::optional>> - edge_partition_tmp_key_buffers{}; - bool direct_copy = false; /// directly copy to edge_partition_key_buffers[] - if constexpr (std::is_same_v) { - if (edge_partition_tmp_bcast_buffers) { direct_copy = true; } - } - if (!direct_copy) { // copy the hypersparse keys to a temporary key buffer first - edge_partition_tmp_key_buffers = std::vector>{}; - (*edge_partition_tmp_key_buffers).reserve(loop_count); + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + if constexpr (try_bitmap) { + if (v_compressible) { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<0>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<0>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } else { + auto new_key_buffer = rmm::device_uvector( + process_local_edges[j] ? 
local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy( + rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + std::get<1>(edge_partition_key_buffers[j]).resize(0, loop_stream); + std::get<1>(edge_partition_key_buffers[j]).shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } else { + auto new_key_buffer = allocate_dataframe_buffer( + process_local_edges[j] ? local_key_list_sizes[partition_idx] : size_t{0}, + loop_stream); + if (process_local_edges[j]) { + thrust::copy(rmm::exec_policy_nosync(loop_stream), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + get_dataframe_buffer_begin(edge_partition_key_buffers[j]) + + key_segment_offsets[3], + get_dataframe_buffer_begin(new_key_buffer)); + } else { + edge_partition_key_buffers[j].resize(0, loop_stream); + edge_partition_key_buffers[j].shrink_to_fit(loop_stream); + } + (*edge_partition_new_key_buffers).push_back(std::move(new_key_buffer)); + } + } } for (size_t j = 0; j < loop_count; ++j) { @@ -2231,85 +2416,66 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); - - auto& keys = edge_partition_key_buffers[j]; - auto offsets = rmm::device_uvector( - process_local_edges ? (key_segment_offsets[4] - key_segment_offsets[3]) : vertex_t{0}, - loop_stream); - rmm::device_scalar count(size_t{0}, loop_stream); - auto tmp_keys = allocate_dataframe_buffer( - edge_partition_tmp_key_buffers ? (key_segment_offsets[4] - key_segment_offsets[3]) - : vertex_t{0}, - loop_stream); + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? 
(key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } - if (process_local_edges) { + if (process_local_edges[j]) { auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(partition_idx)); auto const& segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); -#if 1 auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); -#else - // FIXME: we can pre-compute this & store in graph_t - rmm::device_uvector segment_bitmap( - packed_bool_size((*segment_offsets)[4] - (*segment_offsets)[3]), loop_stream); - thrust::fill(rmm::exec_policy_nosync(loop_stream), - segment_bitmap.begin(), - segment_bitmap.end(), - packed_bool_empty_mask()); - thrust::for_each( - rmm::exec_policy_nosync(loop_stream), - *(edge_partition.dcs_nzd_vertices()), - *(edge_partition.dcs_nzd_vertices()) + *(edge_partition.dcs_nzd_vertex_count()), - [bitmap = raft::device_span(segment_bitmap.data(), segment_bitmap.size()), - major_hypersparse_first = - *(edge_partition.major_hypersparse_first())] __device__(auto major) { - auto major_offset = major - major_hypersparse_first; - cuda::atomic_ref word( - bitmap[packed_bool_offset(major_offset)]); - word.fetch_or(packed_bool_mask(major_offset), cuda::std::memory_order_relaxed); - }); -#endif - auto range_offset_first = std::min( - (edge_partition.major_range_first() + (*segment_offsets)[3] > - local_v_list_range_firsts[partition_idx]) - ? ((edge_partition.major_range_first() + (*segment_offsets)[3]) - - local_v_list_range_firsts[partition_idx]) - : vertex_t{0}, - local_v_list_range_lasts[partition_idx] - local_v_list_range_firsts[partition_idx]); - - if constexpr (std::is_same_v) { - if (edge_partition_tmp_bcast_buffers) { - auto const& rx_tmps = (*edge_partition_tmp_bcast_buffers)[j]; - if (v_list_bitmap) { - auto range_offset_last = - std::min(((edge_partition.major_range_first() + (*segment_offsets)[4]) > - local_v_list_range_firsts[partition_idx]) - ? ((edge_partition.major_range_first() + (*segment_offsets)[4]) - - local_v_list_range_firsts[partition_idx]) - : vertex_t{0}, - local_v_list_range_lasts[partition_idx] - - local_v_list_range_firsts[partition_idx]); + if constexpr (try_bitmap) { + if (v_list_bitmap) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto range_offset_first = + std::min((edge_partition.major_range_first() + (*segment_offsets)[3] > + local_v_list_range_firsts[partition_idx]) + ? ((edge_partition.major_range_first() + (*segment_offsets)[3]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + auto range_offset_last = + std::min(((edge_partition.major_range_first() + (*segment_offsets)[4]) > + local_v_list_range_firsts[partition_idx]) + ? 
((edge_partition.major_range_first() + (*segment_offsets)[4]) - + local_v_list_range_firsts[partition_idx]) + : vertex_t{0}, + local_v_list_range_lasts[partition_idx] - + local_v_list_range_firsts[partition_idx]); + if (range_offset_first < range_offset_last) { auto count_first = thrust::make_transform_iterator( thrust::make_counting_iterator(packed_bool_offset(range_offset_first)), cuda::proclaim_return_type( [range_bitmap = - raft::device_span(rx_tmps.data(), rx_tmps.size()), + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), range_offset_first] __device__(size_t i) { auto word = range_bitmap[i]; if (i == packed_bool_offset(range_offset_first)) { word &= ~packed_bool_partial_mask( range_offset_first % - packed_bools_per_word()); // exclude the bits in the sparse region + packed_bools_per_word()); // clear the bits in the sparse region } return static_cast(__popc(word)); })); rmm::device_uvector count_displacements( - rx_tmps.size() - packed_bool_offset(range_offset_first), loop_stream); + rx_bitmap.size() - packed_bool_offset(range_offset_first), loop_stream); thrust::exclusive_scan(rmm::exec_policy_nosync(loop_stream), count_first, count_first + count_displacements.size(), @@ -2318,7 +2484,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, thrust::make_counting_iterator(range_offset_first), cuda::proclaim_return_type( [range_bitmap = - raft::device_span(rx_tmps.data(), rx_tmps.size()), + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), count_displacements = raft::device_span( count_displacements.data(), count_displacements.size()), range_offset_first, @@ -2328,7 +2494,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, packed_bool_offset(range_offset_first)) { word &= ~packed_bool_partial_mask( range_offset_first % - packed_bools_per_word()); // exclude the bits in the sparse region + packed_bools_per_word()); // clear the bits in the sparse region } return static_cast( start_offset + @@ -2337,78 +2503,188 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, __popc(word & packed_bool_partial_mask(range_offset % packed_bools_per_word()))); })); - auto input_pair_first = thrust::make_zip_iterator( - thrust::make_counting_iterator(local_v_list_range_firsts[partition_idx] + - range_offset_first), - offset_first); - detail::copy_if_nosync( - input_pair_first, - input_pair_first + (range_offset_last - range_offset_first), - thrust::make_transform_iterator( - thrust::make_counting_iterator(range_offset_first), - cuda::proclaim_return_type( - [range_bitmap = - raft::device_span(rx_tmps.data(), rx_tmps.size()), - segment_bitmap = raft::device_span( - segment_bitmap.data(), segment_bitmap.size()), - range_first = local_v_list_range_firsts[partition_idx], - major_hypersparse_first = - *(edge_partition - .major_hypersparse_first())] __device__(auto range_offset) { - auto segment_offset = - (range_first + range_offset) - major_hypersparse_first; - return ((range_bitmap[packed_bool_offset(range_offset)] & - packed_bool_mask(range_offset)) != packed_bool_empty_mask()) && - ((segment_bitmap[packed_bool_offset(segment_offset)] & - packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); - })), - thrust::make_zip_iterator( - get_dataframe_buffer_begin(keys) + key_segment_offsets[3], offsets.begin()), - raft::device_span(count.data(), size_t{1}), - loop_stream); + auto flag_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(range_offset_first), + cuda::proclaim_return_type( + [range_bitmap = + 
raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + major_hypersparse_first = + *(edge_partition + .major_hypersparse_first())] __device__(auto range_offset) { + auto segment_offset = + (range_first + range_offset) - major_hypersparse_first; + return ((range_bitmap[packed_bool_offset(range_offset)] & + packed_bool_mask(range_offset)) != packed_bool_empty_mask()) && + ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (keys.index() == 0) { + if (offsets.index() == 0) { + auto input_pair_first = thrust::make_zip_iterator( + thrust::make_counting_iterator(range_offset_first), + thrust::make_transform_iterator(offset_first, + typecast_t{})); + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (range_offset_last - range_offset_first), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = thrust::make_zip_iterator( + thrust::make_counting_iterator(range_offset_first), offset_first); + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (range_offset_last - range_offset_first), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + if (offsets.index() == 0) { + auto input_pair_first = thrust::make_zip_iterator( + thrust::make_counting_iterator(local_v_list_range_firsts[partition_idx] + + range_offset_first), + thrust::make_transform_iterator(offset_first, + typecast_t{})); + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (range_offset_last - range_offset_first), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = thrust::make_zip_iterator( + thrust::make_counting_iterator(local_v_list_range_firsts[partition_idx] + + range_offset_first), + offset_first); + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (range_offset_last - range_offset_first), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } } else { - assert(compressed_v_list); - auto input_pair_first = thrust::make_zip_iterator( - thrust::make_transform_iterator( - rx_tmps.begin() + key_segment_offsets[3], - cuda::proclaim_return_type( - [range_first = - local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { - return static_cast(range_first + v_offset); - })), - thrust::make_counting_iterator(static_cast(key_segment_offsets[3]))); - detail::copy_if_nosync( - input_pair_first, - input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), - thrust::make_transform_iterator( - rx_tmps.begin() + key_segment_offsets[3], - cuda::proclaim_return_type( - [segment_bitmap = raft::device_span( - segment_bitmap.data(), segment_bitmap.size()), - range_first = 
local_v_list_range_firsts[partition_idx], - major_hypersparse_first = *( - edge_partition.major_hypersparse_first())] __device__(auto v_offset) { - auto segment_offset = - (range_first + v_offset) - major_hypersparse_first; - return ((segment_bitmap[packed_bool_offset(segment_offset)] & - packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); - })), - thrust::make_zip_iterator( - get_dataframe_buffer_begin(keys) + key_segment_offsets[3], offsets.begin()), - raft::device_span(count.data(), size_t{1}), - loop_stream); + thrust::fill(rmm::exec_policy_nosync(loop_stream), + counters.data() + j, + counters.data() + (j + 1), + size_t{0}); } } } - if (edge_partition_tmp_key_buffers) { - auto input_pair_first = - thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), - thrust::make_counting_iterator(size_t{0})) + - key_segment_offsets[3]; - detail::copy_if_nosync( - input_pair_first, - input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), - thrust::make_transform_iterator( + if (edge_partition_new_key_buffers) { + auto& new_keys = (*edge_partition_new_key_buffers)[j]; + if constexpr (try_bitmap) { + assert(!v_list_bitmap); + if (keys.index() == 0) { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + range_first = local_v_list_range_firsts[partition_idx], + major_hypersparse_first = + *(edge_partition + .major_hypersparse_first())] __device__(uint32_t v_offset) { + auto v = range_first + static_cast(v_offset); + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<0>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<0>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], + cuda::proclaim_return_type( + [segment_bitmap = raft::device_span(segment_bitmap.data(), + segment_bitmap.size()), + major_hypersparse_first = + *(edge_partition.major_hypersparse_first())] __device__(vertex_t v) { + auto segment_offset = v - major_hypersparse_first; + return ((segment_bitmap[packed_bool_offset(segment_offset)] & + packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + })); + if (offsets.index() == 0) { + auto input_pair_first = + 
thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(std::get<1>(keys)), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(std::get<1>(new_keys)) + + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } else { + auto flag_first = thrust::make_transform_iterator( get_dataframe_buffer_begin(keys) + key_segment_offsets[3], cuda::proclaim_return_type( [segment_bitmap = raft::device_span(segment_bitmap.data(), @@ -2419,28 +2695,51 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, thrust_tuple_get_or_identity(key) - major_hypersparse_first; return ((segment_bitmap[packed_bool_offset(segment_offset)] & packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); - })), - thrust::make_zip_iterator(get_dataframe_buffer_begin(tmp_keys), offsets.begin()), - raft::device_span(count.data(), size_t{1}), - loop_stream); + })); + if (offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(uint32_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + std::get<0>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(keys), + thrust::make_counting_iterator(size_t{0})) + + key_segment_offsets[3]; + detail::copy_if_nosync( + input_pair_first, + input_pair_first + (key_segment_offsets[4] - key_segment_offsets[3]), + flag_first, + thrust::make_zip_iterator( + get_dataframe_buffer_begin(new_keys) + key_segment_offsets[3], + std::get<1>(offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } } } (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); - edge_partition_count_scalars.push_back(std::move(count)); - if (edge_partition_tmp_key_buffers) { - (*edge_partition_tmp_key_buffers).push_back(std::move(tmp_keys)); + if (edge_partition_new_key_buffers) { + edge_partition_key_buffers[j] = std::move((*edge_partition_new_key_buffers)[j]); } } - if (edge_partition_tmp_bcast_buffers) { (*edge_partition_tmp_bcast_buffers).clear(); } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (edge_partition_bitmap_buffers) { (*edge_partition_bitmap_buffers).clear(); } std::vector h_counts(loop_count); - for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - h_counts[j] = edge_partition_count_scalars[j].value(loop_stream); - } + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; @@ -2448,34 +2747,104 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - auto process_local_edges = (static_cast(partition_idx) != minor_comm_rank); - if (process_local_edges) { + if (process_local_edges[j]) { auto& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - if (edge_partition_tmp_key_buffers) { - auto const& tmp_keys = (*edge_partition_tmp_key_buffers)[j]; - thrust::copy(rmm::exec_policy_nosync(loop_stream), - get_dataframe_buffer_begin(tmp_keys), - get_dataframe_buffer_begin(tmp_keys) + h_counts[j], - get_dataframe_buffer_begin(edge_partition_key_buffers[j]) + - key_segment_offsets[3]); + auto& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + resize_dataframe_buffer( + std::get<0>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } else { + resize_dataframe_buffer( + std::get<1>(keys), key_segment_offsets[3] + h_counts[j], loop_stream); + } + } else { + resize_dataframe_buffer(keys, key_segment_offsets[3] + h_counts[j], loop_stream); + } + // skip shrink_to_fit to cut execution time + + auto& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(h_counts[j], loop_stream); + } else { + std::get<1>(offsets).resize(h_counts[j], loop_stream); } - resize_dataframe_buffer( - edge_partition_key_buffers[j], key_segment_offsets[3] + h_counts[j], loop_stream); - (*edge_partition_hypersparse_key_offset_vectors)[j].resize(h_counts[j], loop_stream); // skip shrink_to_fit to cut execution time - (*edge_partition_deg1_hypersparse_key_offset_counts)[j] = - size_dataframe_buffer((*edge_partition_hypersparse_key_offset_vectors)[j]) - - static_cast(thrust::distance( - get_dataframe_buffer_begin((*edge_partition_hypersparse_key_offset_vectors)[j]), - thrust::lower_bound( - rmm::exec_policy_nosync(loop_stream), - get_dataframe_buffer_begin((*edge_partition_hypersparse_key_offset_vectors)[j]), - get_dataframe_buffer_end((*edge_partition_hypersparse_key_offset_vectors)[j]), - local_key_list_sizes[partition_idx] - - (*local_key_list_deg1_sizes)[partition_idx]))); } } + + { // update edge_partition_deg1_hypersparse_key_offset_counts + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + + std::vector h_ptrs( + loop_count); // pointers to hypersparse key offset vectors + std::vector h_scalars( + loop_count * 2); // (key offset vector sizes, start degree 1 key offset) + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + if (process_local_edges[j]) { + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + if (offsets.index() == 0) { + h_ptrs[j] = static_cast(std::get<0>(offsets).data()); + h_scalars[j * 2] = std::get<0>(offsets).size(); + } else { + h_ptrs[j] = static_cast(std::get<1>(offsets).data()); + h_scalars[j * 2] = std::get<1>(offsets).size(); + } + h_scalars[j * 2 + 1] = + local_key_list_sizes[partition_idx] - (*local_key_list_deg1_sizes)[partition_idx]; + } else { + h_ptrs[j] = static_cast(nullptr); + h_scalars[j * 2] = size_t{0}; + h_scalars[j * 2 + 
1] = size_t{0}; + } + } + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + rmm::device_uvector d_scalars(h_scalars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_scalars.data(), h_scalars.data(), h_scalars.size(), handle.get_stream()); + rmm::device_uvector d_counts(loop_count, handle.get_stream()); + thrust::transform( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(loop_count), + d_counts.begin(), + cuda::proclaim_return_type( + [d_ptrs = raft::device_span(d_ptrs.data(), d_ptrs.size()), + d_scalars = raft::device_span(d_scalars.data(), d_scalars.size()), + uint32_key_output_offset] __device__(auto i) { + auto first = d_ptrs[i]; + if (first != static_cast(nullptr)) { + auto size = d_scalars[i * 2]; + auto start_offset = d_scalars[i * 2 + 1]; + if (uint32_key_output_offset) { + auto casted_first = static_cast(first); + return size - static_cast(thrust::distance( + casted_first, + thrust::lower_bound(thrust::seq, + casted_first, + casted_first + size, + static_cast(start_offset)))); + } else { + auto casted_first = static_cast(first); + return size - + static_cast(thrust::distance( + casted_first, + thrust::lower_bound( + thrust::seq, casted_first, casted_first + size, start_offset))); + } + } else { + return size_t{0}; + } + })); + raft::update_host((*edge_partition_deg1_hypersparse_key_offset_counts).data(), + d_counts.data(), + d_counts.size(), + handle.get_stream()); + handle.sync_stream(); + } } } } @@ -2491,6 +2860,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu && update_major) { edge_partition_major_output_buffers.reserve(loop_count); } + for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; auto loop_stream = stream_pool_indices @@ -2499,18 +2869,30 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu && update_major) { size_t buffer_size{0}; - if constexpr (use_input_key) { - buffer_size = size_dataframe_buffer(edge_partition_key_buffers[j]); - } else { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto const& segment_offsets = - graph_view.local_edge_partition_segment_offsets(partition_idx); + if (process_local_edges[j]) { + if constexpr (use_input_key) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + buffer_size = size_dataframe_buffer(std::get<0>(keys)); + } else { + buffer_size = size_dataframe_buffer(std::get<1>(keys)); + } + } else { + buffer_size = size_dataframe_buffer(keys); + } + } else { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); - buffer_size = segment_offsets - ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ - : edge_partition.major_range_size(); + buffer_size = + segment_offsets + ? 
*((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ + : edge_partition.major_range_size(); + } } edge_partition_major_output_buffers.push_back( allocate_dataframe_buffer(buffer_size, loop_stream)); @@ -2522,141 +2904,169 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #endif for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto edge_partition_e_mask = - edge_mask_view - ? thrust::make_optional< - detail::edge_partition_edge_property_device_view_t>( - *edge_mask_view, partition_idx) - : thrust::nullopt; - auto edge_partition_stream_pool_indices = - stream_pool_indices ? std::make_optional>( - (*stream_pool_indices).data() + j * max_segments, max_segments) - : std::nullopt; - - T major_init{}; - T major_identity_element{}; - if constexpr (update_major) { - if constexpr (std::is_same_v>) { // if any edge has a non-init value, one - // of the non-init values will be - // selected. - major_init = init; - major_identity_element = init; - } else { - major_init = ReduceOp::identity_element; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - major_init = (static_cast(partition_idx) == minor_comm_rank) - ? init - : ReduceOp::identity_element; + if (process_local_edges[j]) { + auto partition_idx = i + j; + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto edge_partition_e_mask = + edge_mask_view + ? thrust::make_optional< + detail::edge_partition_edge_property_device_view_t>( + *edge_mask_view, partition_idx) + : thrust::nullopt; + auto edge_partition_stream_pool_indices = + stream_pool_indices ? std::make_optional>( + (*stream_pool_indices).data() + j * max_segments, max_segments) + : std::nullopt; + + T major_init{}; + T major_identity_element{}; + if constexpr (update_major) { + if constexpr (std::is_same_v>) { // if any edge has a non-init value, + // one of the non-init values will + // be selected. + major_init = init; + major_identity_element = init; } else { - major_init = init; + major_init = ReduceOp::identity_element; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + major_init = (static_cast(partition_idx) == minor_comm_rank) + ? 
init + : ReduceOp::identity_element; + } else { + major_init = init; + } + major_identity_element = ReduceOp::identity_element; } - major_identity_element = ReduceOp::identity_element; } - } - auto edge_partition_key_first = sorted_unique_key_first; - auto edge_partition_key_last = sorted_unique_nzd_key_last; - if constexpr (GraphViewType::is_multi_gpu && use_input_key) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - edge_partition_key_first = get_dataframe_buffer_begin(edge_partition_key_buffers[j]); - edge_partition_key_last = get_dataframe_buffer_end(edge_partition_key_buffers[j]); - } - } - - std::optional> key_segment_offsets{std::nullopt}; - if constexpr (use_input_key) { - if (key_segment_offset_vectors) { - key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - if constexpr (filter_input_key) { - if (edge_partition_hypersparse_key_offset_vectors) { - (*key_segment_offsets).back() = static_cast( - thrust::distance(edge_partition_key_first, edge_partition_key_last)); - *((*key_segment_offsets).rbegin() + 1) = (*key_segment_offsets).back(); + std::optional> key_segment_offsets{std::nullopt}; + if constexpr (use_input_key) { + if (key_segment_offset_vectors) { + key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + if constexpr (filter_input_key) { + if (edge_partition_hypersparse_key_offset_vectors) { + (*key_segment_offsets).back() = + size_dataframe_buffer(edge_partition_major_output_buffers[j]); + *((*key_segment_offsets).rbegin() + 1) = (*key_segment_offsets).back(); + } } } + } else { + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + if (segment_offsets) { + key_segment_offsets = std::vector((*segment_offsets).size()); + std::transform((*segment_offsets).begin(), + (*segment_offsets).end(), + (*key_segment_offsets).begin(), + [](vertex_t offset) { return static_cast(offset); }); + } } - } else { - auto const& segment_offsets = - graph_view.local_edge_partition_segment_offsets(partition_idx); - if (segment_offsets) { - key_segment_offsets = std::vector((*segment_offsets).size()); - std::transform((*segment_offsets).begin(), - (*segment_offsets).end(), - (*key_segment_offsets).begin(), - [](vertex_t offset) { return static_cast(offset); }); - } - } - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; - edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; - if constexpr (GraphViewType::is_storage_transposed) { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); - } else { - edge_partition_src_value_input = - edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); - edge_partition_dst_value_input = - edge_partition_dst_input_device_view_t(edge_dst_value_input); - } - auto edge_partition_e_value_input = - edge_partition_e_input_device_view_t(edge_value_input, partition_idx); - - std::conditional_t, - edge_partition_minor_output_device_view_t>, - VertexValueOutputIterator> - output_buffer{}; - if constexpr (GraphViewType::is_multi_gpu) { - if constexpr (update_major) { - output_buffer = get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]); + edge_partition_src_input_device_view_t edge_partition_src_value_input{}; + 
edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; + if constexpr (GraphViewType::is_storage_transposed) { + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input, partition_idx); } else { - output_buffer = - edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); + edge_partition_src_value_input = + edge_partition_src_input_device_view_t(edge_src_value_input, partition_idx); + edge_partition_dst_value_input = + edge_partition_dst_input_device_view_t(edge_dst_value_input); + } + auto edge_partition_e_value_input = + edge_partition_e_input_device_view_t(edge_value_input, partition_idx); + + std::conditional_t, + edge_partition_minor_output_device_view_t>, + VertexValueOutputIterator> + output_buffer{}; + if constexpr (GraphViewType::is_multi_gpu) { + if constexpr (update_major) { + output_buffer = get_dataframe_buffer_begin(edge_partition_major_output_buffers[j]); + } else { + output_buffer = + edge_partition_minor_output_device_view_t(minor_tmp_buffer->mutable_view()); + } + } else { + output_buffer = tmp_vertex_value_output_first; } - } else { - output_buffer = tmp_vertex_value_output_first; - } - bool process_local_edges = true; - if constexpr (filter_input_key) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } - } + bool processed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_v_list_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + edge_partition_key_first, + edge_partition_key_first + std::get<0>(keys).size(), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op, + key_segment_offsets ? std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, + edge_partition_stream_pool_indices); + processed = true; + } + } + if (!processed) { + auto edge_partition_key_first = sorted_unique_key_first; + auto edge_partition_key_last = sorted_unique_nzd_key_last; + if constexpr (GraphViewType::is_multi_gpu && use_input_key) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + edge_partition_key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + edge_partition_key_last = get_dataframe_buffer_end(std::get<1>(keys)); + } else { + edge_partition_key_first = get_dataframe_buffer_begin(keys); + edge_partition_key_last = get_dataframe_buffer_end(keys); + } + } - if (process_local_edges) { - per_v_transform_reduce_e_edge_partition( - handle, - edge_partition, - edge_partition_key_first, - edge_partition_key_last, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - output_buffer, - e_op, - major_init, - major_identity_element, - reduce_op, - pred_op, - key_segment_offsets ? 
std::make_optional>( - (*key_segment_offsets).data(), (*key_segment_offsets).size()) - : std::nullopt, - edge_partition_stream_pool_indices); + per_v_transform_reduce_e_edge_partition( + handle, + edge_partition, + edge_partition_key_first, + edge_partition_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_buffer, + e_op, + major_init, + major_identity_element, + reduce_op, + pred_op, + key_segment_offsets ? std::make_optional>( + (*key_segment_offsets).data(), (*key_segment_offsets).size()) + : std::nullopt, + edge_partition_stream_pool_indices); + } } } if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } @@ -2670,21 +3080,22 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const minor_comm_size = minor_comm.get_size(); if constexpr (use_input_key) { - if (minor_comm_size > 1) { - edge_partition_key_buffers.clear(); - edge_partition_key_buffers.shrink_to_fit(); - } + edge_partition_key_buffers.clear(); + edge_partition_key_buffers.shrink_to_fit(); } if constexpr (std::is_same_v>) { - std::conditional_t>>, - std::byte /* dummy */> + std::conditional_t< + filter_input_key, + std::optional, raft::device_span>>>, + std::byte /* dummy */> edge_partition_hypersparse_non_deg1_key_offset_spans{}; if constexpr (filter_input_key) { if (edge_partition_hypersparse_key_offset_vectors) { - edge_partition_hypersparse_non_deg1_key_offset_spans = - std::vector>(loop_count); + edge_partition_hypersparse_non_deg1_key_offset_spans = std::vector< + std::variant, raft::device_span>>( + loop_count); } } @@ -2727,20 +3138,13 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_allreduce_sizes.end(), edge_partition_allreduce_displacements.begin(), size_t{0}); - std::variant, - rmm::device_uvector, - rmm::device_uvector> + std::variant, rmm::device_uvector> aggregate_priorities = rmm::device_uvector(0, handle.get_stream()); if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t std::get<0>(aggregate_priorities) - .resize(edge_partition_allreduce_displacements.back() + - edge_partition_allreduce_sizes.back(), - handle.get_stream()); - } else if (minor_comm_size <= - std::numeric_limits::max()) { // priority == uint16_t - aggregate_priorities = rmm::device_uvector( - edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), - handle.get_stream()); + .resize( + edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), + handle.get_stream()); } else { // priority == uint32_t aggregate_priorities = rmm::device_uvector( edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), @@ -2754,16 +3158,28 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - std::optional> hypersparse_non_deg1_key_offsets{ - std::nullopt}; + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; if constexpr (filter_input_key) { if (edge_partition_hypersparse_key_offset_vectors) { - hypersparse_non_deg1_key_offsets = raft::device_span( - (*edge_partition_hypersparse_key_offset_vectors)[j].data(), - (*edge_partition_hypersparse_key_offset_vectors)[j].size() - - (edge_partition_deg1_hypersparse_key_offset_counts - ? 
(*edge_partition_deg1_hypersparse_key_offset_counts)[j] - : size_t{0})); + auto const& offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + + if (offsets.index() == 0) { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<0>(offsets).data(), + std::get<0>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } else { + hypersparse_non_deg1_key_offsets = raft::device_span( + std::get<1>(offsets).data(), + std::get<1>(offsets).size() - + (edge_partition_deg1_hypersparse_key_offset_counts + ? (*edge_partition_deg1_hypersparse_key_offset_counts)[j] + : size_t{0})); + } (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j] = *hypersparse_non_deg1_key_offsets; } @@ -2771,53 +3187,33 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto const& output_buffer = edge_partition_major_output_buffers[j]; - bool process_local_edges = true; - if constexpr (filter_input_key) { - if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } - } - if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), - raft::device_span( - std::get<0>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], - edge_partition_allreduce_sizes[j]), - hypersparse_non_deg1_key_offsets, - edge_partition_contiguous_sizes[j], - static_cast(partition_idx), - subgroup_size, - init, - process_local_edges ? false : true /* ignore_local_values */, - loop_stream); - } else if (minor_comm_size <= - std::numeric_limits::max()) { // priority == uint16_t - compute_priorities( - minor_comm, - get_dataframe_buffer_begin(output_buffer), - raft::device_span( - std::get<1>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], - edge_partition_allreduce_sizes[j]), + raft::device_span(std::get<0>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), hypersparse_non_deg1_key_offsets, edge_partition_contiguous_sizes[j], static_cast(partition_idx), subgroup_size, init, - process_local_edges ? false : true /* ignore_local_values */, + process_local_edges[j] ? false : true /* ignore_local_values */, loop_stream); } else { // priority == uint32_t compute_priorities( minor_comm, get_dataframe_buffer_begin(output_buffer), - raft::device_span( - std::get<2>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], - edge_partition_allreduce_sizes[j]), + raft::device_span(std::get<1>(aggregate_priorities).data() + + edge_partition_allreduce_displacements[j], + edge_partition_allreduce_sizes[j]), hypersparse_non_deg1_key_offsets, edge_partition_contiguous_sizes[j], static_cast(partition_idx), subgroup_size, init, - process_local_edges ? false : true /* ignore_local_values */, + process_local_edges[j] ? false : true /* ignore_local_values */, loop_stream); } } @@ -2833,15 +3229,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::get<0>(aggregate_priorities).size(), raft::comms::op_t::MIN, handle.get_stream()); - } else if (minor_comm_size <= - std::numeric_limits::max()) { // priority == uint16_t - CUGRAPH_FAIL( - "unimplemented."); // currently, raft does not support allreduce on uint16_t. 
- } else { // priority == uint32_t + } else { // priority == uint32_t device_allreduce(minor_comm, - std::get<2>(aggregate_priorities).data(), - std::get<2>(aggregate_priorities).data(), - std::get<2>(aggregate_priorities).size(), + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).data(), + std::get<1>(aggregate_priorities).size(), raft::comms::op_t::MIN, handle.get_stream()); } @@ -2851,7 +3243,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, #endif std::vector< - std::variant, std::optional>>> + std::variant, rmm::device_uvector>, + std::optional>>> edge_partition_selected_ranks_or_flags{}; edge_partition_selected_ranks_or_flags.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { @@ -2860,14 +3253,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - bool process_local_edges = true; - if constexpr (filter_input_key) { - if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } - } - auto const& output_buffer = edge_partition_major_output_buffers[j]; - std::optional> hypersparse_non_deg1_key_offsets{ - std::nullopt}; + std::optional< + std::variant, raft::device_span>> + hypersparse_non_deg1_key_offsets{std::nullopt}; if constexpr (filter_input_key) { if (edge_partition_hypersparse_key_offset_vectors) { hypersparse_non_deg1_key_offsets = @@ -2877,48 +3266,51 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto contiguous_size = edge_partition_contiguous_sizes[j]; - std::variant, std::optional>> - selected_ranks_or_flags = rmm::device_uvector(0, loop_stream); + std::variant, rmm::device_uvector>, + std::optional>> + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + rmm::device_uvector(0, loop_stream)); if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t auto priorities = raft::device_span( std::get<0>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], edge_partition_allreduce_sizes[j]); - selected_ranks_or_flags = compute_selected_ranks_from_priorities( + auto tmp = compute_selected_ranks_from_priorities( minor_comm, priorities, hypersparse_non_deg1_key_offsets, contiguous_size, static_cast(partition_idx), subgroup_size, - process_local_edges ? false : true /* ignore_local_values */, - loop_stream); - } else if (minor_comm_size <= - std::numeric_limits::max()) { // priority == uint16_t - auto priorities = raft::device_span( - std::get<1>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], - edge_partition_allreduce_sizes[j]); - selected_ranks_or_flags = compute_selected_ranks_from_priorities( - minor_comm, - priorities, - hypersparse_non_deg1_key_offsets, - contiguous_size, - static_cast(partition_idx), - subgroup_size, - process_local_edges ? false : true /* ignore_local_values */, + process_local_edges[j] ? 
false : true /* ignore_local_values */, loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } } else { // priority_t == uint32_t auto priorities = raft::device_span( - std::get<2>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], + std::get<1>(aggregate_priorities).data() + edge_partition_allreduce_displacements[j], edge_partition_allreduce_sizes[j]); - selected_ranks_or_flags = compute_selected_ranks_from_priorities( + auto tmp = compute_selected_ranks_from_priorities( minor_comm, priorities, hypersparse_non_deg1_key_offsets, contiguous_size, static_cast(partition_idx), subgroup_size, - process_local_edges ? false : true /* ignore_local_values */, + process_local_edges[j] ? false : true /* ignore_local_values */, loop_stream); + if (tmp.index() == 0) { + selected_ranks_or_flags = + std::variant, rmm::device_uvector>( + std::move(std::get<0>(tmp))); + } else { + selected_ranks_or_flags = std::move(std::get<1>(tmp)); + } } edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); } @@ -2926,13 +3318,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t std::get<0>(aggregate_priorities).resize(0, handle.get_stream()); std::get<0>(aggregate_priorities).shrink_to_fit(handle.get_stream()); - } else if (minor_comm_size <= - std::numeric_limits::max()) { // priority == uint16_t + } else { std::get<1>(aggregate_priorities).resize(0, handle.get_stream()); std::get<1>(aggregate_priorities).shrink_to_fit(handle.get_stream()); - } else { - std::get<2>(aggregate_priorities).resize(0, handle.get_stream()); - std::get<2>(aggregate_priorities).shrink_to_fit(handle.get_stream()); } if (stream_pool_indices) { handle.sync_stream(); } #if PER_V_PERFORMANCE_MEASUREMENT @@ -2941,8 +3329,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::vector> edge_partition_values{}; edge_partition_values.reserve(loop_count); - std::vector> edge_partition_count_scalars{}; - edge_partition_count_scalars.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; @@ -2950,258 +3336,260 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) : handle.get_stream(); - bool process_local_edges = true; - if constexpr (filter_input_key) { - if (static_cast(partition_idx) == minor_comm_rank) { process_local_edges = false; } - } - auto& output_buffer = edge_partition_major_output_buffers[j]; - auto values = - allocate_dataframe_buffer(size_dataframe_buffer(output_buffer), loop_stream); - rmm::device_scalar count(size_t{0}, loop_stream); - if (minor_comm_rank == static_cast(partition_idx)) { - if (process_local_edges) { + auto values = allocate_dataframe_buffer( + process_local_edges[j] ? 
size_dataframe_buffer(output_buffer) : size_t{0}, loop_stream); + if (process_local_edges[j]) { + if (minor_comm_rank == static_cast(partition_idx)) { assert(!use_input_key); assert(edge_partition_selected_ranks_or_flags[j].index() == 0); auto const& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<0>(selected_ranks).begin(), + cuda::proclaim_return_type([minor_comm_rank] __device__(auto rank) { + return static_cast(rank) == minor_comm_rank; + })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + copy_if_nosync( + get_dataframe_buffer_begin(output_buffer), + get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + thrust::make_transform_iterator( + std::get<1>(selected_ranks).begin(), + cuda::proclaim_return_type( + [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + get_dataframe_buffer_begin(values), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } else { + assert(edge_partition_selected_ranks_or_flags[j].index() == 1); + auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); + size_t input_end_offset{}; + if constexpr (filter_input_key) { + input_end_offset = edge_partition_contiguous_sizes[j]; + if (edge_partition_hypersparse_non_deg1_key_offset_spans) { + auto const& span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (span.index() == 0) { + input_end_offset += std::get<0>(span).size(); + } else { + input_end_offset += std::get<1>(span).size(); + } + } + } else { + input_end_offset = edge_partition_allreduce_sizes[j]; + } copy_if_nosync( get_dataframe_buffer_begin(output_buffer), - get_dataframe_buffer_begin(output_buffer) + edge_partition_allreduce_sizes[j], + get_dataframe_buffer_begin(output_buffer) + input_end_offset, thrust::make_transform_iterator( - selected_ranks.begin(), + thrust::make_counting_iterator(size_t{0}), cuda::proclaim_return_type( - [minor_comm_rank] __device__(auto rank) { return rank == minor_comm_rank; })), + [keep_flags = raft::device_span( + (*keep_flags).data(), (*keep_flags).size())] __device__(size_t offset) { + auto word = keep_flags[packed_bool_offset(offset)]; + return ((word & packed_bool_mask(offset)) != packed_bool_empty_mask()); + })), get_dataframe_buffer_begin(values), - raft::device_span(count.data(), size_t{1}), + raft::device_span(counters.data() + j, size_t{1}), loop_stream); + (*keep_flags).resize(0, loop_stream); + (*keep_flags).shrink_to_fit(loop_stream); } - } else { - assert(edge_partition_selected_ranks_or_flags[j].index() == 1); - auto& keep_flags = std::get<1>(edge_partition_selected_ranks_or_flags[j]); - size_t input_end_offset{}; - if constexpr (filter_input_key) { - input_end_offset = - edge_partition_contiguous_sizes[j] + - (edge_partition_hypersparse_non_deg1_key_offset_spans - ? 
(*edge_partition_hypersparse_non_deg1_key_offset_spans)[j].size() - : size_t{0}); - } else { - input_end_offset = edge_partition_allreduce_sizes[j]; - } - copy_if_nosync( - get_dataframe_buffer_begin(output_buffer), - get_dataframe_buffer_begin(output_buffer) + input_end_offset, - thrust::make_transform_iterator( - thrust::make_counting_iterator(size_t{0}), - cuda::proclaim_return_type( - [keep_flags = raft::device_span( - (*keep_flags).data(), (*keep_flags).size())] __device__(size_t offset) { - auto word = keep_flags[packed_bool_offset(offset)]; - return ((word & packed_bool_mask(offset)) != packed_bool_empty_mask()); - })), - get_dataframe_buffer_begin(values), - raft::device_span(count.data(), size_t{1}), - loop_stream); - (*keep_flags).resize(0, loop_stream); - (*keep_flags).shrink_to_fit(loop_stream); } edge_partition_values.push_back(std::move(values)); - edge_partition_count_scalars.push_back(std::move(count)); } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } std::vector copy_sizes(loop_count); - for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - copy_sizes[j] = edge_partition_count_scalars[j].value(loop_stream); - } + raft::update_host(copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); - std::optional>, - std::vector>>> + std::optional< + std::vector, rmm::device_uvector>>> edge_partition_deg1_hypersparse_output_offset_vectors{}; if (graph_view.use_dcs()) { - size_t max_output_range_size{0}; - if constexpr (filter_input_key) { - max_output_range_size = std::reduce(local_key_list_sizes.begin(), - local_key_list_sizes.end(), - size_t{0}, - [](auto l, auto r) { return std::max(l, r); }); - } else { - for (size_t j = 0; j < loop_count; ++j) { - auto& output_buffer = edge_partition_major_output_buffers[j]; - max_output_range_size = - std::max(size_dataframe_buffer(output_buffer), max_output_range_size); - } - } - if (max_output_range_size < static_cast(std::numeric_limits::max())) { - edge_partition_deg1_hypersparse_output_offset_vectors = - std::vector>{}; - std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); - } else { - edge_partition_deg1_hypersparse_output_offset_vectors = - std::vector>{}; - std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); - } + edge_partition_deg1_hypersparse_output_offset_vectors = + std::vector, rmm::device_uvector>>{}; + (*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - - bool process_local_edges = true; - if constexpr (filter_input_key) { - if (static_cast(partition_idx) == minor_comm_rank) { - process_local_edges = false; - } - } + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); auto& output_buffer = edge_partition_major_output_buffers[j]; - auto& values = edge_partition_values[j]; - auto& count = edge_partition_count_scalars[j]; - - size_t output_offset_buf_size{0}; - if constexpr (filter_input_key) { - output_offset_buf_size = (*edge_partition_deg1_hypersparse_key_offset_counts)[j]; - } else { - assert(!use_input_key); - output_offset_buf_size = - size_dataframe_buffer(output_buffer) - edge_partition_allreduce_sizes[j]; - } - std::variant, rmm::device_uvector> output_offsets = rmm::device_uvector(0, loop_stream); - if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { - std::get<0>(output_offsets).resize(output_offset_buf_size, loop_stream); - } else { - output_offsets = rmm::device_uvector(output_offset_buf_size, loop_stream); + if (!uint32_key_output_offset) { + output_offsets = rmm::device_uvector(0, loop_stream); } - size_t input_start_offset{}; - if constexpr (filter_input_key) { - input_start_offset = - edge_partition_contiguous_sizes[j] + - (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j].size(); - } else { - static_assert(!use_input_key); - input_start_offset = edge_partition_allreduce_sizes[j]; - } - auto flag_first = thrust::make_transform_iterator( - get_dataframe_buffer_begin(output_buffer) + input_start_offset, - cuda::proclaim_return_type( - [init] __device__(auto val) { return val != init; })); + if (process_local_edges[j]) { + auto& values = edge_partition_values[j]; - if constexpr (filter_input_key) { - assert(static_cast(partition_idx) != minor_comm_rank); - auto& hypersparse_key_offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; - if (output_offsets.index() == 0) { - auto input_pair_first = thrust::make_zip_iterator( - get_dataframe_buffer_begin(output_buffer) + input_start_offset, - thrust::make_transform_iterator( - hypersparse_key_offsets.begin() + - (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j].size(), - typecast_t{})); - copy_if_nosync( - input_pair_first, - input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], - flag_first, - thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], - std::get<0>(output_offsets).begin()), - raft::device_span(count.data(), size_t{1}), - loop_stream); + size_t output_offset_buf_size{0}; + if constexpr (filter_input_key) { + output_offset_buf_size = (*edge_partition_deg1_hypersparse_key_offset_counts)[j]; } else { - auto input_pair_first = thrust::make_zip_iterator( - get_dataframe_buffer_begin(output_buffer) + input_start_offset, - hypersparse_key_offsets.begin() + - (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j].size()); - copy_if_nosync( - input_pair_first, - input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], - flag_first, - thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], - std::get<1>(output_offsets).begin()), - raft::device_span(count.data(), size_t{1}), - loop_stream); + assert(!use_input_key); + output_offset_buf_size = + size_dataframe_buffer(output_buffer) - edge_partition_allreduce_sizes[j]; } - hypersparse_key_offsets.resize(0, loop_stream); - hypersparse_key_offsets.shrink_to_fit(loop_stream); - } else { - static_assert(!use_input_key); - assert(process_local_edges); + if (output_offsets.index() == 0) { - auto input_pair_first = - thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), - thrust::make_counting_iterator(uint32_t{0})); 
- copy_if_nosync( - input_pair_first + input_start_offset, - input_pair_first + size_dataframe_buffer(output_buffer), - flag_first, - thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], - std::get<0>(output_offsets).begin()), - raft::device_span(count.data(), size_t{1}), - loop_stream); + std::get<0>(output_offsets).resize(output_offset_buf_size, loop_stream); } else { - auto input_pair_first = - thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), - thrust::make_counting_iterator(size_t{0})); - copy_if_nosync( - input_pair_first + input_start_offset, - input_pair_first + size_dataframe_buffer(output_buffer), - flag_first, - thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], - std::get<1>(output_offsets).begin()), - raft::device_span(count.data(), size_t{1}), - loop_stream); + output_offsets = rmm::device_uvector(output_offset_buf_size, loop_stream); } - } - if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { - std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors) - .push_back(std::move(std::get<0>(output_offsets))); - } else { - assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); - std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors) - .push_back(std::move(std::get<1>(output_offsets))); + size_t input_start_offset{}; + if constexpr (filter_input_key) { + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + input_start_offset = + edge_partition_contiguous_sizes[j] + + (span.index() == 0 ? std::get<0>(span).size() : std::get<1>(span).size()); + } else { + static_assert(!use_input_key); + input_start_offset = edge_partition_allreduce_sizes[j]; + } + auto flag_first = thrust::make_transform_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + cuda::proclaim_return_type( + [init] __device__(auto val) { return val != init; })); + + if constexpr (filter_input_key) { + auto& hypersparse_key_offsets = (*edge_partition_hypersparse_key_offset_vectors)[j]; + auto span = (*edge_partition_hypersparse_non_deg1_key_offset_spans)[j]; + if (hypersparse_key_offsets.index() == 0) { + assert(output_offsets.index() == 0); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<0>(hypersparse_key_offsets).begin() + std::get<0>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<0>(hypersparse_key_offsets).resize(0, loop_stream); + std::get<0>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } else { + assert(output_offsets.index() == 1); + auto input_pair_first = thrust::make_zip_iterator( + get_dataframe_buffer_begin(output_buffer) + input_start_offset, + std::get<1>(hypersparse_key_offsets).begin() + std::get<1>(span).size()); + copy_if_nosync( + input_pair_first, + input_pair_first + (*edge_partition_deg1_hypersparse_key_offset_counts)[j], + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + std::get<1>(hypersparse_key_offsets).resize(0, loop_stream); + 
std::get<1>(hypersparse_key_offsets).shrink_to_fit(loop_stream); + } + } else { + static_assert(!use_input_key); + assert(process_local_edges[j]); + if (output_offsets.index() == 0) { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(uint32_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<0>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } else { + auto input_pair_first = + thrust::make_zip_iterator(get_dataframe_buffer_begin(output_buffer), + thrust::make_counting_iterator(size_t{0})); + copy_if_nosync( + input_pair_first + input_start_offset, + input_pair_first + size_dataframe_buffer(output_buffer), + flag_first, + thrust::make_zip_iterator(get_dataframe_buffer_begin(values) + copy_sizes[j], + std::get<1>(output_offsets).begin()), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } } + (*edge_partition_deg1_hypersparse_output_offset_vectors) + .push_back(std::move(output_offsets)); + resize_dataframe_buffer(output_buffer, 0, loop_stream); shrink_to_fit_dataframe_buffer(output_buffer, loop_stream); } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + + std::vector deg1_copy_sizes(loop_count); + raft::update_host( + deg1_copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - auto deg1_copy_size = edge_partition_count_scalars[j].value(loop_stream); - copy_sizes[j] += deg1_copy_size; - if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { - std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j].resize( - deg1_copy_size, loop_stream); - } else { - assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); - std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j].resize( - deg1_copy_size, loop_stream); + if (process_local_edges[j]) { + copy_sizes[j] += deg1_copy_sizes[j]; + auto& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + std::get<0>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } else { + assert(offsets.index() == 1); + std::get<1>(offsets).resize(deg1_copy_sizes[j], handle.get_stream()); + } + // skip shrink_to_fit() to cut execution time } - // skip shrink_to_fit() to cut execution time } } for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - resize_dataframe_buffer(edge_partition_values[j], copy_sizes[j], loop_stream); - // skip shrink_to_fit() to cut execution time + if (process_local_edges[j]) { + resize_dataframe_buffer(edge_partition_values[j], copy_sizes[j], handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT + if (stream_pool_indices) { handle.sync_stream(); } auto subtime9 = std::chrono::steady_clock::now(); #endif + size_t min_element_size{cache_line_size}; + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(T), min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); + } + assert((cache_line_size % min_element_size) == 0); + size_t value_alignment = cache_line_size / min_element_size; + + size_t offset_alignment = 1; + if (graph_view.use_dcs()) { + static_assert(((cache_line_size % sizeof(uint32_t)) == 0) && + ((cache_line_size % sizeof(size_t)) == 0)); + offset_alignment = + cache_line_size / (uint32_key_output_offset ? sizeof(uint32_t) : sizeof(size_t)); + } + std::optional> rx_value_sizes{}; std::optional> rx_value_displs{}; std::optional> rx_values{}; @@ -3211,20 +3599,21 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::optional, rmm::device_uvector>> rx_offsets{}; { - auto size_per_rank = loop_count * (graph_view.use_dcs() ? 2 : 1); + auto size_per_rank = + loop_count * (graph_view.use_dcs() ? 2 /* value buffer size, offset buffer size */ + : 1 /* value buffer size */); rmm::device_uvector d_aggregate_buffer_sizes(minor_comm_size * size_per_rank, handle.get_stream()); std::vector h_buffer_sizes(size_per_rank); for (size_t j = 0; j < loop_count; ++j) { h_buffer_sizes[j] = size_dataframe_buffer(edge_partition_values[j]); if (graph_view.use_dcs()) { - if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { - h_buffer_sizes[loop_count + j] = - std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j].size(); + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { + h_buffer_sizes[loop_count + j] = std::get<0>(offsets).size(); } else { - assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); - h_buffer_sizes[loop_count + j] = - std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j].size(); + assert(offsets.index() == 1); + h_buffer_sizes[loop_count + j] = std::get<1>(offsets).size(); } } } @@ -3259,22 +3648,37 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, h_aggregate_buffer_sizes[k * size_per_rank + loop_count + j]; } } - std::exclusive_scan((*rx_value_sizes).begin(), - (*rx_value_sizes).end(), - (*rx_value_displs).begin(), - size_t{0}); - std::exclusive_scan((*rx_offset_sizes).begin(), - (*rx_offset_sizes).end(), - (*rx_offset_displs).begin(), - size_t{0}); + + std::vector aligned_sizes(minor_comm_size); + for (int k = 0; k < minor_comm_size; ++k) { + if (k == (minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_value_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_value_sizes)[k], value_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_value_displs).begin(), size_t{0}); + + if (graph_view.use_dcs()) { + for (int k = 0; k < 
minor_comm_size; ++k) { + if (k == (minor_comm_size - 1)) { + aligned_sizes[k] = (*rx_offset_sizes)[k]; + } else { + aligned_sizes[k] = raft::round_up_safe((*rx_offset_sizes)[k], value_alignment); + } + } + std::exclusive_scan( + aligned_sizes.begin(), aligned_sizes.end(), (*rx_offset_displs).begin(), size_t{0}); + } + rx_values = allocate_dataframe_buffer( (*rx_value_displs).back() + (*rx_value_sizes).back(), handle.get_stream()); if (graph_view.use_dcs()) { - if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { + if (uint32_key_output_offset) { rx_offsets = rmm::device_uvector( (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); } else { - assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); rx_offsets = rmm::device_uvector( (*rx_offset_displs).back() + (*rx_offset_sizes).back(), handle.get_stream()); } @@ -3318,46 +3722,43 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto partition_idx = i + j; auto& values = edge_partition_values[j]; - if ((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 0) { - auto& offsets = - std::get<0>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + auto const& offsets = (*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + if (offsets.index() == 0) { if (minor_comm_rank == static_cast(partition_idx)) { device_gatherv(minor_comm, - offsets.data(), + std::get<0>(offsets).data(), std::get<0>(*rx_offsets).data(), - offsets.size(), + std::get<0>(offsets).size(), *rx_offset_sizes, *rx_offset_displs, static_cast(partition_idx), handle.get_stream()); } else { device_gatherv(minor_comm, - offsets.data(), + std::get<0>(offsets).data(), static_cast(nullptr), - offsets.size(), + std::get<0>(offsets).size(), std::vector{}, std::vector{}, static_cast(partition_idx), handle.get_stream()); } } else { - assert((*edge_partition_deg1_hypersparse_output_offset_vectors).index() == 1); - auto& offsets = - std::get<1>(*edge_partition_deg1_hypersparse_output_offset_vectors)[j]; + assert(offsets.index() == 1); if (minor_comm_rank == static_cast(partition_idx)) { device_gatherv(minor_comm, - offsets.data(), + std::get<1>(offsets).data(), std::get<1>(*rx_offsets).data(), - offsets.size(), + std::get<1>(offsets).size(), *rx_offset_sizes, *rx_offset_displs, static_cast(partition_idx), handle.get_stream()); } else { device_gatherv(minor_comm, - offsets.data(), + std::get<1>(offsets).data(), static_cast(nullptr), - offsets.size(), + std::get<1>(offsets).size(), std::vector{}, std::vector{}, static_cast(partition_idx), @@ -3380,8 +3781,145 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, auto j = static_cast(minor_comm_rank % num_concurrent_loops); auto partition_idx = i + j; - auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); - auto old_size = selected_ranks.size(); + { // remove gaps introduced to enforce alignment + rmm::device_uvector bitmap( + packed_bool_size(size_dataframe_buffer(*rx_values)), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + rmm::device_uvector d_displs((*rx_value_displs).size(), handle.get_stream()); + rmm::device_uvector d_sizes((*rx_value_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_value_displs).data(), + (*rx_value_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_value_sizes).data(), + (*rx_value_sizes).size(), + handle.get_stream()); + 
thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + value_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = value_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + resize_dataframe_buffer( + *rx_values, + thrust::distance( + get_dataframe_buffer_begin(*rx_values), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(*rx_values), + get_dataframe_buffer_end(*rx_values), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + std::exclusive_scan((*rx_value_sizes).begin(), + (*rx_value_sizes).end(), + (*rx_value_displs).begin(), + size_t{0}); // now gaps are removed + + if (rx_offsets) { + size_t num_offsets = ((*rx_offsets).index() == 0) + ? size_dataframe_buffer(std::get<0>(*rx_offsets)) + : size_dataframe_buffer(std::get<1>(*rx_offsets)); + bitmap.resize(packed_bool_size(num_offsets), handle.get_stream()); + thrust::fill( + handle.get_thrust_policy(), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); + d_displs.resize((*rx_offset_displs).size(), handle.get_stream()); + d_sizes.resize((*rx_offset_sizes).size(), handle.get_stream()); + raft::update_device(d_displs.data(), + (*rx_offset_displs).data(), + (*rx_offset_displs).size(), + handle.get_stream()); + raft::update_device(d_sizes.data(), + (*rx_offset_sizes).data(), + (*rx_offset_sizes).size(), + handle.get_stream()); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(static_cast(minor_comm_size - 1) * + offset_alignment), + [bitmap = raft::device_span(bitmap.data(), bitmap.size()), + displs = raft::device_span(d_displs.data(), d_displs.size()), + sizes = raft::device_span(d_sizes.data(), d_sizes.size()), + alignment = offset_alignment] __device__(size_t i) { + auto rank = static_cast(i / alignment); + auto first = displs[rank] + sizes[rank]; + auto last = displs[rank + 1]; + if ((i % alignment) < (last - first)) { + auto offset = first + (i % alignment); + cuda::atomic_ref word( + bitmap[packed_bool_offset(offset)]); + word.fetch_or(packed_bool_mask(offset), cuda::std::memory_order_relaxed); + } + }); + if ((*rx_offsets).index() == 0) { + resize_dataframe_buffer( + std::get<0>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<0>(*rx_offsets)), + get_dataframe_buffer_end(std::get<0>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] 
__device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } else { + resize_dataframe_buffer( + std::get<1>(*rx_offsets), + thrust::distance( + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + thrust::remove_if(handle.get_thrust_policy(), + get_dataframe_buffer_begin(std::get<1>(*rx_offsets)), + get_dataframe_buffer_end(std::get<1>(*rx_offsets)), + thrust::make_transform_iterator( + thrust::make_counting_iterator(size_t{0}), + cuda::proclaim_return_type( + [bitmap = raft::device_span( + bitmap.data(), bitmap.size())] __device__(size_t i) { + return (bitmap[packed_bool_offset(i)] & + packed_bool_mask(i)) == packed_bool_mask(i); + })), + thrust::identity{})), + handle.get_stream()); + // skip shrink_to_fit() to cut execution time + } + std::exclusive_scan((*rx_offset_sizes).begin(), + (*rx_offset_sizes).end(), + (*rx_offset_displs).begin(), + size_t{0}); // now gaps are removed + } + } + size_t output_range_size{}; if constexpr (filter_input_key) { output_range_size = local_key_list_sizes[partition_idx]; @@ -3392,11 +3930,23 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, ? *((*segment_offsets).rbegin() + 1) /* exclude the zero degree segment */ : graph_view.local_vertex_partition_range_size(); } - selected_ranks.resize(output_range_size, handle.get_stream()); - thrust::fill(handle.get_thrust_policy(), - selected_ranks.begin() + old_size, - selected_ranks.end(), - minor_comm_size); + auto& selected_ranks = std::get<0>(edge_partition_selected_ranks_or_flags[j]); + if (selected_ranks.index() == 0) { + auto old_size = std::get<0>(selected_ranks).size(); + std::get<0>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin() + old_size, + std::get<0>(selected_ranks).end(), + static_cast(minor_comm_size)); + } else { + assert(selected_ranks.index() == 1); + auto old_size = std::get<1>(selected_ranks).size(); + std::get<1>(selected_ranks).resize(output_range_size, handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin() + old_size, + std::get<1>(selected_ranks).end(), + minor_comm_size); + } if (rx_offsets) { rmm::device_uvector lasts((*rx_offset_displs).size(), handle.get_stream()); raft::update_device(lasts.data(), @@ -3405,54 +3955,102 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, handle.get_stream()); auto num_elements = (*rx_offset_displs).back() + (*rx_offset_sizes).back(); lasts.set_element_async(lasts.size() - 1, num_elements, handle.get_stream()); + handle.sync_stream(); // this is necessary before num_elements becomes out-of-scope if ((*rx_offsets).index() == 0) { auto& offsets = std::get<0>(*rx_offsets); - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(offsets.size()), - [offsets = raft::device_span(offsets.data(), offsets.size()), - lasts = raft::device_span(lasts.data(), lasts.size()), - selected_ranks = raft::device_span( - selected_ranks.data(), selected_ranks.size())] __device__(auto i) { - auto minor_comm_rank = static_cast(thrust::distance( - lasts.begin(), - thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); - selected_ranks[offsets[i]] = minor_comm_rank; - }); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + 
thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } offsets.resize(0, handle.get_stream()); offsets.shrink_to_fit(handle.get_stream()); } else { assert((*rx_offsets).index() == 1); auto& offsets = std::get<1>(*rx_offsets); - thrust::for_each( - handle.get_thrust_policy(), - thrust::make_counting_iterator(size_t{0}), - thrust::make_counting_iterator(offsets.size()), - [offsets = raft::device_span(offsets.data(), offsets.size()), - lasts = raft::device_span(lasts.data(), lasts.size()), - selected_ranks = raft::device_span( - selected_ranks.data(), selected_ranks.size())] __device__(auto i) { - auto minor_comm_rank = static_cast(thrust::distance( - lasts.begin(), - thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); - selected_ranks[offsets[i]] = minor_comm_rank; - }); + if (selected_ranks.index() == 0) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<0>(selected_ranks).data(), + std::get<0>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } else { + assert(selected_ranks.index() == 1); + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(offsets.size()), + [offsets = raft::device_span(offsets.data(), offsets.size()), + lasts = raft::device_span(lasts.data(), lasts.size()), + selected_ranks = raft::device_span( + std::get<1>(selected_ranks).data(), + std::get<1>(selected_ranks).size())] __device__(auto i) { + auto minor_comm_rank = static_cast(thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), i))); + selected_ranks[offsets[i]] = minor_comm_rank; + }); + } offsets.resize(0, handle.get_stream()); offsets.shrink_to_fit(handle.get_stream()); } } - // FIXME: we may use 8 bit ranks to further cut sort time - if (selected_ranks.size() <= std::numeric_limits::max()) { - rmm::device_uvector rx_positions(selected_ranks.size(), handle.get_stream()); + size_t num_positions = 
(selected_ranks.index() == 0) ? std::get<0>(selected_ranks).size() + : std::get<1>(selected_ranks).size(); + if (num_positions <= static_cast(std::numeric_limits::max())) { + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); thrust::sequence( handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), uint32_t{0}); - thrust::stable_sort_by_key(handle.get_thrust_policy(), - selected_ranks.begin(), - selected_ranks.end(), - rx_positions.begin()); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), handle.get_stream()); @@ -3462,13 +4060,21 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, rx_positions.begin(), tmp_vertex_value_output_first); } else { - rmm::device_uvector rx_positions(selected_ranks.size(), handle.get_stream()); + rmm::device_uvector rx_positions(num_positions, handle.get_stream()); thrust::sequence( handle.get_thrust_policy(), rx_positions.begin(), rx_positions.end(), size_t{0}); - thrust::stable_sort_by_key(handle.get_thrust_policy(), - selected_ranks.begin(), - selected_ranks.end(), - rx_positions.begin()); + if (selected_ranks.index() == 0) { + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<0>(selected_ranks).begin(), + std::get<0>(selected_ranks).end(), + rx_positions.begin()); + } else { + assert(selected_ranks.index() == 1); + thrust::stable_sort_by_key(handle.get_thrust_policy(), + std::get<1>(selected_ranks).begin(), + std::get<1>(selected_ranks).end(), + rx_positions.begin()); + } // selected_ranks[] == minor_comm_size if no GPU in minor_comm has a non-init value rx_positions.resize((*rx_value_displs).back() + (*rx_value_sizes).back(), handle.get_stream()); diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 80e16c1ffaf..403c8813453 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -677,6 +677,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, min_element_size = std::min(min_thrust_tuple_element_sizes(), min_element_size); } } + assert((cache_line_size % min_element_size) == 0); auto alignment = cache_line_size / min_element_size; std::optional, key_t>> invalid_key{std::nullopt}; From f1fb13cabdaf5a469ea487ceb68a3ca76f1d8013 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Oct 2024 13:03:31 -0700 Subject: [PATCH 103/126] added temporary code to configure NCCL to SHARP accelerate minor_comm --- cpp/include/cugraph/partition_manager.hpp | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp index e3bb699f00d..13ab2980737 100644 --- a/cpp/include/cugraph/partition_manager.hpp +++ b/cpp/include/cugraph/partition_manager.hpp @@ -22,6 +22,8 @@ #include #include +#include // FIXME: temporarily added for setenv + #include namespace cugraph { @@ -161,10 +163,29 @@ class 
partition_manager { int row_idx = rank / gpu_row_comm_size; int col_idx = rank % gpu_row_comm_size; +#if 1 // FIXME: a trick to use InfiniBand SHARP in a sub-communicator (currently, a GPU can + // participate in only one SHARP accelerated communicator) + comm.barrier(); // to enforce initialization in comm + handle.set_subcomm("gpu_row_comm", + std::make_shared(comm.comm_split(row_idx, col_idx))); + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + major_comm.barrier(); /// to enforce initialization in major_comm + auto ret = setenv("NCCL_COLLNET_ENABLE", "1", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_COLLNET_ENABLE\", \"1\", 1) returned " << ret << std::endl; + ret = setenv("NCCL_SHARP_DISABLE", "0", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_SHARP_DISABLE\", \"0\", 1) returned " << ret << std::endl; + handle.set_subcomm("gpu_col_comm", + std::make_shared(comm.comm_split(col_idx, row_idx))); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + minor_comm.barrier(); /// to enforce initialization in minor_comm +#else handle.set_subcomm("gpu_row_comm", std::make_shared(comm.comm_split(row_idx, col_idx))); handle.set_subcomm("gpu_col_comm", std::make_shared(comm.comm_split(col_idx, row_idx))); +#endif }; }; From f6d2b251a66a6f53fcfdc04db270ea3432432227 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 18 Oct 2024 13:05:31 -0700 Subject: [PATCH 104/126] reduce printouts --- .../detail/extract_transform_v_frontier_e.cuh | 3 +- cpp/src/prims/fill_edge_src_dst_property.cuh | 4 +- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 6 ++ .../create_graph_from_edgelist_impl.cuh | 37 ++-------- cpp/src/structure/renumber_edgelist_impl.cuh | 74 ++++--------------- cpp/src/traversal/bfs_impl.cuh | 26 +++---- cpp/src/utilities/shuffle_vertex_pairs.cuh | 2 +- 7 files changed, 44 insertions(+), 108 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index e451c18f4be..a8686b9062c 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -750,6 +750,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, // update frontier bitmap (used to reduce broadcast bandwidth size) + // FIXME: v_compressible... 
std:: conditional_t>, std::byte /* dummy */> frontier_bitmap{}; @@ -983,7 +984,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto edge_partition_frontier_major_last = thrust_tuple_get_or_identity( edge_partition_frontier_key_last); - // FIXME: compute_number_of_edges() implicitly synchronizes to copy the results to host + // FIXME: compute_number_of_edges() implicitly synchronizes to copy the results to host (use cub reduce) // FIXME: check whether skipping a call for 0 key_buffer size helps or not edge_partition_max_pushes = edge_partition.compute_number_of_edges( edge_partition_frontier_major_first, edge_partition_frontier_major_last, loop_stream); diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 167147afcce..10c11acb346 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -425,7 +425,6 @@ void fill_edge_minor_property(raft::handle_t const& handle, auto avg_v_list_size = std::reduce(local_v_list_sizes.begin(), local_v_list_sizes.end()) / static_cast(major_comm_size); - // FIXME: should I better set minimum v_list_size??? if ((avg_fill_ratio > threshold_ratio) && (static_cast(avg_v_list_size) > packed_bool_word_bcast_alignment)) { if (is_packed_bool, weight_t>> @@ -1071,10 +1067,6 @@ create_graph_from_edgelist_impl( bool renumber, bool do_expensive_check) { -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "create_graph_from_edgelist_impl 0" << std::endl; -#endif auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_size = major_comm.get_size(); auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -1220,7 +1212,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "create_graph_from_edgelist_impl 1 compressed_v_size=" << compressed_v_size + std::cerr << "create_graph_from_edgelist_impl 0 compressed_v_size=" << compressed_v_size << std::endl; #endif @@ -1266,7 +1258,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "create_graph_from_edgelist_impl 2" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 1" << std::endl; #endif // 3. compress edge chunk source/destination vertices to cut intermediate peak memory requirement @@ -1317,10 +1309,6 @@ create_graph_from_edgelist_impl( (*edgelist_compressed_dsts).push_back(std::move(tmp_dsts)); } } -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "create_graph_from_edgelist_impl 3" << std::endl; -#endif // 4. compute additional copy_offset vectors // FIXME: we can store chunk data in multiple rmm::device_uvector objects to free memory earlier @@ -1358,7 +1346,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "create_graph_from_edgelist_impl 4" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 2" << std::endl; #endif // 5. 
split the grouped edge chunks to local partitions @@ -1387,10 +1375,6 @@ create_graph_from_edgelist_impl( edge_partition_intra_partition_segment_offset_vectors, edge_partition_intra_segment_copy_output_displacement_vectors, compressed_v_size); -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "create_graph_from_edgelist_impl 4-1" << std::endl; -#endif edge_partition_edgelist_compressed_dsts = split_edge_chunk_compressed_elements_to_local_edge_partitions( @@ -1421,11 +1405,6 @@ create_graph_from_edgelist_impl( edge_partition_intra_segment_copy_output_displacement_vectors); } -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "create_graph_from_edgelist_impl 5" << std::endl; -#endif - if (edgelist_weights) { edge_partition_edgelist_weights = split_edge_chunk_elements_to_local_edge_partitions( @@ -1458,7 +1437,7 @@ create_graph_from_edgelist_impl( } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "create_graph_from_edgelist_impl 6" << std::endl; + std::cerr << "create_graph_from_edgelist_impl 3" << std::endl; #endif // 6. decompress edge chunk source/destination vertices to cut intermediate peak memory @@ -1523,10 +1502,6 @@ create_graph_from_edgelist_impl( handle.sync_stream(); } -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "create_graph_from_edgelist_impl 7" << std::endl; -#endif return create_graph_from_partitioned_edgelist h_offsets(d_offsets.size()); raft::update_host(h_offsets.data(), d_offsets.data(), d_offsets.size(), handle.get_stream()); handle.sync_stream(); - std::cerr << "hypersparse_degree_threshold=" << hypersparse_degree_threshold << std::endl; - raft::print_host_vector("h_offsets", h_offsets.data(), h_offsets.size(), std::cerr); auto num_segments_per_vertex_partition = detail::num_sparse_segments_per_vertex_partition + @@ -726,18 +720,12 @@ compute_renumber_map(raft::handle_t const& handle, (*h_hypersparse_degree_offsets).begin(), [shift](auto offset) { return offset - shift; }); *((*h_hypersparse_degree_offsets).rbegin()) = *(h_offsets.rbegin() + 1); - raft::print_host_vector("hypersparse_degree_offsets", - (*h_hypersparse_degree_offsets).data(), - (*h_hypersparse_degree_offsets).size(), - std::cerr); } - raft::print_host_vector( - "h_segment_offsets", h_segment_offsets.data(), h_segment_offsets.size(), std::cerr); } #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "compute_renumber_map 4" << std::endl; + std::cerr << "compute_renumber_map 3" << std::endl; #endif return std::make_tuple(std::move(sorted_local_vertices), @@ -1029,10 +1017,6 @@ renumber_edgelist( // 1. compute renumber map -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "renumber_edgelist 0" << std::endl; -#endif auto [renumber_map_labels, vertex_partition_segment_offsets, vertex_partition_hypersparse_degree_offsets, @@ -1042,10 +1026,6 @@ renumber_edgelist( edgelist_const_majors, edgelist_const_minors, edgelist_edge_counts); -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "renumber_edgelist 1" << std::endl; -#endif // 2. 
initialize partition_t object, number_of_vertices, and number_of_edges @@ -1083,7 +1063,7 @@ renumber_edgelist( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "renumber_edgelist 2" << std::endl; + std::cerr << "renumber_edgelist 0" << std::endl; #endif { vertex_t max_edge_partition_major_range_size{0}; @@ -1117,27 +1097,24 @@ renumber_edgelist( } } -#if 1 - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "renumber_edgelist 3 partition.local_edge_partition_minor_range_size()=" - << partition.local_edge_partition_minor_range_size() - << " number_of_edges=" << number_of_edges << " comm_size=" << comm_size - << " edgelist_intra_partition_segment_offsets.has_value()=" - << edgelist_intra_partition_segment_offsets.has_value() << std::endl; -#endif double approx_mem_requirements = static_cast(partition.local_edge_partition_minor_range_size()) * (static_cast( sizeof(vertex_t)) /* rmm::device_uvector renumber_map_minor_labels */ + static_cast(sizeof(vertex_t) * 2) * - 1.5 /* kv_store_t renumber_map, * 1.5 to consider load factor */); + 2.5 /* kv_store_t renumber_map, * 2.5 to consider load factor */); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "renumber_edgelist 1 partition.local_edge_partition_minor_range_size()=" + << partition.local_edge_partition_minor_range_size() + << " approx_mem_requirements=" << approx_mem_requirements << " threshold=" + << (static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) + << std::endl; +#endif if ((approx_mem_requirements > static_cast(handle.get_device_properties().totalGlobalMem) * 0.05) && edgelist_intra_partition_segment_offsets) { -#if 1 - std::cerr << "path A" << std::endl; -#endif vertex_t max_segment_size{0}; for (int i = 0; i < major_comm_size; ++i) { auto minor_range_vertex_partition_id = @@ -1177,9 +1154,6 @@ renumber_edgelist( } } } else { -#if 1 - std::cerr << "path B" << std::endl; -#endif rmm::device_uvector renumber_map_minor_labels( partition.local_edge_partition_minor_range_size(), handle.get_stream()); std::vector recvcounts(major_comm_size); @@ -1191,34 +1165,12 @@ renumber_edgelist( } std::vector displacements(recvcounts.size(), 0); std::exclusive_scan(recvcounts.begin(), recvcounts.end(), displacements.begin(), size_t{0}); - { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - size_t free{}; - size_t total{}; - RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); - auto f_sz = static_cast(free) / (1024.0 * 1024.0 * 1024.0); - auto t_sz = static_cast(total) / (1024.0 * 1024.0 * 1024.0); - auto u_sz = t_sz - f_sz; - std::cerr << "BEFORE device_allgatherv free=" << f_sz << "GB used=" << u_sz - << "GB total=" << t_sz << std::endl; - } device_allgatherv(major_comm, renumber_map_labels.data(), renumber_map_minor_labels.data(), recvcounts, displacements, handle.get_stream()); - { - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - size_t free{}; - size_t total{}; - RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); - auto f_sz = static_cast(free) / (1024.0 * 1024.0 * 1024.0); - auto t_sz = static_cast(total) / (1024.0 * 1024.0 * 1024.0); - auto u_sz = t_sz - f_sz; - std::cerr << "AFTER device_allgatherv free=" << f_sz << "GB used=" << u_sz - << "GB total=" << t_sz << std::endl; - } kv_store_t renumber_map( renumber_map_minor_labels.begin(), @@ -1238,7 +1190,7 @@ renumber_edgelist( #if 1 RAFT_CUDA_TRY(cudaDeviceSynchronize()); - std::cerr << "renumber_edgelist 4" << std::endl; + std::cerr << "renumber_edgelist 2" << std::endl; #endif auto edge_partition_segment_offsets = detail::aggregate_offset_vectors(handle, 
vertex_partition_segment_offsets); diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index d4ae7b7d64d..1312323cc80 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -365,9 +365,8 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur2 = prep3 - prep2; std::chrono::duration dur3 = prep4 - prep3; std::chrono::duration dur = prep4 - prep0; - std::cerr << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() - << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." - << std::endl; + std::cerr << "prep (init,meta,vf,fill) took " << dur.count() << " (" << dur0.count() << "," + << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s." << std::endl; #endif // 4. BFS iteration @@ -436,9 +435,9 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur1 = topdown2 - topdown1; std::chrono::duration dur2 = topdown3 - topdown2; std::chrono::duration dur = topdown3 - topdown0; - std::cerr << "depth=" << depth << " topdown (prim,vf,host) took " - << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," - << dur2.count() << ") s." << std::endl; + std::cerr << "depth=" << depth << " topdown (prim,vf,host) took " << dur.count() << " (" + << dur0.count() << "," << dur1.count() << "," << dur2.count() << ") s." + << std::endl; #endif break; } @@ -566,12 +565,14 @@ void bfs(raft::handle_t const& handle, ? host_scalar_allreduce( handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) : m_u; +#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete std::cerr << "m_f=" << m_f << " m_u=" << m_u << " aggregate_m_f * direction_optimzing_alpha=" << aggregate_m_f * direction_optimizing_alpha << " aggregate_m_u=" << aggregate_m_u << " cur_aggregate_frontier_size=" << cur_aggregate_frontier_size << " next_aggregate_frontier_size=" << next_aggregate_frontier_size << std::endl; +#endif if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && (next_aggregate_frontier_size >= cur_aggregate_frontier_size)) { topdown = false; @@ -607,10 +608,10 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur = topdown6 - topdown0; std::cerr << "depth=" << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size - << " next topdown=" << topdown - << " (prim,vf,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << "," - << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() - << "," << dur5.count() << ") s." << std::endl; + << " next topdown=" << topdown << " (prim,vf,host,fill,dir,vf) took " << dur.count() + << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," + << dur3.count() << "," << dur4.count() << "," << dur5.count() << ") s." + << std::endl; #endif } else { // bottom up #if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete @@ -702,9 +703,8 @@ void bfs(raft::handle_t const& handle, std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; std::chrono::duration dur = bottomup2 - bottomup0; - std::cerr << "depth=" << depth << " bottomup (prim+,host) took " - << dur.count() << " (" << dur0.count() << "," << dur1.count() << ") s." - << std::endl; + std::cerr << "depth=" << depth << " bottomup (prim+,host) took " << dur.count() << " (" + << dur0.count() << "," << dur1.count() << ") s." 
<< std::endl; #endif break; } diff --git a/cpp/src/utilities/shuffle_vertex_pairs.cuh b/cpp/src/utilities/shuffle_vertex_pairs.cuh index 70327db5ffb..e13cc6dd9f7 100644 --- a/cpp/src/utilities/shuffle_vertex_pairs.cuh +++ b/cpp/src/utilities/shuffle_vertex_pairs.cuh @@ -61,7 +61,7 @@ shuffle_vertex_pairs_with_values_by_gpu_id_impl( (edge_ids ? sizeof(edge_t) : size_t{0}) + (edge_types ? sizeof(edge_type_t) : size_t{0}); auto constexpr mem_frugal_ratio = - 0.1; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the + 0.05; // if the expected temporary buffer size exceeds the mem_frugal_ratio of the // total_global_mem, switch to the memory frugal approach (thrust::sort is used to // group-by by default, and thrust::sort requires temporary buffer comparable to the input // data size) From ecb0c8e0d066ec2f5c3356bb934eaaeacb95e643 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 19 Oct 2024 03:30:29 -0700 Subject: [PATCH 105/126] bug fix in per_v_transform_reduce_e --- .../prims/detail/per_v_transform_reduce_e.cuh | 510 ++++++++++++------ 1 file changed, 348 insertions(+), 162 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 6309e79c42e..b7d4f888657 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1194,9 +1194,9 @@ compute_selected_ranks_from_priorities( template edge_partition, OptionalKeyIterator edge_partition_key_first, OptionalKeyIterator edge_partition_key_last, - EdgeSrcValueInputWrapper edge_partition_src_value_input, - EdgeDstValueInputWrapper edge_partition_dst_value_input, - EdgeValueInputWrapper edge_partition_e_value_input, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, thrust::optional edge_partition_e_mask, ResultValueOutputIteratorOrWrapper output_buffer, EdgeOp e_op, @@ -2002,6 +2002,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { + auto max_tmp_buffer_size = static_cast( + static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); size_t tmp_buffer_size_per_loop{0}; if constexpr (update_major) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); @@ -2010,7 +2012,11 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, size_t key_size{0}; if constexpr (use_input_key) { if constexpr (std::is_arithmetic_v) { - key_size = sizeof(key_t); + if (v_compressible) { + key_size = sizeof(uint32_t); + } else { + key_size = sizeof(key_t); + } } else { key_size = sum_thrust_tuple_element_sizes(); } @@ -2021,17 +2027,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } else { value_size = sum_thrust_tuple_element_sizes(); } - size_t approx_tmp_size{0}; - if constexpr (filter_input_key) { - // use tmeporary buffers to store non-zero local degree key offsets in the hypersparse - // regioon, priorities, selected ranks (or) flags (non-root), and selected values (and - // key offsets for the selected values that are in the hypersparse region and have the - // global degree of 1) - approx_tmp_size = static_cast( - static_cast(sizeof(size_t)) * 0.25 + - static_cast(value_size) / - 
static_cast(minor_comm_size) /* only one value will be selected */); - } size_t aggregate_major_range_size{}; if constexpr (use_input_key) { @@ -2046,18 +2041,27 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } } + size_t size_per_key{}; + if constexpr (filter_input_key) { + size_per_key = + key_size + + value_size / 2; // to reflect that many keys will be filtered out, note that this is a + // simple approximation, memory requirement in this case is much more + // complex as we store additional temporary variables + + } else { + size_per_key = key_size + value_size; + } tmp_buffer_size_per_loop = (aggregate_major_range_size / graph_view.number_of_local_edge_partitions()) * - (key_size + value_size + approx_tmp_size); + size_per_key; } - stream_pool_indices = init_stream_pool_indices( - static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * - 0.2), - tmp_buffer_size_per_loop, - graph_view.number_of_local_edge_partitions(), - max_segments, - handle.get_stream_pool_size()); + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, + tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + max_segments, + handle.get_stream_pool_size()); if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } } } @@ -2065,9 +2069,13 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, // 8. set-up temporary buffers size_t num_concurrent_loops{1}; + std::optional> loop_stream_pool_indices{ + std::nullopt}; // first num_concurrent_loops streams from stream_pool_indices if (stream_pool_indices) { assert(((*stream_pool_indices).size() % max_segments) == 0); - num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + loop_stream_pool_indices = std::vector(num_concurrent_loops); + std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); } using minor_tmp_buffer_type = std::conditional_t, + std::conditional_t>>, std::byte /* dummy */> edge_partition_bitmap_buffers{std::nullopt}; @@ -2244,7 +2252,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } device_group_end(minor_comm); - if (stream_pool_indices) { handle.sync_stream(); } + if (loop_stream_pool_indices) { handle.sync_stream(); } #if PER_V_PERFORMANCE_MEASUREMENT subtime2 = std::chrono::steady_clock::now(); #endif @@ -2256,9 +2264,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); + auto loop_stream = + loop_stream_pool_indices + ? 
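// ---------------------------------------------------------------------------------------
// [editor's note] Illustrative sketch only -- not part of this patch. The stream-pool setup
// in this hunk estimates the temporary bytes consumed per edge-partition loop and uses that
// estimate (against a budget of roughly 20% of totalGlobalMem) to bound how many loops run
// concurrently. The helper below mirrors the per-loop estimate added by this commit; the
// function name is hypothetical.
// ---------------------------------------------------------------------------------------
#include <cstddef>

inline std::size_t approx_tmp_buffer_size_per_loop(std::size_t aggregate_major_range_size,
                                                   std::size_t num_edge_partitions,
                                                   std::size_t key_size,
                                                   std::size_t value_size,
                                                   bool filter_input_key)
{
  // when keys are filtered against the frontier, only part of the value payload survives;
  // the patch approximates this as half (a deliberately simple estimate, per its comment)
  std::size_t size_per_key =
    filter_input_key ? (key_size + value_size / 2) : (key_size + value_size);
  return (aggregate_major_range_size / num_edge_partitions) * size_per_key;
}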
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); std::variant, rmm::device_uvector> keys = rmm::device_uvector(0, loop_stream); @@ -2337,7 +2346,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if constexpr (try_bitmap) { if (edge_partition_bitmap_buffers) { allocate_new_key_buffer = false; } } - if (allocate_new_key_buffer) { // allocate new key buffers and copy the sparse segment + if (allocate_new_key_buffer) { // allocate new key buffers and copy the sparse segment // keys to the new key buffers if constexpr (try_bitmap) { edge_partition_new_key_buffers = std::vector< @@ -2349,9 +2358,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; @@ -2410,9 +2420,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; @@ -2460,7 +2471,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, local_v_list_range_lasts[partition_idx] - local_v_list_range_firsts[partition_idx]); if (range_offset_first < range_offset_last) { - auto count_first = thrust::make_transform_iterator( + auto input_count_first = thrust::make_transform_iterator( thrust::make_counting_iterator(packed_bool_offset(range_offset_first)), cuda::proclaim_return_type( [range_bitmap = @@ -2474,113 +2485,284 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } return static_cast(__popc(word)); })); - rmm::device_uvector count_displacements( + rmm::device_uvector input_count_offsets( + (rx_bitmap.size() - packed_bool_offset(range_offset_first)) + 1, loop_stream); + input_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan( + rmm::exec_policy_nosync(loop_stream), + input_count_first, + input_count_first + + (rx_bitmap.size() - packed_bool_offset(range_offset_first)), + input_count_offsets.begin() + 1); + rmm::device_uvector filtered_bitmap( rx_bitmap.size() - packed_bool_offset(range_offset_first), loop_stream); - thrust::exclusive_scan(rmm::exec_policy_nosync(loop_stream), - count_first, - count_first + count_displacements.size(), - count_displacements.begin()); - auto offset_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(range_offset_first), - cuda::proclaim_return_type( - [range_bitmap = - raft::device_span(rx_bitmap.data(), rx_bitmap.size()), - count_displacements = raft::device_span( - count_displacements.data(), count_displacements.size()), - range_offset_first, - start_offset = key_segment_offsets[3]] __device__(auto range_offset) { - auto word = range_bitmap[packed_bool_offset(range_offset)]; - if (packed_bool_offset(range_offset) == - packed_bool_offset(range_offset_first)) { - word &= ~packed_bool_partial_mask( - range_offset_first % - 
packed_bools_per_word()); // clear the bits in the sparse region - } - return static_cast( - start_offset + - count_displacements[packed_bool_offset(range_offset) - - packed_bool_offset(range_offset_first)] + - __popc(word & packed_bool_partial_mask(range_offset % - packed_bools_per_word()))); - })); - auto flag_first = thrust::make_transform_iterator( - thrust::make_counting_iterator(range_offset_first), - cuda::proclaim_return_type( + thrust::tabulate( + rmm::exec_policy_nosync(loop_stream), + filtered_bitmap.begin(), + filtered_bitmap.end(), + cuda::proclaim_return_type( [range_bitmap = raft::device_span(rx_bitmap.data(), rx_bitmap.size()), segment_bitmap = raft::device_span(segment_bitmap.data(), segment_bitmap.size()), range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + range_offset_last, major_hypersparse_first = - *(edge_partition - .major_hypersparse_first())] __device__(auto range_offset) { - auto segment_offset = - (range_first + range_offset) - major_hypersparse_first; - return ((range_bitmap[packed_bool_offset(range_offset)] & - packed_bool_mask(range_offset)) != packed_bool_empty_mask()) && - ((segment_bitmap[packed_bool_offset(segment_offset)] & - packed_bool_mask(segment_offset)) != packed_bool_empty_mask()); + *(edge_partition.major_hypersparse_first())] __device__(size_t i) { + auto this_word_range_offset_first = cuda::std::max( + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()), + range_offset_first); + auto this_word_range_offset_last = + cuda::std::min(static_cast( + (packed_bool_offset(range_offset_first) + (i + 1)) * + packed_bools_per_word()), + range_offset_last); + auto range_lead_bits = static_cast(this_word_range_offset_first % + packed_bools_per_word()); + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask(range_offset_first % + packed_bools_per_word()); + } + auto this_word_hypersparse_offset_first = + (range_first + this_word_range_offset_first) - major_hypersparse_first; + auto num_bits = static_cast(this_word_range_offset_last - + this_word_range_offset_first); + auto hypersparse_lead_bits = + static_cast(this_word_hypersparse_offset_first) % + packed_bools_per_word(); + auto segment_bitmap_word = ((segment_bitmap[packed_bool_offset( + this_word_hypersparse_offset_first)] >> + hypersparse_lead_bits)) + << range_lead_bits; + auto remaining_bits = + (num_bits > (packed_bools_per_word() - hypersparse_lead_bits)) + ? 
(num_bits - (packed_bools_per_word() - hypersparse_lead_bits)) + : size_t{0}; + if (remaining_bits > 0) { + segment_bitmap_word |= + ((segment_bitmap + [packed_bool_offset(this_word_hypersparse_offset_first) + 1] & + packed_bool_partial_mask(remaining_bits)) + << ((packed_bools_per_word() - hypersparse_lead_bits) + + range_lead_bits)); + } + return range_bitmap_word & segment_bitmap_word; })); + auto output_count_first = thrust::make_transform_iterator( + filtered_bitmap.begin(), + cuda::proclaim_return_type([] __device__(uint32_t word) { + return static_cast(__popc(word)); + })); + rmm::device_uvector output_count_offsets(filtered_bitmap.size() + 1, + loop_stream); + output_count_offsets.set_element_to_zero_async(0, loop_stream); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + output_count_first, + output_count_first + filtered_bitmap.size(), + output_count_offsets.begin() + 1); if (keys.index() == 0) { if (offsets.index() == 0) { - auto input_pair_first = thrust::make_zip_iterator( - thrust::make_counting_iterator(range_offset_first), - thrust::make_transform_iterator(offset_first, - typecast_t{})); - detail::copy_if_nosync( - input_pair_first, - input_pair_first + (range_offset_last - range_offset_first), - flag_first, - thrust::make_zip_iterator( - get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], - std::get<0>(offsets).begin()), - raft::device_span(counters.data() + j, size_t{1}), - loop_stream); + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); } else { - auto input_pair_first = thrust::make_zip_iterator( - thrust::make_counting_iterator(range_offset_first), offset_first); - detail::copy_if_nosync( - input_pair_first, - input_pair_first + (range_offset_last - range_offset_first), - flag_first, - thrust::make_zip_iterator( - get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], - std::get<1>(offsets).begin()), - raft::device_span(counters.data() + j, size_t{1}), - loop_stream); + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<1>(offsets).begin(), + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v_offset = + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v_offset + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); } } else { if (offsets.index() == 0) { - auto input_pair_first = thrust::make_zip_iterator( - thrust::make_counting_iterator(local_v_list_range_firsts[partition_idx] + - range_offset_first), - thrust::make_transform_iterator(offset_first, - typecast_t{})); - detail::copy_if_nosync( - input_pair_first, - input_pair_first + (range_offset_last - range_offset_first), - flag_first, - thrust::make_zip_iterator( - get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], - std::get<0>(offsets).begin()), - raft::device_span(counters.data() + j, size_t{1}), - loop_stream); + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<0>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); } else { - auto input_pair_first = thrust::make_zip_iterator( - thrust::make_counting_iterator(local_v_list_range_firsts[partition_idx] + - range_offset_first), - offset_first); - detail::copy_if_nosync( - input_pair_first, - input_pair_first + (range_offset_last - range_offset_first), - flag_first, - thrust::make_zip_iterator( - get_dataframe_buffer_begin(std::get<1>(keys)) + key_segment_offsets[3], - std::get<1>(offsets).begin()), - raft::device_span(counters.data() + j, size_t{1}), - loop_stream); + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(size_t{0}), + thrust::make_counting_iterator(filtered_bitmap.size()), + [range_bitmap = + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + filtered_bitmap = raft::device_span( + filtered_bitmap.data(), filtered_bitmap.size()), + input_count_offsets = raft::device_span( + input_count_offsets.data(), input_count_offsets.size()), + output_count_offsets = raft::device_span( + output_count_offsets.data(), output_count_offsets.size()), + output_key_first = + get_dataframe_buffer_begin(std::get<0>(keys)) + key_segment_offsets[3], + output_offset_first = std::get<1>(offsets).begin(), + range_first = local_v_list_range_firsts[partition_idx], + range_offset_first, + start_key_offset = key_segment_offsets[3]] __device__(size_t i) { + auto range_bitmap_word = + range_bitmap[packed_bool_offset(range_offset_first) + i]; + if (i == 0) { // clear the bits in the sparse region + range_bitmap_word &= ~packed_bool_partial_mask( + range_offset_first % packed_bools_per_word()); + } + auto filtered_bitmap_word = filtered_bitmap[i]; + auto lead_bits = (i == 0) + ? 
static_cast(range_offset_first % + packed_bools_per_word()) + : static_cast(0); + auto this_word_start_v = + range_first + + static_cast((packed_bool_offset(range_offset_first) + i) * + packed_bools_per_word()); + auto this_word_start_key_offset = + static_cast(start_key_offset + input_count_offsets[i]); + auto this_word_output_start_offset = output_count_offsets[i]; + for (int j = 0; j < __popc(filtered_bitmap_word); ++j) { + auto jth_set_bit_pos = static_cast( + __fns(filtered_bitmap_word, lead_bits, j + 1)); + *(output_key_first + (this_word_output_start_offset + j)) = + this_word_start_v + jth_set_bit_pos; + *(output_offset_first + (this_word_output_start_offset + j)) = + this_word_start_key_offset + + static_cast(__popc( + range_bitmap_word & packed_bool_partial_mask(jth_set_bit_pos))); + } + }); } } + thrust::transform( + rmm::exec_policy_nosync(loop_stream), + output_count_offsets.begin() + (output_count_offsets.size() - 1), + output_count_offsets.end(), + counters.data() + j, + typecast_t{}); } else { thrust::fill(rmm::exec_policy_nosync(loop_stream), counters.data() + j, @@ -2730,11 +2912,13 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); - if (edge_partition_new_key_buffers) { + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + if (edge_partition_new_key_buffers) { + for (size_t j = 0; j < loop_count; ++j) { edge_partition_key_buffers[j] = std::move((*edge_partition_new_key_buffers)[j]); } } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } if (edge_partition_bitmap_buffers) { (*edge_partition_bitmap_buffers).clear(); } std::vector h_counts(loop_count); @@ -2743,9 +2927,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); if (process_local_edges[j]) { auto& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; @@ -2775,7 +2960,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } { // update edge_partition_deg1_hypersparse_key_offset_counts - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } std::vector h_ptrs( loop_count); // pointers to hypersparse key offset vectors @@ -2849,7 +3034,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } #if PER_V_PERFORMANCE_MEASUREMENT - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } auto subtime3 = std::chrono::steady_clock::now(); #endif @@ -2863,8 +3048,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + auto loop_stream = loop_stream_pool_indices + ? 
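// ---------------------------------------------------------------------------------------
// [editor's note] Illustrative sketch only -- not part of this patch. The for_each branches
// above all follow the same pattern: per-word popcounts of a packed-bool bitmap are
// prefix-summed into word offsets, and each word then writes one output entry per set bit.
// A minimal standalone version of that pattern (without the hypersparse-segment masking the
// patch performs) is shown below; 'expand_bitmap' is a hypothetical name.
// ---------------------------------------------------------------------------------------
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/for_each.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/scan.h>
#include <cuda/functional>
#include <cstddef>
#include <cstdint>

// writes, for every set bit in 'bitmap', its global bit position into 'out';
// 'out' must already be sized to the total number of set bits
void expand_bitmap(thrust::device_vector<uint32_t> const& bitmap,
                   thrust::device_vector<uint32_t>& out)
{
  // per-word output start positions (leading 0 followed by an inclusive scan of popcounts)
  thrust::device_vector<uint32_t> word_offsets(bitmap.size() + 1, 0);
  auto popc_first = thrust::make_transform_iterator(
    bitmap.begin(), cuda::proclaim_return_type<uint32_t>([] __device__(uint32_t word) {
      return static_cast<uint32_t>(__popc(word));
    }));
  thrust::inclusive_scan(
    thrust::device, popc_first, popc_first + bitmap.size(), word_offsets.begin() + 1);

  // one work item per word; iterate its set bits from least to most significant
  thrust::for_each(thrust::device,
                   thrust::make_counting_iterator(std::size_t{0}),
                   thrust::make_counting_iterator(bitmap.size()),
                   [words        = bitmap.data().get(),
                    word_offsets = word_offsets.data().get(),
                    out          = out.data().get()] __device__(std::size_t i) {
                     uint32_t word = words[i];
                     uint32_t pos  = word_offsets[i];
                     while (word != 0) {
                       int bit    = __ffs(word) - 1;  // 0-based index of lowest set bit
                       out[pos++] = static_cast<uint32_t>(i * 32 + bit);
                       word &= (word - 1);  // clear that bit
                     }
                   });
}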
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) : handle.get_stream(); if constexpr (GraphViewType::is_multi_gpu && update_major) { @@ -2898,7 +3083,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, allocate_dataframe_buffer(buffer_size, loop_stream)); } } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT auto subtime4 = std::chrono::steady_clock::now(); #endif @@ -3150,12 +3335,12 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_allreduce_displacements.back() + edge_partition_allreduce_sizes.back(), handle.get_stream()); } - if (stream_pool_indices) { handle.sync_stream(); } + if (loop_stream_pool_indices) { handle.sync_stream(); } for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) : handle.get_stream(); std::optional< @@ -3217,7 +3402,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, loop_stream); } } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } #if PER_V_PERFORMANCE_MEASUREMENT auto subtime6 = std::chrono::steady_clock::now(); #endif @@ -3237,7 +3422,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, raft::comms::op_t::MIN, handle.get_stream()); } - if (stream_pool_indices) { handle.sync_stream(); } + if (loop_stream_pool_indices) { handle.sync_stream(); } #if PER_V_PERFORMANCE_MEASUREMENT auto subtime7 = std::chrono::steady_clock::now(); #endif @@ -3249,8 +3434,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_selected_ranks_or_flags.reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) : handle.get_stream(); auto const& output_buffer = edge_partition_major_output_buffers[j]; @@ -3314,7 +3499,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } edge_partition_selected_ranks_or_flags.push_back(std::move(selected_ranks_or_flags)); } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } if (minor_comm_size <= std::numeric_limits::max()) { // priority == uint8_t std::get<0>(aggregate_priorities).resize(0, handle.get_stream()); std::get<0>(aggregate_priorities).shrink_to_fit(handle.get_stream()); @@ -3322,7 +3507,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::get<1>(aggregate_priorities).resize(0, handle.get_stream()); std::get<1>(aggregate_priorities).shrink_to_fit(handle.get_stream()); } - if (stream_pool_indices) { handle.sync_stream(); } + if (loop_stream_pool_indices) { handle.sync_stream(); } #if PER_V_PERFORMANCE_MEASUREMENT auto subtime8 = std::chrono::steady_clock::now(); #endif @@ -3332,8 +3517,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) : handle.get_stream(); auto& output_buffer = edge_partition_major_output_buffers[j]; @@ -3407,7 +3592,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, edge_partition_values.push_back(std::move(values)); } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } std::vector copy_sizes(loop_count); raft::update_host(copy_sizes.data(), counters.data(), loop_count, handle.get_stream()); @@ -3423,9 +3608,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, (*edge_partition_deg1_hypersparse_output_offset_vectors).reserve(loop_count); for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); auto& output_buffer = edge_partition_major_output_buffers[j]; std::variant, rmm::device_uvector> @@ -3538,7 +3724,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, resize_dataframe_buffer(output_buffer, 0, loop_stream); shrink_to_fit_dataframe_buffer(output_buffer, loop_stream); } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } std::vector deg1_copy_sizes(loop_count); raft::update_host( @@ -3567,7 +3753,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } #if PER_V_PERFORMANCE_MEASUREMENT - if (stream_pool_indices) { handle.sync_stream(); } + if (loop_stream_pool_indices) { handle.sync_stream(); } auto subtime9 = std::chrono::steady_clock::now(); #endif @@ -3665,7 +3851,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if (k == (minor_comm_size - 1)) { aligned_sizes[k] = (*rx_offset_sizes)[k]; } else { - aligned_sizes[k] = raft::round_up_safe((*rx_offset_sizes)[k], value_alignment); + aligned_sizes[k] = raft::round_up_safe((*rx_offset_sizes)[k], offset_alignment); } } std::exclusive_scan( @@ -3770,8 +3956,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } handle.sync_stream(); // this is required before edge_partition_values.clear(); edge_partition_values.clear(); - if (stream_pool_indices) { - handle.sync_stream_pool(*stream_pool_indices); + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); } // to ensure that memory is freed #if PER_V_PERFORMANCE_MEASUREMENT auto subtime11 = std::chrono::steady_clock::now(); @@ -4125,7 +4311,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, handle.get_stream()); } device_group_end(minor_comm); - if (stream_pool_indices) { handle.sync_stream(); } + if (loop_stream_pool_indices) { handle.sync_stream(); } } } } From b419afbbb7f52f066833eb3bf360d1fa2bfa7ac2 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sat, 19 Oct 2024 22:14:49 -0700 Subject: [PATCH 106/126] update detail::extract_transform_v_frontier_e --- .../cugraph/edge_partition_device_view.cuh | 59 +- .../detail/extract_transform_v_frontier_e.cuh | 834 ++++++++++++------ 2 files changed, 627 insertions(+), 266 deletions(-) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index ff6d13fd523..21ed83a87f9 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -251,6 +251,62 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + if (dcs_nzd_vertices_) { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, *dcs_nzd_vertices_, *major_hypersparse_first_}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, 
major_last), + stream); + } else { + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{ + this->offsets_, major_range_first_, std::byte{0} /* dummy */, std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); @@ -516,7 +572,8 @@ class edge_partition_device_view_t> for consistency (see dcs_nzd_range_bitmap()) + // FIxME: better return thrust::optional> for consistency (see + // dcs_nzd_range_bitmap()) __host__ __device__ thrust::optional dcs_nzd_vertices() const { return dcs_nzd_vertices_ ? thrust::optional{(*dcs_nzd_vertices_).data()} diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index a8686b9062c..a56d7df53b1 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -475,6 +475,153 @@ __global__ static void extract_transform_v_frontier_e_high_degree( } } +template +void extract_transform_v_frontier_e_edge_partition( + raft::handle_t const& handle, + edge_partition_device_view_t edge_partition, + InputKeyIterator edge_partition_frontier_key_first, + InputKeyIterator edge_partition_frontier_key_last, + EdgePartitionSrcValueInputWrapper edge_partition_src_value_input, + EdgePartitionDstValueInputWrapper edge_partition_dst_value_input, + EdgePartitionValueInputWrapper edge_partition_e_value_input, + thrust::optional edge_partition_e_mask, + OptionalOutputKeyIterator output_key_first, + OptionalOutputValueIterator output_value_first, + raft::device_span count /* size = 1 */, + EdgeOp e_op, + std::optional> high_segment_key_local_degree_offsets, + std::optional high_segment_edge_count, + std::optional> key_segment_offsets, + std::optional> const& edge_partition_stream_pool_indices) +{ + if (key_segment_offsets) { + if (((*key_segment_offsets)[1] > 0) && ((*high_segment_edge_count) > 0)) { + auto exec_stream = + edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) + : handle.get_stream(); + + raft::grid_1d_thread_t update_grid((*high_segment_edge_count), + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_high_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + raft::device_span((*high_segment_key_local_degree_offsets).data(), + (*high_segment_key_local_degree_offsets).size()), + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { + auto exec_stream = + edge_partition_stream_pool_indices + ? 
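// ---------------------------------------------------------------------------------------
// [editor's note] Illustrative sketch only -- not part of this patch. The new
// compute_number_of_edges_async() above uses the usual two-phase CUB idiom: call
// cub::DeviceReduce::Sum once with a null temporary buffer to query the scratch size,
// allocate, then call again to run the reduction asynchronously on the caller's stream
// (so the host no longer has to synchronize just to size per-partition buffers). A
// simplified standalone equivalent, assuming majors are already offsets into a local CSR
// offset array, could look like this:
// ---------------------------------------------------------------------------------------
#include <cub/device/device_reduce.cuh>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <thrust/iterator/transform_iterator.h>
#include <cuda/functional>
#include <cstddef>

// sums the degrees of the n majors starting at major_first into *d_count (device memory)
template <typename vertex_t, typename edge_t>
void count_edges_async(edge_t const* csr_offsets,
                       vertex_t const* major_first,
                       std::size_t n,
                       std::size_t* d_count,
                       rmm::cuda_stream_view stream)
{
  auto degree_first = thrust::make_transform_iterator(
    major_first,
    cuda::proclaim_return_type<std::size_t>([csr_offsets] __device__(vertex_t major) {
      return static_cast<std::size_t>(csr_offsets[major + 1] - csr_offsets[major]);
    }));

  std::size_t tmp_bytes{0};
  cub::DeviceReduce::Sum(nullptr, tmp_bytes, degree_first, d_count, n, stream.value());
  rmm::device_uvector<std::byte> tmp(tmp_bytes, stream);  // scratch space sized by CUB
  cub::DeviceReduce::Sum(tmp.data(), tmp_bytes, degree_first, d_count, n, stream.value());
}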
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]) + : handle.get_stream(); + raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_mid_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[1], + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { + auto exec_stream = + edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[2], + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + if (edge_partition.dcs_nzd_vertex_count() && + ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0)) { + auto exec_stream = + edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]) + : handle.get_stream(); + raft::grid_1d_thread_t update_grid((*key_segment_offsets)[4] - (*key_segment_offsets)[3], + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first + (*key_segment_offsets)[3], + edge_partition_frontier_key_first + (*key_segment_offsets)[4], + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + } else { + assert(!edge_partition_stream_pool_indices); + auto frontier_size = static_cast( + thrust::distance(edge_partition_frontier_key_first, edge_partition_frontier_key_last)); + if (frontier_size > 0) { + raft::grid_1d_thread_t update_grid(frontier_size, + extract_transform_v_frontier_e_kernel_block_size, + handle.get_device_properties().maxGridSize[0]); + + extract_transform_v_frontier_e_hypersparse_or_low_degree + <<>>( + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + output_key_first, + output_value_first, + count.data(), + e_op); + } + } +} + #define EXTRACT_PERFORMANCE_MEASUREMENT 1 // FIXME: delete template >, std::byte /* dummy */> frontier_bitmap{}; + std:: + conditional_t>, std::byte /* dummy */> + compressed_frontier{}; if constexpr (try_bitmap) { - // FIXME: 4B v_offset... 
auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); if (minor_comm_size > 1) { auto const minor_comm_rank = minor_comm.get_rank(); + + if constexpr (sizeof(vertex_t) == 8) { + vertex_t local_frontier_max_range_size{0}; + for (int i = 0; i < minor_comm_size; ++i) { + auto range_size = local_frontier_range_lasts[i] - local_frontier_range_firsts[i]; + local_frontier_max_range_size = std::max(range_size, local_frontier_max_range_size); + } + if (local_frontier_max_range_size <= + std::numeric_limits::max()) { // broadcast 32 bit offset values instead of 64 + // bit vertex IDs + v_compressible = true; + } + } + double avg_fill_ratio{0.0}; for (int i = 0; i < minor_comm_size; ++i) { auto num_keys = static_cast(local_frontier_sizes[i]); @@ -778,6 +941,17 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, local_frontier_range_firsts[minor_comm_rank], local_frontier_range_lasts[minor_comm_rank], handle.get_stream()); + } else if (v_compressible) { + rmm::device_uvector tmps(local_frontier_sizes[minor_comm_rank], + handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + frontier_key_first, + frontier_key_last, + tmps.begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[minor_comm_rank]] __device__( + auto v) { return static_cast(v - range_first); })); + compressed_frontier = std::move(tmps); } } } @@ -796,7 +970,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto const comm_size = comm.get_size(); auto max_tmp_buffer_size = static_cast( - static_cast(handle.get_device_properties().totalGlobalMem) * 0.05); + static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); auto aggregate_major_range_size = host_scalar_allreduce( comm, @@ -813,7 +987,11 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, size_t key_size{0}; if constexpr (std::is_arithmetic_v) { - key_size = sizeof(key_t); + if (v_compressible) { + key_size = sizeof(uint32_t); + } else { + key_size = sizeof(key_t); + } } else { key_size = cugraph::sum_thrust_tuple_element_sizes(); } @@ -833,12 +1011,12 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, output_value_size = cugraph::sum_thrust_tuple_element_sizes(); } } - auto approx_tmp_buffer_size_per_edge_partition = + auto approx_tmp_buffer_size_per_loop = (aggregate_major_range_size / comm_size) * key_size + (aggregate_max_pushes / comm_size) * (output_key_size + output_value_size); stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, - approx_tmp_buffer_size_per_edge_partition, + approx_tmp_buffer_size_per_loop, graph_view.number_of_local_edge_partitions(), max_segments, handle.get_stream_pool_size()); @@ -847,12 +1025,20 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } size_t num_concurrent_loops{1}; + std::optional> loop_stream_pool_indices{ + std::nullopt}; // first num_concurrent_loopos streams from stream_pool_indices if (stream_pool_indices) { assert(((*stream_pool_indices).size() % max_segments) == 0); - num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + loop_stream_pool_indices = std::vector(num_concurrent_loops); + std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); } - if (stream_pool_indices) { handle.sync_stream(); } + rmm::device_uvector counters(num_concurrent_loops, handle.get_stream()); + + if constexpr 
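// ---------------------------------------------------------------------------------------
// [editor's note] Illustrative sketch only -- not part of this patch. When every rank's
// local frontier vertex range fits in 32 bits, this commit broadcasts 4-byte offsets
// (v - range_first) instead of 8-byte vertex IDs and reconstructs the IDs on the receiving
// side, halving the broadcast volume. A minimal version of the compression step (the
// function name is hypothetical):
// ---------------------------------------------------------------------------------------
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/transform.h>
#include <cuda/functional>
#include <cstdint>

template <typename vertex_t>
rmm::device_uvector<uint32_t> compress_to_local_offsets(
  rmm::device_uvector<vertex_t> const& vertices,
  vertex_t range_first,  // caller guarantees (range_last - range_first) fits in uint32_t
  rmm::cuda_stream_view stream)
{
  rmm::device_uvector<uint32_t> offsets(vertices.size(), stream);
  thrust::transform(rmm::exec_policy(stream),
                    vertices.begin(),
                    vertices.end(),
                    offsets.begin(),
                    cuda::proclaim_return_type<uint32_t>([range_first] __device__(vertex_t v) {
                      return static_cast<uint32_t>(v - range_first);
                    }));
  return offsets;
}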
(!GraphViewType::is_multi_gpu) { + if (loop_stream_pool_indices) { handle.sync_stream(); } + } // 2. fill the buffers @@ -874,63 +1060,151 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto loop_count = std::min(num_concurrent_loops, graph_view.number_of_local_edge_partitions() - i); - // FIXME: ncclGroupStart,ncclGroupEnd - std::conditional_t>, - std::byte /* dummy */> + std::conditional_t< + GraphViewType::is_multi_gpu, + std::conditional_t< + try_bitmap, + std::vector, rmm::device_uvector>>, + std::vector>>, + std::byte /* dummy */> edge_partition_key_buffers{}; if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + edge_partition_key_buffers.reserve(loop_count); - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); + std::conditional_t>>, + std::byte /* dummy */> + edge_partition_bitmap_buffers{std::nullopt}; + if constexpr (try_bitmap) { + if (frontier_bitmap) { + edge_partition_bitmap_buffers = std::vector>{}; + (*edge_partition_bitmap_buffers).reserve(loop_count); + } + } - auto edge_partition_key_buffer = allocate_dataframe_buffer( - minor_comm_size > 1 ? local_frontier_sizes[partition_idx] : size_t{0}, loop_stream); - if (size_dataframe_buffer(edge_partition_key_buffer) > 0) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + bool use_bitmap_buffer = false; + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + (*edge_partition_bitmap_buffers) + .emplace_back(packed_bool_size(local_frontier_range_lasts[partition_idx] - + local_frontier_range_firsts[partition_idx]), + handle.get_stream()); + use_bitmap_buffer = true; + } + } + if (!use_bitmap_buffer) { + bool allocated{false}; if constexpr (try_bitmap) { - std::variant, decltype(frontier_key_first)> v_list{}; - if (frontier_bitmap) { - v_list = (static_cast(partition_idx) == minor_comm_rank) - ? 
raft::device_span((*frontier_bitmap).data(), - (*frontier_bitmap).size()) - : raft::device_span(static_cast(nullptr), - size_t{0}); - } else { - v_list = frontier_key_first; + if (v_compressible) { + edge_partition_key_buffers.push_back(rmm::device_uvector( + local_frontier_sizes[partition_idx], handle.get_stream())); + allocated = true; } - device_bcast_vertex_list(minor_comm, - v_list, - get_dataframe_buffer_begin(edge_partition_key_buffer), - local_frontier_range_firsts[partition_idx], - local_frontier_range_lasts[partition_idx], - local_frontier_sizes[partition_idx], - static_cast(partition_idx), - loop_stream); + } + if (!allocated) { + edge_partition_key_buffers.push_back(allocate_dataframe_buffer( + local_frontier_sizes[partition_idx], handle.get_stream())); + } + } + } + + device_group_start(minor_comm); + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + + if constexpr (try_bitmap) { + if (frontier_bitmap) { + device_bcast(minor_comm, + (*frontier_bitmap).data(), + get_dataframe_buffer_begin((*edge_partition_bitmap_buffers)[j]), + size_dataframe_buffer((*edge_partition_bitmap_buffers)[j]), + static_cast(partition_idx), + handle.get_stream()); + } else if (compressed_frontier) { + device_bcast(minor_comm, + (*compressed_frontier).data(), + get_dataframe_buffer_begin(std::get<0>(edge_partition_key_buffers[j])), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); } else { device_bcast(minor_comm, frontier_key_first, - get_dataframe_buffer_begin(edge_partition_key_buffer), + get_dataframe_buffer_begin(std::get<1>(edge_partition_key_buffers[j])), local_frontier_sizes[partition_idx], static_cast(partition_idx), - loop_stream); + handle.get_stream()); + } + } else { + device_bcast(minor_comm, + frontier_key_first, + get_dataframe_buffer_begin(edge_partition_key_buffers[j]), + local_frontier_sizes[partition_idx], + static_cast(partition_idx), + handle.get_stream()); + } + } + device_group_end(minor_comm); + if (loop_stream_pool_indices) { handle.sync_stream(); } + + if constexpr (try_bitmap) { + if (edge_partition_bitmap_buffers) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + std::variant, rmm::device_uvector> keys = + rmm::device_uvector(0, loop_stream); + if (v_compressible) { + std::get<0>(keys).resize(local_frontier_sizes[partition_idx], loop_stream); + } else { + keys = + rmm::device_uvector(local_frontier_sizes[partition_idx], loop_stream); + } + + auto& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + + auto range_first = local_frontier_range_firsts[partition_idx]; + auto range_last = local_frontier_range_lasts[partition_idx]; + if (keys.index() == 0) { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<0>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + uint32_t{0}, + static_cast(range_last - range_first), + loop_stream); + } else { + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + get_dataframe_buffer_begin(std::get<1>(keys)), + raft::device_span(counters.data() + j, + size_t{1}), // dummy, we already know the counts + range_first, + range_last, + loop_stream); + } + + edge_partition_key_buffers.push_back(std::move(keys)); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + (*edge_partition_bitmap_buffers).clear(); } - edge_partition_key_buffers.push_back(std::move(edge_partition_key_buffer)); } } #if EXTRACT_PERFORMANCE_MEASUREMENT - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } auto subtime1 = std::chrono::steady_clock::now(); #endif @@ -938,96 +1212,178 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, output_key_buffers.reserve(loop_count); std::vector> output_value_buffers{}; output_value_buffers.reserve(loop_count); - std::vector> output_buffer_idx_scalars{}; - output_buffer_idx_scalars.reserve(loop_count); - std::optional>> key_local_degree_offset_vectors{ - std::nullopt}; + std::vector edge_partition_max_push_counts(loop_count); + + std::optional>> + high_segment_key_local_degree_offset_vectors{std::nullopt}; std::optional> high_segment_edge_counts{std::nullopt}; if (key_segment_offset_vectors) { - key_local_degree_offset_vectors = std::vector>{}; - (*key_local_degree_offset_vectors).reserve(loop_count); + high_segment_key_local_degree_offset_vectors = std::vector>{}; + (*high_segment_key_local_degree_offset_vectors).reserve(loop_count); high_segment_edge_counts = std::vector(loop_count); } - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); + edge_partition_max_push_counts[0] = local_max_pushes; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + if (minor_comm_size > 1) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); - auto edge_partition_frontier_key_first = frontier_key_first; - auto edge_partition_frontier_key_last = frontier_key_last; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - edge_partition_frontier_key_first = - get_dataframe_buffer_begin(edge_partition_key_buffers[j]); - edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_key_buffers[j]); + if (static_cast(partition_idx) != minor_comm_rank) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& keys = edge_partition_key_buffers[j]; + + bool computed{false}; + if constexpr (try_bitmap) { + if (keys.index() == 0) { + auto major_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + return range_first + static_cast(v_offset); + })); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + std::get<0>(keys).size(), + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + computed = true; + } + } + if (!computed) { + dataframe_buffer_const_iterator_type_t key_first{}; + size_t num_keys{}; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + num_keys = std::get<1>(keys).size(); + } else { + key_first = get_dataframe_buffer_begin(keys); + num_keys = keys.size(); + } + auto major_first = thrust_tuple_get_or_identity(key_first); + edge_partition.compute_number_of_edges_async( + major_first, + major_first + num_keys, + raft::device_span(counters.data() + j, size_t{1}), + loop_stream); + } + } + } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + raft::update_host( + edge_partition_max_push_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); + if (static_cast(minor_comm_rank / num_concurrent_loops) == + (i / num_concurrent_loops)) { + edge_partition_max_push_counts[minor_comm_rank % num_concurrent_loops] = local_max_pushes; } } + } - auto edge_partition_max_pushes = local_max_pushes; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - if (static_cast(partition_idx) != minor_comm_rank) { - auto edge_partition_frontier_major_first = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_first); - auto edge_partition_frontier_major_last = - thrust_tuple_get_or_identity( - edge_partition_frontier_key_last); - // FIXME: compute_number_of_edges() implicitly synchronizes to copy the results to host (use cub reduce) - // FIXME: check whether skipping a call for 0 key_buffer size helps or not - edge_partition_max_pushes = edge_partition.compute_number_of_edges( - edge_partition_frontier_major_first, edge_partition_frontier_major_last, loop_stream); + if (key_segment_offset_vectors) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + rmm::device_uvector high_segment_key_local_degree_offsets( + key_segment_offsets[1] + 1, loop_stream); + high_segment_key_local_degree_offsets.set_element_to_zero_async(0, loop_stream); + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto key_local_degree_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [edge_partition, + range_first = + local_frontier_range_firsts[partition_idx]] __device__(uint32_t v_offset) { + auto major = range_first + static_cast(v_offset); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + computed = true; } } + if (!computed) { + auto key_first = frontier_key_first; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + key_first = get_dataframe_buffer_begin(std::get<1>(keys)); + } else { + key_first = get_dataframe_buffer_begin(keys); + } + auto key_local_degree_first = thrust::make_transform_iterator( + key_first, cuda::proclaim_return_type([edge_partition] __device__(auto key) { + auto major = thrust_tuple_get_or_identity(key); + auto major_offset = edge_partition.major_offset_from_major_nocheck(major); + return static_cast(edge_partition.local_degree(major_offset)); + })); + thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), + key_local_degree_first, + key_local_degree_first + key_segment_offsets[1], + high_segment_key_local_degree_offsets.begin() + 1); + } + } + raft::update_host((*high_segment_edge_counts).data() + j, + high_segment_key_local_degree_offsets.data() + key_segment_offsets[1], + 1, + loop_stream); + (*high_segment_key_local_degree_offset_vectors) + .push_back(std::move(high_segment_key_local_degree_offsets)); } - if (key_segment_offset_vectors) { - auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - rmm::device_uvector key_local_degree_offsets(key_segment_offsets[1] + 1, - loop_stream); - key_local_degree_offsets.set_element_to_zero_async(0, loop_stream); - auto key_local_degree_first = thrust::make_transform_iterator( - edge_partition_frontier_key_first, - cuda::proclaim_return_type([edge_partition] __device__(auto key) { - auto major = thrust_tuple_get_or_identity(key); - auto major_offset = edge_partition.major_offset_from_major_nocheck(major); - return static_cast(edge_partition.local_degree(major_offset)); - })); - thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), - key_local_degree_first, - key_local_degree_first + key_segment_offsets[1], - key_local_degree_offsets.begin() + 1); - size_t num_edges{0}; - raft::update_host( - &num_edges, key_local_degree_offsets.data() + key_segment_offsets[1], 1, loop_stream); - // FIXME: this prevents multi-CUDA stream execution - RAFT_CUDA_TRY(cudaStreamSynchronize(loop_stream)); - 
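// ---------------------------------------------------------------------------------------
// [editor's note] Illustrative sketch only -- not part of this patch. The inclusive_scan
// above converts per-key local degrees (for the high-degree segment of the frontier) into
// an offsets array with a leading zero, so the edge-parallel high-degree kernel can map
// each work item back to the frontier key that owns it. A simplified standalone version,
// assuming keys are plain major offsets into a local CSR offset array:
// ---------------------------------------------------------------------------------------
#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/scan.h>
#include <cuda/functional>
#include <cstddef>

template <typename vertex_t, typename edge_t>
rmm::device_uvector<std::size_t> key_local_degree_offsets(edge_t const* csr_offsets,
                                                          vertex_t const* key_first,
                                                          std::size_t num_keys,
                                                          rmm::cuda_stream_view stream)
{
  rmm::device_uvector<std::size_t> degree_offsets(num_keys + 1, stream);
  degree_offsets.set_element_to_zero_async(0, stream);  // leading 0
  auto degree_first = thrust::make_transform_iterator(
    key_first,
    cuda::proclaim_return_type<std::size_t>([csr_offsets] __device__(vertex_t key) {
      return static_cast<std::size_t>(csr_offsets[key + 1] - csr_offsets[key]);
    }));
  thrust::inclusive_scan(rmm::exec_policy_nosync(stream),
                         degree_first,
                         degree_first + num_keys,
                         degree_offsets.begin() + 1);
  return degree_offsets;  // last element equals the total edge count over these keys
}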
(*key_local_degree_offset_vectors).push_back(std::move(key_local_degree_offsets)); - (*high_segment_edge_counts)[j] = num_edges; + // to ensure that *high_segment_edge_counts[] is valid + if (loop_stream_pool_indices) { + handle.sync_stream_pool(*loop_stream_pool_indices); + } else { + handle.sync_stream(); } + } - output_key_buffers.push_back( - allocate_optional_dataframe_buffer(edge_partition_max_pushes, loop_stream)); - output_value_buffers.push_back( - allocate_optional_dataframe_buffer(edge_partition_max_pushes, loop_stream)); - output_buffer_idx_scalars.push_back(rmm::device_scalar(size_t{0}, loop_stream)); + for (size_t j = 0; j < loop_count; ++j) { + auto loop_stream = loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + output_key_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); + output_value_buffers.push_back(allocate_optional_dataframe_buffer( + edge_partition_max_push_counts[j], loop_stream)); } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT auto subtime2 = std::chrono::steady_clock::now(); #endif + thrust::fill( + handle.get_thrust_policy(), counters.begin(), counters.begin() + loop_count, size_t{0}); + if (loop_stream_pool_indices) { handle.sync_stream(); } + for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; @@ -1045,23 +1401,6 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, (*stream_pool_indices).data() + j * max_segments, max_segments) : std::nullopt; - auto edge_partition_frontier_key_first = frontier_key_first; - auto edge_partition_frontier_key_last = frontier_key_last; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - if (minor_comm_size > 1) { - edge_partition_frontier_key_first = - get_dataframe_buffer_begin(edge_partition_key_buffers[j]); - edge_partition_frontier_key_last = - get_dataframe_buffer_end(edge_partition_key_buffers[j]); - } - } - - auto& tmp_key_buffer = output_key_buffers[j]; - auto& tmp_value_buffer = output_value_buffers[j]; - auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; - edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; if constexpr (GraphViewType::is_storage_transposed) { @@ -1078,164 +1417,123 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto edge_partition_e_value_input = edge_partition_e_input_device_view_t(edge_value_input, partition_idx); - if (key_segment_offset_vectors) { - auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - - if ((key_segment_offsets[1] > 0) && ((*high_segment_edge_counts)[j] > 0)) { - auto exec_stream = - edge_partition_stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) - : handle.get_stream(); - - raft::grid_1d_thread_t update_grid((*high_segment_edge_counts)[j], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_high_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first, - raft::device_span((*key_local_degree_offset_vectors)[j].data(), - (*key_local_degree_offset_vectors)[j].size()), - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); - } - if (key_segment_offsets[2] - key_segment_offsets[1] > 0) { - auto exec_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]) - : handle.get_stream(); - raft::grid_1d_warp_t update_grid(key_segment_offsets[2] - key_segment_offsets[1], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_mid_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + key_segment_offsets[1], - edge_partition_frontier_key_first + key_segment_offsets[2], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); - } - if (key_segment_offsets[3] - key_segment_offsets[2] > 0) { - auto exec_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]) - : handle.get_stream(); - raft::grid_1d_thread_t update_grid(key_segment_offsets[3] - key_segment_offsets[2], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + key_segment_offsets[2], - edge_partition_frontier_key_first + key_segment_offsets[3], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); - } - if (edge_partition.dcs_nzd_vertex_count() && - (key_segment_offsets[4] - key_segment_offsets[3] > 0)) { - auto exec_stream = - edge_partition_stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]) - : handle.get_stream(); - raft::grid_1d_thread_t update_grid(key_segment_offsets[4] - key_segment_offsets[3], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first + key_segment_offsets[3], - edge_partition_frontier_key_first + key_segment_offsets[4], - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); + bool computed{false}; + if constexpr (try_bitmap) { + auto const& keys = edge_partition_key_buffers[j]; + if (keys.index() == 0) { + auto edge_partition_frontier_key_first = thrust::make_transform_iterator( + std::get<0>(keys).begin(), + cuda::proclaim_return_type( + [range_first = local_frontier_range_firsts[partition_idx]] __device__( + uint32_t v_offset) { return range_first + static_cast(v_offset); })); + auto edge_partition_frontier_key_last = + edge_partition_frontier_key_first + std::get<0>(keys).size(); + extract_transform_v_frontier_e_edge_partition( + handle, + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? 
std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); + computed = true; } - } else { - assert(!edge_partition_stream_pool_indices); - if (local_frontier_sizes[partition_idx] > 0) { - raft::grid_1d_thread_t update_grid(local_frontier_sizes[partition_idx], - extract_transform_v_frontier_e_kernel_block_size, - handle.get_device_properties().maxGridSize[0]); - - extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( - edge_partition, - edge_partition_frontier_key_first, - edge_partition_frontier_key_last, - edge_partition_src_value_input, - edge_partition_dst_value_input, - edge_partition_e_value_input, - edge_partition_e_mask, - get_optional_dataframe_buffer_begin(tmp_key_buffer), - get_optional_dataframe_buffer_begin(tmp_value_buffer), - tmp_buffer_idx.data(), - e_op); + } + if (!computed) { + auto edge_partition_frontier_key_first = frontier_key_first; + auto edge_partition_frontier_key_last = frontier_key_last; + if constexpr (GraphViewType::is_multi_gpu) { + auto const& keys = edge_partition_key_buffers[j]; + if constexpr (try_bitmap) { + assert(keys.index() == 1); + edge_partition_frontier_key_first = std::get<1>(keys).begin(); + edge_partition_frontier_key_last = std::get<1>(keys).end(); + } else { + edge_partition_frontier_key_first = get_dataframe_buffer_begin(keys); + edge_partition_frontier_key_last = get_dataframe_buffer_end(keys); + } } + + extract_transform_v_frontier_e_edge_partition( + handle, + edge_partition, + edge_partition_frontier_key_first, + edge_partition_frontier_key_last, + edge_partition_src_value_input, + edge_partition_dst_value_input, + edge_partition_e_value_input, + edge_partition_e_mask, + get_optional_dataframe_buffer_begin(output_key_buffers[j]), + get_optional_dataframe_buffer_begin(output_value_buffers[j]), + raft::device_span(counters.data() + j, size_t{1}), + e_op, + high_segment_key_local_degree_offset_vectors + ? std::make_optional>( + (*high_segment_key_local_degree_offset_vectors)[j].data(), + (*high_segment_key_local_degree_offset_vectors)[j].size()) + : std::nullopt, + high_segment_edge_counts ? std::make_optional((*high_segment_edge_counts)[j]) + : std::nullopt, + key_segment_offset_vectors ? std::make_optional>( + (*key_segment_offset_vectors)[partition_idx].data(), + (*key_segment_offset_vectors)[partition_idx].size()) + : std::nullopt, + edge_partition_stream_pool_indices); } } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT auto subtime3 = std::chrono::steady_clock::now(); #endif - std::vector tmp_buffer_sizes(loop_count); - for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); - auto& tmp_buffer_idx = output_buffer_idx_scalars[j]; - tmp_buffer_sizes[j] = tmp_buffer_idx.value(loop_stream); - } + std::vector h_counts(loop_count); + raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); + handle.sync_stream(); #if EXTRACT_PERFORMANCE_MEASUREMENT - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } auto subtime4 = std::chrono::steady_clock::now(); #endif for (size_t j = 0; j < loop_count; ++j) { - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) : handle.get_stream(); - auto tmp_buffer_size = tmp_buffer_sizes[j]; + auto tmp_buffer_size = h_counts[j]; if (tmp_buffer_size > 0) { auto& tmp_key_buffer = output_key_buffers[j]; auto& tmp_value_buffer = output_value_buffers[j]; resize_optional_dataframe_buffer( tmp_key_buffer, tmp_buffer_size, loop_stream); - shrink_to_fit_optional_dataframe_buffer(tmp_key_buffer, loop_stream); + // skip shrink_to_fit before return to cut execution time resize_optional_dataframe_buffer( tmp_value_buffer, tmp_buffer_size, loop_stream); - shrink_to_fit_optional_dataframe_buffer(tmp_value_buffer, loop_stream); + // skip shrink_to_fit before return to cut execution time key_buffers.push_back(std::move(tmp_key_buffer)); value_buffers.push_back(std::move(tmp_value_buffer)); } } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT auto subtime5 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; @@ -1262,6 +1560,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } else if (key_buffers.size() == 1) { key_buffer = std::move(key_buffers[0]); value_buffer = std::move(value_buffers[0]); + shrink_to_fit_optional_dataframe_buffer(key_buffer, handle.get_stream()); + shrink_to_fit_optional_dataframe_buffer(value_buffer, handle.get_stream()); } else { std::vector buffer_sizes(key_buffers.size()); static_assert(!std::is_same_v || !std::is_same_v); @@ -1279,24 +1579,28 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::vector buffer_displacements(buffer_sizes.size()); std::exclusive_scan( buffer_sizes.begin(), buffer_sizes.end(), buffer_displacements.begin(), size_t{0}); - // FIXME: this copy can be performed in multiple streams + handle.sync_stream(); for (size_t i = 0; i < key_buffers.size(); ++i) { + auto loop_stream = loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[i])
+                           : handle.get_stream();
       if constexpr (!std::is_same_v) {
         thrust::copy(
-          handle.get_thrust_policy(),
+          rmm::exec_policy_nosync(loop_stream),
           get_optional_dataframe_buffer_cbegin(key_buffers[i]),
           get_optional_dataframe_buffer_cend(key_buffers[i]),
           get_optional_dataframe_buffer_begin(key_buffer) + buffer_displacements[i]);
       }
       if constexpr (!std::is_same_v) {
-        thrust::copy(handle.get_thrust_policy(),
+        thrust::copy(rmm::exec_policy_nosync(loop_stream),
                      get_optional_dataframe_buffer_cbegin(value_buffers[i]),
                      get_optional_dataframe_buffer_cend(value_buffers[i]),
                      get_optional_dataframe_buffer_begin(value_buffer) +
                        buffer_displacements[i]);
       }
     }
+    if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); }
   }
 
 #if EXTRACT_PERFORMANCE_MEASUREMENT

From 7e718b1ad710bb22b641317350d3852f096f5b15 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang 
Date: Sun, 20 Oct 2024 17:24:55 -0700
Subject: [PATCH 107/126] fine-tune direction optimizing alpha based on
 average vertex degree

---
 cpp/src/traversal/bfs_impl.cuh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh
index 1312323cc80..11ef079570e 100644
--- a/cpp/src/traversal/bfs_impl.cuh
+++ b/cpp/src/traversal/bfs_impl.cuh
@@ -222,8 +222,13 @@ void bfs(raft::handle_t const& handle,
   auto prep1 = std::chrono::steady_clock::now();
 #endif
 
-  constexpr double direction_optimizing_alpha = 14.0;
-  constexpr vertex_t direction_optimizing_beta = 24;
+  double direction_optimizing_alpha =
+    (graph_view.number_of_vertices() > 0)
+      ? ((static_cast(graph_view.compute_number_of_edges(handle)) /
+          static_cast(graph_view.number_of_vertices())) *
+         (1.0 / 3.0) /* tuning parameter */)
+      : double{1.0};
+  constexpr vertex_t direction_optimizing_beta = 24;  // tuning parameter
 
   std::optional> approx_out_degrees{std::nullopt};
   std::optional> nzd_unvisited_vertices{std::nullopt};
@@ -567,6 +572,7 @@ void bfs(raft::handle_t const& handle,
                        : m_u;
 #if BFS_PERFORMANCE_MEASUREMENT  // FIXME: delete
       std::cerr << "m_f=" << m_f << " m_u=" << m_u
+                << " direction_optimizing_alpha=" << direction_optimizing_alpha
                 << " aggregate_m_f * direction_optimzing_alpha="
                 << aggregate_m_f * direction_optimizing_alpha
                 << " aggregate_m_u=" << aggregate_m_u

From 58f3ac617817df3838819ba95f28c7df8c752a9d Mon Sep 17 00:00:00 2001
From: Seunghwa Kang 
Date: Sun, 20 Oct 2024 17:25:38 -0700
Subject: [PATCH 108/126] performance optimize local computing part of
 fill_edge_minor_property

---
 cpp/src/prims/fill_edge_src_dst_property.cuh | 73 +++++++++++++-------
 1 file changed, 47 insertions(+), 26 deletions(-)

diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh
index 10c11acb346..444a5f90dcf 100644
--- a/cpp/src/prims/fill_edge_src_dst_property.cuh
+++ b/cpp/src/prims/fill_edge_src_dst_property.cuh
@@ -444,38 +444,59 @@ void fill_edge_minor_property(raft::handle_t const& handle,
              packed_bool_offset(graph_view.local_vertex_partition_range_first() -
                                 minor_range_first)) &&
             (((local_v_list_range_firsts[major_comm_rank] - minor_range_first) %
-              packed_bools_per_word()) != 0)) {
+              packed_bools_per_word()) !=
+             0)) {  // there are unaligned bits (fewer than packed_bools_per_word()) in the vertex
+                    // partition boundary
           leading_boundary_words = packed_bool_word_bcast_alignment;
         }
         thrust::fill(handle.get_thrust_policy(),
                      boundary_words.begin(),
                      boundary_words.begin() + leading_boundary_words,
packed_bool_empty_mask()); - // FIXME: this looks expensive... - thrust::for_each( - handle.get_thrust_policy(), - sorted_unique_vertex_first, - sorted_unique_vertex_last, - [input, - minor_range_first, - leading_boundary_words, - word_offset_first = - packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first), - output_value_first = edge_partition_value_first, - boundary_words = raft::device_span( - boundary_words.data(), boundary_words.size())] __device__(auto v) { - auto v_offset = v - minor_range_first; - auto word_offset = packed_bool_offset(v_offset); - cuda::atomic_ref word( - (word_offset - word_offset_first < leading_boundary_words) - ? boundary_words[word_offset - word_offset_first] - : *(output_value_first + word_offset)); - if (input) { - word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); - } else { - word.fetch_and(~packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); - } - }); + if (local_v_list_range_firsts[major_comm_rank] < + local_v_list_range_lasts[major_comm_rank]) { + auto word_offset_first = + packed_bool_offset(local_v_list_range_firsts[major_comm_rank] - minor_range_first); + auto word_offset_last = + packed_bool_offset((local_v_list_range_lasts[major_comm_rank] - 1) - + minor_range_first) + + 1; + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(word_offset_first), + thrust::make_counting_iterator(word_offset_last), + [sorted_unique_vertex_first, + sorted_unique_vertex_last, + input, + minor_range_first, + leading_boundary_words, + word_offset_first, + vertex_partition_range_last = graph_view.local_vertex_partition_range_last(), + output_value_first = edge_partition_value_first, + boundary_words = raft::device_span( + boundary_words.data(), boundary_words.size())] __device__(auto i) { + auto& word = ((i - word_offset_first) < leading_boundary_words) + ? boundary_words[i - word_offset_first] + : *(output_value_first + i); + auto word_v_first = + minor_range_first + static_cast(i * packed_bools_per_word()); + auto word_v_last = + ((vertex_partition_range_last - word_v_first) <= packed_bools_per_word()) + ? 
vertex_partition_range_last + : (word_v_first + static_cast(packed_bools_per_word())); + auto it = thrust::lower_bound( + thrust::seq, sorted_unique_vertex_first, sorted_unique_vertex_last, word_v_first); + while ((it != sorted_unique_vertex_last) && (*it < word_v_last)) { + auto v_offset = *it - minor_range_first; + if (input) { + word |= packed_bool_mask(v_offset); + } else { + word &= ~packed_bool_mask(v_offset); + } + ++it; + } + }); + } rmm::device_uvector aggregate_boundary_words( major_comm_size * packed_bool_word_bcast_alignment, handle.get_stream()); device_allgather(major_comm, From b2dca9b8160872696db8ccc61bfbb302ef69e6fe Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 20 Oct 2024 20:31:01 -0700 Subject: [PATCH 109/126] parameter tuning in graph creation --- cpp/src/structure/create_graph_from_edgelist_impl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/structure/create_graph_from_edgelist_impl.cuh b/cpp/src/structure/create_graph_from_edgelist_impl.cuh index b7b9aa640f1..e040366fe25 100644 --- a/cpp/src/structure/create_graph_from_edgelist_impl.cuh +++ b/cpp/src/structure/create_graph_from_edgelist_impl.cuh @@ -1178,7 +1178,7 @@ create_graph_from_edgelist_impl( } bool compress{false}; if (static_cast(num_edges) * element_size > - static_cast(total_global_mem * 0.65 /* tuning parameter */)) { + static_cast(total_global_mem * 0.5 /* tuning parameter */)) { compress = true; } From ea0907d50ef7c702f01a171c408c83b92072f3c4 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Sun, 20 Oct 2024 23:48:29 -0700 Subject: [PATCH 110/126] reduce kernel launches in fill_edge_src_minor_property --- cpp/src/prims/fill_edge_src_dst_property.cuh | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 444a5f90dcf..6efee71f5ac 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -705,8 +705,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, std::vector, rmm::device_uvector>> edge_partition_v_buffers{}; edge_partition_v_buffers.reserve(loop_count); - std::vector> edge_partition_dummy_counter_scalars{}; - edge_partition_dummy_counter_scalars.reserve(loop_count); + rmm::device_uvector dummy_counters(loop_count, handle.get_stream()); for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; @@ -724,8 +723,6 @@ void fill_edge_minor_property(raft::handle_t const& handle, std::get<0>(v_buffer).resize(local_v_list_sizes[partition_idx], handle.get_stream()); } edge_partition_v_buffers.push_back(std::move(v_buffer)); - edge_partition_dummy_counter_scalars.push_back( - rmm::device_scalar(size_t{0}, handle.get_stream())); } #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -779,11 +776,10 @@ void fill_edge_minor_property(raft::handle_t const& handle, auto const& rx_bitmap = std::get<1>(edge_partition_v_buffers[j]); rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], loop_stream); - rmm::device_scalar dummy(size_t{0}, loop_stream); retrieve_vertex_list_from_bitmap( raft::device_span(rx_bitmap.data(), rx_bitmap.size()), rx_vertices.begin(), - raft::device_span(dummy.data(), size_t{1}), + raft::device_span(dummy_counters.data() + j, size_t{1}), local_v_list_range_firsts[partition_idx], local_v_list_range_lasts[partition_idx], loop_stream); From 8a78131683daa8df52aa50933e6acf1746e76bb7 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: 
Tue, 22 Oct 2024 10:38:26 -0700 Subject: [PATCH 111/126] fix a build error --- cpp/src/prims/detail/extract_transform_v_frontier_e.cuh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index a56d7df53b1..4f2e4e45486 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -1078,7 +1078,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::conditional_t>>, std::byte /* dummy */> - edge_partition_bitmap_buffers{std::nullopt}; + edge_partition_bitmap_buffers{}; if constexpr (try_bitmap) { if (frontier_bitmap) { edge_partition_bitmap_buffers = std::vector>{}; @@ -1269,7 +1269,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, num_keys = std::get<1>(keys).size(); } else { key_first = get_dataframe_buffer_begin(keys); - num_keys = keys.size(); + num_keys = size_dataframe_buffer(keys); } auto major_first = thrust_tuple_get_or_identity(key_first); edge_partition.compute_number_of_edges_async( From aa139258162def648e34ba3e567981f8f0e598e9 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 25 Oct 2024 01:09:46 -0700 Subject: [PATCH 112/126] minor performance tuning --- .../cugraph/edge_partition_device_view.cuh | 46 ++ .../detail/extract_transform_v_frontier_e.cuh | 74 ++- .../prims/detail/per_v_transform_reduce_e.cuh | 10 +- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 519 +++++++++--------- cpp/src/traversal/bfs_impl.cuh | 129 +++-- 5 files changed, 435 insertions(+), 343 deletions(-) diff --git a/cpp/include/cugraph/edge_partition_device_view.cuh b/cpp/include/cugraph/edge_partition_device_view.cuh index 21ed83a87f9..628c3cc10cc 100644 --- a/cpp/include/cugraph/edge_partition_device_view.cuh +++ b/cpp/include/cugraph/edge_partition_device_view.cuh @@ -219,6 +219,7 @@ class edge_partition_device_view_t count /* size = 1 */, rmm::cuda_stream_view stream) const { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + rmm::device_uvector d_tmp_storage(0, stream); size_t tmp_storage_bytes{0}; @@ -368,6 +373,7 @@ class edge_partition_device_view_t()); } + template + __host__ void compute_number_of_edges_async(MajorIterator major_first, + MajorIterator major_last, + raft::device_span count /* size = 1 */, + rmm::cuda_stream_view stream) const + { + if (thrust::distance(major_first, major_last) == 0) { + RAFT_CUDA_TRY(cudaMemsetAsync(count.data(), 0, sizeof(size_t), stream)); + } + + rmm::device_uvector d_tmp_storage(0, stream); + size_t tmp_storage_bytes{0}; + + auto local_degree_first = thrust::make_transform_iterator( + major_first, + detail::local_degree_op_t{this->offsets_, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */, + std::byte{0} /* dummy */}); + cub::DeviceReduce::Sum(static_cast(nullptr), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + d_tmp_storage.resize(tmp_storage_bytes, stream); + cub::DeviceReduce::Sum(d_tmp_storage.data(), + tmp_storage_bytes, + local_degree_first, + count.data(), + thrust::distance(major_first, major_last), + stream); + } + __host__ rmm::device_uvector compute_local_degrees(rmm::cuda_stream_view stream) const { rmm::device_uvector local_degrees(this->major_range_size(), stream); @@ -682,6 +727,7 @@ class edge_partition_device_view_t((*segment_offsets).data(), 
(*segment_offsets).size()), - graph_view.local_vertex_partition_range_first(), - handle.get_stream()); - (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); - frontier_key_last = frontier_key_first + (*key_segment_offsets).back(); + if (thrust::distance(frontier_key_first, frontier_key_last) > 0) { + key_segment_offsets = compute_key_segment_offsets( + frontier_key_first, + frontier_key_last, + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + (*key_segment_offsets).back() = *((*key_segment_offsets).rbegin() + 1); + frontier_key_last = frontier_key_first + (*key_segment_offsets).back(); + } else { + key_segment_offsets = std::vector((*segment_offsets).size(), 0); + } } } @@ -931,10 +935,16 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; } avg_fill_ratio /= static_cast(minor_comm_size); - constexpr double threshold_ratio = 8.0 /* tuning parameter */ / static_cast(sizeof(vertex_t) * 8); - if (avg_fill_ratio > threshold_ratio) { + auto avg_frontier_size = + std::reduce(local_frontier_sizes.begin(), local_frontier_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_frontier_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to consider additional kernel launch overhead */)) { frontier_bitmap = compute_vertex_list_bitmap_info(frontier_key_first, frontier_key_last, @@ -972,18 +982,21 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto max_tmp_buffer_size = static_cast( static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); - auto aggregate_major_range_size = host_scalar_allreduce( - comm, - static_cast(thrust::distance(frontier_key_first, frontier_key_last)), - raft::comms::op_t::SUM, - handle.get_stream()); - auto aggregate_max_pushes = host_scalar_allreduce( - comm, - local_max_pushes, - raft::comms::op_t::SUM, - handle.get_stream()); // this is approximate as we only consider local edges for - // [frontier_key_first, frontier_key_last), note that neighbor lists - // are partitioned if minor_comm_size > 1 + size_t aggregate_major_range_size{}; + size_t aggregate_max_pushes{}; // this is approximate as we only consider local edges for + // [frontier_key_first, frontier_key_last), note that neighbor + // lists are partitioned if minor_comm_size > 1 + { + auto tmp = host_scalar_allreduce( + comm, + thrust::make_tuple( + static_cast(thrust::distance(frontier_key_first, frontier_key_last)), + local_max_pushes), + raft::comms::op_t::SUM, + handle.get_stream()); + aggregate_major_range_size = thrust::get<0>(tmp); + aggregate_max_pushes = thrust::get<1>(tmp); + } size_t key_size{0}; if constexpr (std::is_arithmetic_v) { @@ -1290,6 +1303,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } } } +#if EXTRACT_PERFORMANCE_MEASUREMENT + if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } + auto subtime2 = std::chrono::steady_clock::now(); +#endif if (key_segment_offset_vectors) { for (size_t j = 0; j < loop_count; ++j) { @@ -1377,7 +1394,7 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT - auto subtime2 = std::chrono::steady_clock::now(); + auto subtime3 = std::chrono::steady_clock::now(); #endif thrust::fill( @@ -1501,14 
+1518,14 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT - auto subtime3 = std::chrono::steady_clock::now(); + auto subtime4 = std::chrono::steady_clock::now(); #endif std::vector h_counts(loop_count); raft::update_host(h_counts.data(), counters.data(), loop_count, handle.get_stream()); handle.sync_stream(); #if EXTRACT_PERFORMANCE_MEASUREMENT - auto subtime4 = std::chrono::steady_clock::now(); + auto subtime5 = std::chrono::steady_clock::now(); #endif for (size_t j = 0; j < loop_count; ++j) { @@ -1535,15 +1552,16 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } #if EXTRACT_PERFORMANCE_MEASUREMENT - auto subtime5 = std::chrono::steady_clock::now(); + auto subtime6 = std::chrono::steady_clock::now(); std::chrono::duration subdur0 = subtime1 - subtime0; std::chrono::duration subdur1 = subtime2 - subtime1; std::chrono::duration subdur2 = subtime3 - subtime2; std::chrono::duration subdur3 = subtime4 - subtime3; std::chrono::duration subdur4 = subtime5 - subtime4; + std::chrono::duration subdur5 = subtime6 - subtime5; std::cerr << "sub (extract) took (" << subdur0.count() << "," << subdur1.count() << "," - << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() - << ") loop_count=" << loop_count << std::endl; + << subdur2.count() << "," << subdur3.count() << "," << subdur4.count() << "," + << subdur5.count() << ") loop_count=" << loop_count << std::endl; #endif } #if EXTRACT_PERFORMANCE_MEASUREMENT diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index b7d4f888657..3ab4ba39e38 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1944,11 +1944,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, (range_size > 0) ? (num_keys / static_cast(range_size)) : double{0.0}; } avg_fill_ratio /= static_cast(minor_comm_size); - double threshold_ratio = 2.0 /* tuning parameter (consider that we need to reprodce vertex list from bitmap)*/ / static_cast((v_compressible ? 
sizeof(uint32_t) : sizeof(vertex_t)) * 8); - if (avg_fill_ratio > threshold_ratio) { + auto avg_key_list_size = + std::reduce(local_key_list_sizes.begin(), local_key_list_sizes.end()) / + static_cast(minor_comm_size); + + if ((avg_fill_ratio > threshold_ratio) && + (static_cast(avg_key_list_size) > + packed_bools_per_word() * + 32 /* tuning parameter, to considerr additional kernel launch overhead */)) { v_list_bitmap = compute_vertex_list_bitmap_info(sorted_unique_key_first, sorted_unique_nzd_key_last, local_v_list_range_firsts[minor_comm_rank], diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index 5c8083837b6..ff70ae21951 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -157,8 +157,8 @@ sort_and_reduce_buffer_elements( dataframe_buffer_type_t&& key_buffer, optional_dataframe_buffer_type_t&& payload_buffer, ReduceOp reduce_op, - std::conditional_t, std::tuple, std::byte /* dummy */> - vertex_range, + std::conditional_t, std::vector, std::byte /* dummy */> + vertex_range_offsets, std::optional invalid_key /* drop (key, (payload)) pairs with invalid key */) { constexpr bool compressed = @@ -173,7 +173,7 @@ sort_and_reduce_buffer_elements( reduce_op::any>)) { // try to use // bitmap for // filtering - key_t range_size = std::get<1>(vertex_range) - std::get<0>(vertex_range); + key_t range_size = vertex_range_offsets.back() - vertex_range_offsets.front(); if (static_cast(size_dataframe_buffer(key_buffer)) >= static_cast(range_size) * 0.125 /* tuning parameter */) { // use bitmap for filtering @@ -190,7 +190,7 @@ sort_and_reduce_buffer_elements( update_keep_flag_t{ raft::device_span(bitmap.data(), bitmap.size()), raft::device_span(keep_flags.data(), keep_flags.size()), - std::get<0>(vertex_range), + vertex_range_offsets.front(), get_dataframe_buffer_begin(key_buffer), to_thrust_optional(invalid_key)}); auto stencil_first = thrust::make_transform_iterator( @@ -245,7 +245,7 @@ sort_and_reduce_buffer_elements( key_buffer.end(), output_key_buffer.begin(), cuda::proclaim_return_type( - [v_first = std::get<0>(vertex_range)] __device__(uint32_t v_offset) { + [v_first = vertex_range_offsets.front()] __device__(uint32_t v_offset) { return static_cast(v_first + v_offset); })); return std::make_tuple(std::move(output_key_buffer), std::move(payload_buffer)); @@ -274,7 +274,7 @@ sort_and_reduce_buffer_elements( auto input_key_first = thrust::make_transform_iterator( get_dataframe_buffer_begin(key_buffer), cuda::proclaim_return_type( - [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + [v_first = vertex_range_offsets.front()] __device__(auto v_offset) { return static_cast(v_first + v_offset); })); resize_dataframe_buffer( @@ -331,7 +331,7 @@ sort_and_reduce_buffer_elements( auto input_key_first = thrust::make_transform_iterator( get_dataframe_buffer_begin(key_buffer), cuda::proclaim_return_type( - [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + [v_first = vertex_range_offsets.front()] __device__(auto v_offset) { return static_cast(v_first + v_offset); })); auto tmp_payload_buffer = allocate_dataframe_buffer( @@ -430,7 +430,7 @@ sort_and_reduce_buffer_elements( auto input_key_first = thrust::make_transform_iterator( get_dataframe_buffer_begin(key_buffer), cuda::proclaim_return_type( - [v_first = std::get<0>(vertex_range)] __device__(auto v_offset) { + [v_first = 
vertex_range_offsets.front()] __device__(auto v_offset) { return static_cast(v_first + v_offset); })); thrust::reduce_by_key(handle.get_thrust_policy(), @@ -531,20 +531,36 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, // 2. reduce the buffer - std:: - conditional_t, std::tuple, std::byte /* dummy */> - vertex_range{}; - if constexpr (std::is_integral_v) { - vertex_range = std::make_tuple(graph_view.local_edge_partition_dst_range_first(), - graph_view.local_edge_partition_dst_range_last()); + std::vector vertex_range_offsets{}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + vertex_range_offsets = std::vector(major_comm_size + 1); + for (int i = 0; i < major_comm_size; ++i) { + auto vertex_partition_id = + detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ + major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + vertex_range_offsets[i] = graph_view.vertex_partition_range_first(vertex_partition_id); + } + vertex_range_offsets.back() = graph_view.local_edge_partition_dst_range_last(); + } else { + vertex_range_offsets = std::vector{graph_view.local_edge_partition_dst_range_first(), + graph_view.local_edge_partition_dst_range_last()}; } + std::conditional_t, std::vector, std::byte /* dummy */> + aux_range_offsets{}; + if constexpr (std::is_integral_v) { aux_range_offsets = vertex_range_offsets; } std::tie(key_buffer, payload_buffer) = detail::sort_and_reduce_buffer_elements( handle, std::move(key_buffer), std::move(payload_buffer), reduce_op, - vertex_range, + aux_range_offsets, std::nullopt); #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -557,285 +573,304 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, bool aligned_path = false; // FIXME: delete double fill_ratio = 0.0; // FIXME: delete if constexpr (GraphViewType::is_multi_gpu) { - // FIXME: this step is unnecessary if major_comm_size== 1 auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); - auto const major_comm_rank = major_comm.get_rank(); auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - - constexpr bool try_compression = (sizeof(vertex_t) == 8) && std::is_same_v; + if (major_comm_size > 1) { + constexpr bool try_compression = (sizeof(vertex_t) == 8) && std::is_same_v; - std::conditional_t max_vertex_partition_size{ - 0}; - std::conditional_t, std::byte /* dummy */> - h_vertex_firsts{}; - if constexpr (try_compression) { h_vertex_firsts = std::vector(major_comm_size); } - std::vector h_vertex_lasts(major_comm_size); - for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - auto vertex_partition_id = - detail::compute_local_edge_partition_minor_range_vertex_partition_id_t{ - major_comm_size, minor_comm_size, major_comm_rank, minor_comm_rank}(i); + std::conditional_t + max_vertex_partition_size{0}; + std::conditional_t, std::byte /* dummy */> + h_vertex_firsts{}; if constexpr 
(try_compression) { - max_vertex_partition_size = std::max( - graph_view.vertex_partition_range_size(vertex_partition_id), max_vertex_partition_size); - h_vertex_firsts[i] = graph_view.vertex_partition_range_first(vertex_partition_id); + h_vertex_firsts = std::vector(vertex_range_offsets.begin(), + vertex_range_offsets.begin() + major_comm_size); + } + std::vector h_vertex_lasts(vertex_range_offsets.begin() + 1, + vertex_range_offsets.end()); + for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { + if constexpr (try_compression) { + max_vertex_partition_size = std::max( + vertex_range_offsets[i + 1] - vertex_range_offsets[i], max_vertex_partition_size); + } } - h_vertex_lasts[i] = graph_view.vertex_partition_range_last(vertex_partition_id); - } - std::conditional_t>, - std::byte /* dummy */> - d_vertex_firsts{}; - rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); - if constexpr (try_compression) { - if (max_vertex_partition_size <= std::numeric_limits::max()) { - d_vertex_firsts = - rmm::device_uvector(h_vertex_firsts.size(), handle.get_stream()); - raft::update_device((*d_vertex_firsts).data(), - h_vertex_firsts.data(), - h_vertex_firsts.size(), - handle.get_stream()); + std::conditional_t>, + std::byte /* dummy */> + d_vertex_firsts{}; + rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); + if constexpr (try_compression) { + if (max_vertex_partition_size <= std::numeric_limits::max()) { + d_vertex_firsts = + rmm::device_uvector(h_vertex_firsts.size(), handle.get_stream()); + raft::update_device((*d_vertex_firsts).data(), + h_vertex_firsts.data(), + h_vertex_firsts.size(), + handle.get_stream()); + } } - } - raft::update_device( - d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); - rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), - handle.get_stream()); - auto reduce_by_first = - thrust_tuple_get_or_identity( - get_dataframe_buffer_begin(key_buffer)); - thrust::lower_bound(handle.get_thrust_policy(), - reduce_by_first, - reduce_by_first + size_dataframe_buffer(key_buffer), - d_vertex_lasts.begin(), - d_vertex_lasts.end(), - d_tx_buffer_last_boundaries.begin()); - std::conditional_t>, - std::byte /* dummy */> - compressed_v_buffer{}; - if constexpr (try_compression) { - if (d_vertex_firsts) { - compressed_v_buffer = - rmm::device_uvector(size_dataframe_buffer(key_buffer), handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - (*compressed_v_buffer).begin(), - cuda::proclaim_return_type( - [firsts = raft::device_span((*d_vertex_firsts).data(), - (*d_vertex_firsts).size()), - lasts = raft::device_span( - d_vertex_lasts.data(), d_vertex_lasts.size())] __device__(auto v) { - auto major_comm_rank = thrust::distance( - lasts.begin(), - thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), v)); - return static_cast(v - firsts[major_comm_rank]); - })); - resize_dataframe_buffer(key_buffer, 0, handle.get_stream()); - shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + raft::update_device( + d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); + rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), + handle.get_stream()); + auto key_v_first = + thrust_tuple_get_or_identity( + get_dataframe_buffer_begin(key_buffer)); + thrust::lower_bound(handle.get_thrust_policy(), + key_v_first, + key_v_first + 
size_dataframe_buffer(key_buffer), + d_vertex_lasts.begin(), + d_vertex_lasts.end(), + d_tx_buffer_last_boundaries.begin()); + std::conditional_t>, + std::byte /* dummy */> + compressed_v_buffer{}; + if constexpr (try_compression) { + if (d_vertex_firsts) { + compressed_v_buffer = + rmm::device_uvector(size_dataframe_buffer(key_buffer), handle.get_stream()); + thrust::transform(handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + (*compressed_v_buffer).begin(), + cuda::proclaim_return_type( + [firsts = raft::device_span( + (*d_vertex_firsts).data(), (*d_vertex_firsts).size()), + lasts = raft::device_span( + d_vertex_lasts.data(), d_vertex_lasts.size())] __device__(auto v) { + auto major_comm_rank = thrust::distance( + lasts.begin(), + thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), v)); + return static_cast(v - firsts[major_comm_rank]); + })); + resize_dataframe_buffer(key_buffer, 0, handle.get_stream()); + shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); + } } - } - std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); - raft::update_host(h_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.data(), - d_tx_buffer_last_boundaries.size(), - handle.get_stream()); - handle.sync_stream(); - std::vector tx_counts(h_tx_buffer_last_boundaries.size()); - std::adjacent_difference( - h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); + std::vector h_tx_buffer_last_boundaries(d_tx_buffer_last_boundaries.size()); + raft::update_host(h_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.data(), + d_tx_buffer_last_boundaries.size(), + handle.get_stream()); + handle.sync_stream(); + std::vector tx_counts(h_tx_buffer_last_boundaries.size()); + std::adjacent_difference( + h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - time3 = std::chrono::steady_clock::now(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + time3 = std::chrono::steady_clock::now(); #endif - size_t min_element_size{cache_line_size}; - if constexpr (std::is_same_v) { - if constexpr (try_compression) { - if (compressed_v_buffer) { - min_element_size = std::min(sizeof(uint32_t), min_element_size); + size_t min_element_size{cache_line_size}; + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + min_element_size = std::min(sizeof(uint32_t), min_element_size); + } else { + min_element_size = std::min(sizeof(key_t), min_element_size); + } } else { min_element_size = std::min(sizeof(key_t), min_element_size); } } else { - min_element_size = std::min(sizeof(key_t), min_element_size); + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); } - } else { - static_assert(is_thrust_tuple_of_arithmetic::value); - min_element_size = - std::min(cugraph::min_thrust_tuple_element_sizes(), min_element_size); - } - if constexpr (!std::is_same_v) { - if constexpr (std::is_arithmetic_v) { - min_element_size = std::min(sizeof(payload_t), min_element_size); - } else { - static_assert(is_thrust_tuple_of_arithmetic::value); - min_element_size = std::min(min_thrust_tuple_element_sizes(), min_element_size); + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + min_element_size = std::min(sizeof(payload_t), 
min_element_size); + } else { + static_assert(is_thrust_tuple_of_arithmetic::value); + min_element_size = + std::min(min_thrust_tuple_element_sizes(), min_element_size); + } } - } - assert((cache_line_size % min_element_size) == 0); - auto alignment = cache_line_size / min_element_size; - std::optional, key_t>> - invalid_key{std::nullopt}; + assert((cache_line_size % min_element_size) == 0); + auto alignment = cache_line_size / min_element_size; + std::optional, key_t>> + invalid_key{std::nullopt}; - size_t local_key_buffer_size{}; - if constexpr (try_compression) { - if (compressed_v_buffer) { - local_key_buffer_size = size_dataframe_buffer(*compressed_v_buffer); + size_t local_key_buffer_size{}; + if constexpr (try_compression) { + if (compressed_v_buffer) { + local_key_buffer_size = size_dataframe_buffer(*compressed_v_buffer); + } else { + local_key_buffer_size = size_dataframe_buffer(key_buffer); + } } else { local_key_buffer_size = size_dataframe_buffer(key_buffer); } - } else { - local_key_buffer_size = size_dataframe_buffer(key_buffer); - } - auto avg_key_buffer_size = - host_scalar_allreduce( - major_comm, local_key_buffer_size, raft::comms::op_t::SUM, handle.get_stream()) / - major_comm_size; - if (avg_key_buffer_size >= alignment * size_t{128} /* 128 tuning parameter */) { - aligned_path = true; // FIXME: delete - if constexpr (std::is_same_v) { - if constexpr (try_compression) { - if (compressed_v_buffer) { - invalid_key = std::numeric_limits::max(); + auto avg_key_buffer_size = + host_scalar_allreduce( + major_comm, local_key_buffer_size, raft::comms::op_t::SUM, handle.get_stream()) / + major_comm_size; + if (avg_key_buffer_size >= alignment * size_t{128} /* 128 tuning parameter */) { + aligned_path = true; // FIXME: delete + if constexpr (std::is_same_v) { + if constexpr (try_compression) { + if (compressed_v_buffer) { + invalid_key = std::numeric_limits::max(); + } else { + invalid_key = invalid_vertex_id_v; + } } else { invalid_key = invalid_vertex_id_v; } } else { - invalid_key = invalid_vertex_id_v; + invalid_key = key_t{}; + thrust::get<0>(*invalid_key) = invalid_vertex_id_v; } - } else { - invalid_key = key_t{}; - thrust::get<0>(*invalid_key) = invalid_vertex_id_v; - } - if constexpr (try_compression) { - if (compressed_v_buffer) { - auto rx_compressed_v_buffer = - allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_compressed_v_buffer, + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + alignment, + std::make_optional(std::get<1>(*invalid_key)), + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore, + std::ignore) = shuffle_values(major_comm, + get_dataframe_buffer_begin(key_buffer), + tx_counts, + alignment, + std::make_optional(std::get<0>(*invalid_key)), + handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, 
std::ignore) = shuffle_values(major_comm, - get_dataframe_buffer_begin(*compressed_v_buffer), + get_dataframe_buffer_begin(key_buffer), tx_counts, alignment, - std::make_optional(std::get<1>(*invalid_key)), + invalid_key, handle.get_stream()); - compressed_v_buffer = std::move(rx_compressed_v_buffer); - } else { - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, + key_buffer = std::move(rx_key_buffer); + } + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore, std::ignore) = shuffle_values(major_comm, - get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_begin(payload_buffer), tx_counts, alignment, - std::make_optional(std::get<0>(*invalid_key)), + std::nullopt, handle.get_stream()); - key_buffer = std::move(rx_key_buffer); + payload_buffer = std::move(rx_payload_buffer); } } else { - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, - std::ignore, - std::ignore, - std::ignore, - std::ignore, - std::ignore, - std::ignore) = shuffle_values(major_comm, - get_dataframe_buffer_begin(key_buffer), - tx_counts, - alignment, - invalid_key, - handle.get_stream()); - key_buffer = std::move(rx_key_buffer); - } - if constexpr (!std::is_same_v) { - auto rx_payload_buffer = - allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_payload_buffer, - std::ignore, - std::ignore, - std::ignore, - std::ignore, - std::ignore, - std::ignore) = shuffle_values(major_comm, - get_dataframe_buffer_begin(payload_buffer), - tx_counts, - alignment, - std::nullopt, - handle.get_stream()); - payload_buffer = std::move(rx_payload_buffer); - } - } else { - if constexpr (try_compression) { - if (compressed_v_buffer) { - auto rx_compressed_v_buffer = - allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_compressed_v_buffer, std::ignore) = - shuffle_values(major_comm, - get_dataframe_buffer_begin(*compressed_v_buffer), - tx_counts, - handle.get_stream()); - compressed_v_buffer = std::move(rx_compressed_v_buffer); + if constexpr (try_compression) { + if (compressed_v_buffer) { + auto rx_compressed_v_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_compressed_v_buffer, std::ignore) = + shuffle_values(major_comm, + get_dataframe_buffer_begin(*compressed_v_buffer), + tx_counts, + handle.get_stream()); + compressed_v_buffer = std::move(rx_compressed_v_buffer); + } else { + auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_key_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); + key_buffer = std::move(rx_key_buffer); + } } else { auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); std::tie(rx_key_buffer, std::ignore) = shuffle_values( major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); key_buffer = std::move(rx_key_buffer); } - } else { - auto rx_key_buffer = allocate_dataframe_buffer(size_t{0}, handle.get_stream()); - std::tie(rx_key_buffer, std::ignore) = shuffle_values( - major_comm, get_dataframe_buffer_begin(key_buffer), tx_counts, handle.get_stream()); - key_buffer = std::move(rx_key_buffer); - } - if constexpr (!std::is_same_v) { - auto rx_payload_buffer = - allocate_dataframe_buffer(size_t{0}, 
handle.get_stream()); - std::tie(rx_payload_buffer, std::ignore) = shuffle_values( - major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); - payload_buffer = std::move(rx_payload_buffer); + if constexpr (!std::is_same_v) { + auto rx_payload_buffer = + allocate_dataframe_buffer(size_t{0}, handle.get_stream()); + std::tie(rx_payload_buffer, std::ignore) = shuffle_values( + major_comm, get_dataframe_buffer_begin(payload_buffer), tx_counts, handle.get_stream()); + payload_buffer = std::move(rx_payload_buffer); + } } - } #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - time4 = std::chrono::steady_clock::now(); + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + time4 = std::chrono::steady_clock::now(); #endif - if constexpr (std::is_integral_v) { - vertex_range = std::make_tuple(graph_view.local_vertex_partition_range_first(), - graph_view.local_vertex_partition_range_last()); - fill_ratio = static_cast(size_dataframe_buffer(key_buffer)) / - static_cast(std::get<1>(vertex_range) - - std::get<0>(vertex_range)); // FIXME: delete - } - if constexpr (try_compression) { - if (compressed_v_buffer) { + if constexpr (std::is_integral_v) { + aux_range_offsets = std::vector{graph_view.local_vertex_partition_range_first(), + graph_view.local_vertex_partition_range_last()}; +#if 1 // FIXME: delete + size_t key_buffer_size{}; + if constexpr (try_compression) { + if (compressed_v_buffer) { + key_buffer_size = (*compressed_v_buffer).size(); + } else { + key_buffer_size = size_dataframe_buffer(key_buffer); + } + } else { + key_buffer_size = size_dataframe_buffer(key_buffer); + } + fill_ratio = static_cast(key_buffer_size) / + static_cast(aux_range_offsets.back() - aux_range_offsets.front()); +#endif + } + if constexpr (try_compression) { + if (compressed_v_buffer) { #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT - size_before_greduce = size_dataframe_buffer(*compressed_v_buffer); // FIXME: delete + size_before_greduce = size_dataframe_buffer(*compressed_v_buffer); // FIXME: delete #endif - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, - std::move(*compressed_v_buffer), - std::move(payload_buffer), - reduce_op, - vertex_range, - invalid_key ? std::make_optional(std::get<1>(*invalid_key)) : std::nullopt); + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(*compressed_v_buffer), + std::move(payload_buffer), + reduce_op, + aux_range_offsets, + invalid_key ? std::make_optional(std::get<1>(*invalid_key)) : std::nullopt); + } else { +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + size_before_greduce = size_dataframe_buffer(key_buffer); // FIXME: delete +#endif + std::tie(key_buffer, payload_buffer) = + detail::sort_and_reduce_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + reduce_op, + aux_range_offsets, + invalid_key ? std::make_optional(std::get<0>(*invalid_key)) : std::nullopt); + } } else { #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT size_before_greduce = size_dataframe_buffer(key_buffer); // FIXME: delete @@ -846,21 +881,9 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, std::move(key_buffer), std::move(payload_buffer), reduce_op, - vertex_range, - invalid_key ? 
std::make_optional(std::get<0>(*invalid_key)) : std::nullopt); + aux_range_offsets, + invalid_key); } - } else { -#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT - size_before_greduce = size_dataframe_buffer(key_buffer); // FIXME: delete -#endif - std::tie(key_buffer, payload_buffer) = - detail::sort_and_reduce_buffer_elements( - handle, - std::move(key_buffer), - std::move(payload_buffer), - reduce_op, - vertex_range, - invalid_key); } } #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 11ef079570e..cdfa3422e2d 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -119,7 +119,7 @@ void bfs(raft::handle_t const& handle, static_assert(!GraphViewType::is_storage_transposed, "GraphViewType should support the push model."); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto prep0 = std::chrono::steady_clock::now(); #endif @@ -217,7 +217,7 @@ void bfs(raft::handle_t const& handle, thrust::fill(handle.get_thrust_policy(), output_first, output_first + n_sources, vertex_t{0}); // 3. update meta data for direction optimizing BFS -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto prep1 = std::chrono::steady_clock::now(); #endif @@ -330,7 +330,7 @@ void bfs(raft::handle_t const& handle, } // 4. initialize BFS frontier -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto prep2 = std::chrono::steady_clock::now(); #endif @@ -351,7 +351,7 @@ void bfs(raft::handle_t const& handle, handle, graph_view); // this may mark some vertices visited in previous iterations as unvisited // (but this is OK as we check prev_dst_visited_flags first) fill_edge_dst_property(handle, graph_view, dst_visited_flags.mutable_view(), false); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto prep3 = std::chrono::steady_clock::now(); #endif @@ -362,7 +362,7 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur).end(), prev_dst_visited_flags.mutable_view(), true); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto prep4 = std::chrono::steady_clock::now(); std::chrono::duration dur0 = prep1 - prep0; @@ -382,7 +382,7 @@ void bfs(raft::handle_t const& handle, while (true) { vertex_t next_aggregate_frontier_size{}; if (topdown) { -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown0 = std::chrono::steady_clock::now(); #endif @@ -405,7 +405,7 @@ void bfs(raft::handle_t const& handle, edge_dummy_property_t{}.view(), e_op, reduce_op::any()); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown1 = std::chrono::steady_clock::now(); #endif @@ -423,19 +423,19 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_next) = key_bucket_t( handle, std::move(new_frontier_vertex_buffer)); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown2 = std::chrono::steady_clock::now(); #endif next_aggregate_frontier_size = 
static_cast(vertex_frontier.bucket(bucket_idx_next).aggregate_size()); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown3 = std::chrono::steady_clock::now(); #endif if (next_aggregate_frontier_size == 0) { -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT std::chrono::duration dur0 = topdown1 - topdown0; std::chrono::duration dur1 = topdown2 - topdown1; std::chrono::duration dur2 = topdown3 - topdown2; @@ -453,13 +453,14 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_next).end(), prev_dst_visited_flags.mutable_view(), true); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto topdown4 = std::chrono::steady_clock::now(); + auto topdown5 = std::chrono::steady_clock::now(); #endif if (direction_optimizing) { - { + if (vertex_frontier.bucket(bucket_idx_next).size() > 0) { rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), handle.get_stream()); tmp_vertices.resize( @@ -473,6 +474,10 @@ void bfs(raft::handle_t const& handle, handle.get_stream()); nzd_unvisited_vertices = std::move(tmp_vertices); } +#if BFS_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + topdown5 = std::chrono::steady_clock::now(); +#endif double m_f{0.0}; double m_u{0.0}; @@ -560,17 +565,17 @@ void bfs(raft::handle_t const& handle, thrust::plus{})); } - auto aggregate_m_f = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), m_f, raft::comms::op_t::SUM, handle.get_stream()) - : m_f; - auto aggregate_m_u = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce( - handle.get_comms(), m_u, raft::comms::op_t::SUM, handle.get_stream()) - : m_u; -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + auto aggregate_m_f = m_f; + auto aggregate_m_u = m_u; + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce(handle.get_comms(), + thrust::make_tuple(m_f, m_u), + raft::comms::op_t::SUM, + handle.get_stream()); + aggregate_m_f = thrust::get<0>(tmp); + aggregate_m_u = thrust::get<1>(tmp); + } +#if BFS_PERFORMANCE_MEASUREMENT std::cerr << "m_f=" << m_f << " m_u=" << m_u << " direction_optimizing_alpha=" << direction_optimizing_alpha << " aggregate_m_f * direction_optimzing_alpha=" @@ -584,9 +589,9 @@ void bfs(raft::handle_t const& handle, topdown = false; } } -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto topdown5 = std::chrono::steady_clock::now(); + auto topdown6 = std::chrono::steady_clock::now(); #endif if (topdown) { // staying in top-down @@ -602,25 +607,26 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_next) = key_bucket_t(handle); } -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto topdown6 = std::chrono::steady_clock::now(); + auto topdown7 = std::chrono::steady_clock::now(); std::chrono::duration dur0 = topdown1 - topdown0; std::chrono::duration dur1 = topdown2 - topdown1; std::chrono::duration dur2 = topdown3 - topdown2; std::chrono::duration dur3 = topdown4 - topdown3; std::chrono::duration dur4 = topdown5 - topdown4; std::chrono::duration dur5 = topdown6 - topdown5; - std::chrono::duration dur = topdown6 - topdown0; + std::chrono::duration dur6 = topdown7 - topdown6; + std::chrono::duration dur = topdown7 - topdown0; std::cerr << 
"depth=" << depth << " topdown next_aggregate_frontier_size=" << next_aggregate_frontier_size - << " next topdown=" << topdown << " (prim,vf,host,fill,dir,vf) took " << dur.count() - << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," - << dur3.count() << "," << dur4.count() << "," << dur5.count() << ") s." - << std::endl; + << " next topdown=" << topdown << " (prim,vf,host,fill,unvisited,dir,vf) took " + << dur.count() << " (" << dur0.count() << "," << dur1.count() << "," << dur2.count() + << "," << dur3.count() << "," << dur4.count() << "," << dur5.count() << "," + << dur6.count() << ") s." << std::endl; #endif - } else { // bottom up -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + } else { // bottom up +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto bottomup0 = std::chrono::steady_clock::now(); #endif @@ -688,24 +694,31 @@ void bfs(raft::handle_t const& handle, handle.get_stream()); nzd_unvisited_vertices = std::move(tmp_vertices); } -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto bottomup1 = std::chrono::steady_clock::now(); #endif - next_aggregate_frontier_size = - GraphViewType::is_multi_gpu - ? host_scalar_allreduce(handle.get_comms(), - static_cast(new_frontier_vertex_buffer.size()), - raft::comms::op_t::SUM, - handle.get_stream()) - : static_cast(new_frontier_vertex_buffer.size()); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete + next_aggregate_frontier_size = static_cast(new_frontier_vertex_buffer.size()); + auto aggregate_nzd_unvisited_vertices = + static_cast((*nzd_unvisited_vertices).size()); + if constexpr (GraphViewType::is_multi_gpu) { + auto tmp = host_scalar_allreduce( + handle.get_comms(), + thrust::make_tuple(static_cast(new_frontier_vertex_buffer.size()), + static_cast((*nzd_unvisited_vertices).size())), + raft::comms::op_t::SUM, + handle.get_stream()); + next_aggregate_frontier_size = thrust::get<0>(tmp); + aggregate_nzd_unvisited_vertices = thrust::get<1>(tmp); + } + +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto bottomup2 = std::chrono::steady_clock::now(); #endif if (next_aggregate_frontier_size == 0) { -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT std::chrono::duration dur0 = bottomup1 - bottomup0; std::chrono::duration dur1 = bottomup2 - bottomup1; std::chrono::duration dur = bottomup2 - bottomup0; @@ -721,29 +734,16 @@ void bfs(raft::handle_t const& handle, new_frontier_vertex_buffer.end(), prev_dst_visited_flags.mutable_view(), true); -#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete +#if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto bottomup3 = std::chrono::steady_clock::now(); #endif - // FIXME: better move this right after host_scalar_allreduce??? - auto aggregate_nzd_unvisited_vertices = - GraphViewType::is_multi_gpu - ? 
host_scalar_allreduce(handle.get_comms(),
-            static_cast((*nzd_unvisited_vertices).size()),
-            raft::comms::op_t::SUM,
-            handle.get_stream())
-        : static_cast((*nzd_unvisited_vertices).size());
-
       if ((next_aggregate_frontier_size * direction_optimizing_beta <
            aggregate_nzd_unvisited_vertices) &&
           (next_aggregate_frontier_size < cur_aggregate_frontier_size)) {
         topdown = true;
       }
-#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete
-      RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto bottomup4 = std::chrono::steady_clock::now();
-#endif
       if (topdown) {  // switching to top-down
         vertex_frontier.bucket(bucket_idx_cur) =
@@ -756,21 +756,20 @@ void bfs(raft::handle_t const& handle,
             raft::device_span((*nzd_unvisited_vertices).data(),
                               (*nzd_unvisited_vertices).size()));
       }
-#if BFS_PERFORMANCE_MEASUREMENT // FIXME: delete
+#if BFS_PERFORMANCE_MEASUREMENT
       RAFT_CUDA_TRY(cudaDeviceSynchronize());
-      auto bottomup5 = std::chrono::steady_clock::now();
+      auto bottomup4 = std::chrono::steady_clock::now();
       std::chrono::duration dur0 = bottomup1 - bottomup0;
       std::chrono::duration dur1 = bottomup2 - bottomup1;
       std::chrono::duration dur2 = bottomup3 - bottomup2;
       std::chrono::duration dur3 = bottomup4 - bottomup3;
-      std::chrono::duration dur4 = bottomup5 - bottomup4;
-      std::chrono::duration dur = bottomup5 - bottomup0;
+      std::chrono::duration dur = bottomup4 - bottomup0;
       std::cerr << "depth=" << depth
                 << " bottomup next_aggregate_frontier_size=" << next_aggregate_frontier_size
                 << " aggregate_nzd_unvisited_vertices=" << aggregate_nzd_unvisited_vertices
-                << " (prim+,host,fill,dir,vf) took " << dur.count() << " (" << dur0.count() << ","
-                << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count()
-                << ") s." << std::endl;
+                << " (prim+,host,fill,vf) took " << dur.count() << " (" << dur0.count() << ","
+                << dur1.count() << "," << dur2.count() << "," << dur3.count() << ") s."
+                << std::endl;
 #endif
     }
     cur_aggregate_frontier_size = next_aggregate_frontier_size;

From 2db13e95c23dec1ba80499ad479af927f92b08e7 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang
Date: Wed, 30 Oct 2024 01:50:31 -0700
Subject: [PATCH 113/126] reduce comm. sync

---
 .../detail/extract_transform_v_frontier_e.cuh | 144 ++++-----
 .../prims/detail/per_v_transform_reduce_e.cuh | 275 +++++++++---------
 cpp/src/prims/fill_edge_src_dst_property.cuh  |  84 +++---
 3 files changed, 265 insertions(+), 238 deletions(-)

diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
index c63557f157d..8ef7da3f022 100644
--- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
+++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh
@@ -791,6 +791,10 @@ extract_transform_v_frontier_e(raft::handle_t const& handle,
   // 3. 
communication over minor_comm std::vector local_frontier_sizes{}; + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; std::conditional_t, std::byte /* dummy */> local_frontier_range_firsts{}; std::conditional_t, std::byte /* dummy */> @@ -801,7 +805,39 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - size_t num_scalars = 1; // local_frontier_size + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{}; + { + size_t key_size{0}; + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + size_t output_key_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_key_size = sizeof(output_key_t); + } else { + output_key_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + size_t output_value_size{0}; + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + output_value_size = sizeof(output_value_t); + } else { + output_value_size = cugraph::sum_thrust_tuple_element_sizes(); + } + } + approx_tmp_buffer_size_per_loop = + static_cast(thrust::distance(frontier_key_first, frontier_key_last)) * key_size + + local_max_pushes * (output_key_size + output_value_size); + } + + size_t num_scalars = + 3; // local_frontier_size, max_tmp_buffer_size, approx_tmp_buffer_size_per_loop if constexpr (try_bitmap) { num_scalars += 2; // local_frontier_range_first, local_frontier_range_last } @@ -810,16 +846,23 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, handle.get_stream()); thrust::tabulate( handle.get_thrust_policy(), - d_aggregate_tmps.begin() + minor_comm_rank * num_scalars, - d_aggregate_tmps.begin() + minor_comm_rank * num_scalars + (try_bitmap ? 3 : 1), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + (num_scalars * minor_comm_rank + (try_bitmap ? 5 : 3)), [frontier_key_first, + max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, v_list_size = static_cast(thrust::distance(frontier_key_first, frontier_key_last)), vertex_partition_range_first = graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return v_list_size; + } else if (i == 1) { + return max_tmp_buffer_size; + } else if (i == 2) { + return approx_tmp_buffer_size_per_loop; + } if constexpr (try_bitmap) { - if (i == 0) { - return v_list_size; - } else if (i == 1) { + if (i == 3) { vertex_t first{}; if (v_list_size > 0) { first = *frontier_key_first; @@ -828,8 +871,8 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, } assert(static_cast(static_cast(first)) == first); return static_cast(first); - } else { - assert(i == 2); + } else if (i == 4) { + assert(i == 4); vertex_t last{}; if (v_list_size > 0) { last = *(frontier_key_first + (v_list_size - 1)) + 1; @@ -839,14 +882,13 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, assert(static_cast(static_cast(last)) == last); return static_cast(last); } - } else { - assert(i == 0); - return v_list_size; } + assert(false); + return size_t{0}; }); if (key_segment_offsets) { raft::update_device( - d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 
3 : 1)), + d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 5 : 3)), (*key_segment_offsets).data(), (*key_segment_offsets).size(), handle.get_stream()); @@ -866,7 +908,9 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, d_aggregate_tmps.size(), handle.get_stream()); handle.sync_stream(); - local_frontier_sizes = std::vector(minor_comm_size); + local_frontier_sizes = std::vector(minor_comm_size); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); if constexpr (try_bitmap) { local_frontier_range_firsts = std::vector(minor_comm_size); local_frontier_range_lasts = std::vector(minor_comm_size); @@ -876,18 +920,20 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, (*key_segment_offset_vectors).reserve(minor_comm_size); } for (int i = 0; i < minor_comm_size; ++i) { - local_frontier_sizes[i] = h_aggregate_tmps[i * num_scalars]; + local_frontier_sizes[i] = h_aggregate_tmps[i * num_scalars]; + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars + 1]; + tmp_buffer_size_per_loop_approximations[i] = h_aggregate_tmps[i * num_scalars + 2]; if constexpr (try_bitmap) { local_frontier_range_firsts[i] = - static_cast(h_aggregate_tmps[i * num_scalars + 1]); + static_cast(h_aggregate_tmps[i * num_scalars + 3]); local_frontier_range_lasts[i] = - static_cast(h_aggregate_tmps[i * num_scalars + 2]); + static_cast(h_aggregate_tmps[i * num_scalars + 4]); } if (key_segment_offsets) { (*key_segment_offset_vectors) - .emplace_back(h_aggregate_tmps.begin() + (i * num_scalars + (try_bitmap ? 3 : 1)), + .emplace_back(h_aggregate_tmps.begin() + (i * num_scalars + (try_bitmap ? 5 : 3)), h_aggregate_tmps.begin() + - (i * num_scalars + (try_bitmap ? 3 : 1) + (*key_segment_offsets).size())); + (i * num_scalars + (try_bitmap ? 
5 : 3) + (*key_segment_offsets).size())); } } } else { @@ -971,63 +1017,17 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto partition_idx = static_cast(minor_comm_rank); + auto const minor_comm_size = minor_comm.get_size(); - if (graph_view.local_edge_partition_segment_offsets(partition_idx) && + if (graph_view.local_vertex_partition_segment_offsets() && (handle.get_stream_pool_size() >= max_segments)) { - auto& comm = handle.get_comms(); - auto const comm_size = comm.get_size(); - - auto max_tmp_buffer_size = static_cast( - static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); - - size_t aggregate_major_range_size{}; - size_t aggregate_max_pushes{}; // this is approximate as we only consider local edges for - // [frontier_key_first, frontier_key_last), note that neighbor - // lists are partitioned if minor_comm_size > 1 - { - auto tmp = host_scalar_allreduce( - comm, - thrust::make_tuple( - static_cast(thrust::distance(frontier_key_first, frontier_key_last)), - local_max_pushes), - raft::comms::op_t::SUM, - handle.get_stream()); - aggregate_major_range_size = thrust::get<0>(tmp); - aggregate_max_pushes = thrust::get<1>(tmp); - } - - size_t key_size{0}; - if constexpr (std::is_arithmetic_v) { - if (v_compressible) { - key_size = sizeof(uint32_t); - } else { - key_size = sizeof(key_t); - } - } else { - key_size = cugraph::sum_thrust_tuple_element_sizes(); - } - size_t output_key_size{0}; - if constexpr (!std::is_same_v) { - if constexpr (std::is_arithmetic_v) { - output_key_size = sizeof(output_key_t); - } else { - output_key_size = cugraph::sum_thrust_tuple_element_sizes(); - } - } - size_t output_value_size{0}; - if constexpr (!std::is_same_v) { - if constexpr (std::is_arithmetic_v) { - output_value_size = sizeof(output_value_t); - } else { - output_value_size = cugraph::sum_thrust_tuple_element_sizes(); - } - } + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); auto approx_tmp_buffer_size_per_loop = - (aggregate_major_range_size / comm_size) * key_size + - (aggregate_max_pushes / comm_size) * (output_key_size + output_value_size); - + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, approx_tmp_buffer_size_per_loop, graph_view.number_of_local_edge_partitions(), diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 3ab4ba39e38..650a307d54c 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1724,61 +1724,117 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - // 5. collect local_key_list_sizes & local_v_list_range_firsts & local_v_list_range_lasts & + // 5. 
collect max_tmp_buffer_size, approx_tmp_buffer_size_per_loop, local_key_list_sizes, + // local_v_list_range_firsts, local_v_list_range_lasts, local_key_list_deg1_sizes, // key_segment_offset_vectors + std::conditional_t, std::byte /* dummy */> + max_tmp_buffer_sizes{}; + std::conditional_t, std::byte /* dummy */> + tmp_buffer_size_per_loop_approximations{}; std::conditional_t, std::byte /* dummy */> local_key_list_sizes{}; std::conditional_t, std::byte /* dummy */> local_v_list_range_firsts{}; std::conditional_t, std::byte /* dummy */> local_v_list_range_lasts{}; - std::conditional_t>, - std::optional>, - std::byte /* dummy */> + std::conditional_t>, std::byte /* dummy */> local_key_list_deg1_sizes{}; // if global degree is 1, any valid local value should be selected std::conditional_t>>, std::byte /* dummy */> key_segment_offset_vectors{}; - if constexpr (use_input_key) { - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + + auto max_tmp_buffer_size = + static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); + size_t approx_tmp_buffer_size_per_loop{0}; + if constexpr (update_major) { + size_t key_size{0}; + if constexpr (use_input_key) { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + size_t value_size{0}; + if constexpr (std::is_arithmetic_v) { + value_size = sizeof(T); + } else { + value_size = sum_thrust_tuple_element_sizes(); + } - size_t num_scalars = 1; // local_key_list_size + size_t major_range_size{}; + if constexpr (use_input_key) { + major_range_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + ; + } else { + major_range_size = graph_view.local_vertex_partition_range_size(); + } + size_t size_per_key{}; + if constexpr (filter_input_key) { + size_per_key = + key_size + + value_size / 2; // to reflect that many keys will be filtered out, note that this is a + // simple approximation, memory requirement in this case is much more + // complex as we store additional temporary variables + + } else { + size_per_key = key_size + value_size; + } + approx_tmp_buffer_size_per_loop = major_range_size * size_per_key; + } + + size_t num_scalars = 2; // max_tmp_buffer_size, approx_tmp_buffer_size_per_loop + size_t num_scalars_less_key_segment_offsets = num_scalars; + if constexpr (use_input_key) { + num_scalars += 1; // local_key_list_size if constexpr (try_bitmap) { num_scalars += 2; // local_key_list_range_first, local_key_list_range_last } if (filter_input_key && graph_view.use_dcs()) { num_scalars += 1; // local_key_list_degree_1_size } + num_scalars_less_key_segment_offsets = num_scalars; if (key_segment_offsets) { num_scalars += (*key_segment_offsets).size(); } + } - rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, - handle.get_stream()); - auto hypersparse_degree_offsets = - graph_view.local_vertex_partition_hypersparse_degree_offsets(); - thrust::tabulate( - handle.get_thrust_policy(), - d_aggregate_tmps.begin() + minor_comm_rank * num_scalars, - d_aggregate_tmps.begin() + 
minor_comm_rank * num_scalars + (try_bitmap ? 3 : 1) + - (filter_input_key && graph_view.use_dcs() ? 1 : 0), - [sorted_unique_key_first, - v_list_size = static_cast( - thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)), - deg1_v_first = (filter_input_key && graph_view.use_dcs()) - ? thrust::make_optional(graph_view.local_vertex_partition_range_first() + - (*local_vertex_partition_segment_offsets)[3] + - *((*hypersparse_degree_offsets).rbegin() + 1)) - : thrust::nullopt, - vertex_partition_range_first = - graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + rmm::device_uvector d_aggregate_tmps(minor_comm_size * num_scalars, + handle.get_stream()); + auto hypersparse_degree_offsets = + graph_view.local_vertex_partition_hypersparse_degree_offsets(); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank, + d_aggregate_tmps.begin() + num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets, + [max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + sorted_unique_key_first, + sorted_unique_nzd_key_last, + deg1_v_first = (filter_input_key && graph_view.use_dcs()) + ? thrust::make_optional(graph_view.local_vertex_partition_range_first() + + (*local_vertex_partition_segment_offsets)[3] + + *((*hypersparse_degree_offsets).rbegin() + 1)) + : thrust::nullopt, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return approx_tmp_buffer_size_per_loop; + } + if constexpr (use_input_key) { + auto v_list_size = static_cast( + thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last)); + if (i == 2) { return v_list_size; } if constexpr (try_bitmap) { - if (i == 0) { - return v_list_size; - } else if (i == 1) { + if (i == 3) { vertex_t first{}; if (v_list_size > 0) { first = *sorted_unique_key_first; @@ -1787,8 +1843,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } assert(static_cast(static_cast(first)) == first); return static_cast(first); - } - if (i == 2) { + } else if (i == 4) { vertex_t last{}; if (v_list_size > 0) { last = *(sorted_unique_key_first + (v_list_size - 1)) + 1; @@ -1797,7 +1852,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } assert(static_cast(static_cast(last)) == last); return static_cast(last); - } else { + } else if (i == 5) { if (deg1_v_first) { auto sorted_unique_v_first = thrust::make_transform_iterator( sorted_unique_key_first, @@ -1810,15 +1865,10 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, sorted_unique_v_first, sorted_unique_v_first + v_list_size, deg1_v_first))); - } else { - assert(false); - return size_t{0}; } } } else { - if (i == 0) { - return v_list_size; - } else { + if (i == 3) { if (deg1_v_first) { auto sorted_unique_v_first = thrust::make_transform_iterator( sorted_unique_key_first, @@ -1831,36 +1881,40 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, sorted_unique_v_first, sorted_unique_v_first + v_list_size, deg1_v_first))); - } else { - assert(false); - return size_t{0}; } } } - }); + } + assert(false); + return size_t{0}; + }); + if constexpr (use_input_key) { if (key_segment_offsets) { - raft::update_device( - d_aggregate_tmps.data() + (minor_comm_rank * num_scalars + (try_bitmap ? 3 : 1) + - (filter_input_key && graph_view.use_dcs() ? 
1 : 0)), - (*key_segment_offsets).data(), - (*key_segment_offsets).size(), - handle.get_stream()); + raft::update_device(d_aggregate_tmps.data() + (num_scalars * minor_comm_rank + + num_scalars_less_key_segment_offsets), + (*key_segment_offsets).data(), + (*key_segment_offsets).size(), + handle.get_stream()); } + } - if (minor_comm_size > 1) { - device_allgather(minor_comm, - d_aggregate_tmps.data() + minor_comm_rank * num_scalars, - d_aggregate_tmps.data(), - num_scalars, - handle.get_stream()); - } + if (minor_comm_size > 1) { + device_allgather(minor_comm, + d_aggregate_tmps.data() + minor_comm_rank * num_scalars, + d_aggregate_tmps.data(), + num_scalars, + handle.get_stream()); + } - std::vector h_aggregate_tmps(d_aggregate_tmps.size()); - raft::update_host(h_aggregate_tmps.data(), - d_aggregate_tmps.data(), - d_aggregate_tmps.size(), - handle.get_stream()); - handle.sync_stream(); + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + raft::update_host(h_aggregate_tmps.data(), + d_aggregate_tmps.data(), + d_aggregate_tmps.size(), + handle.get_stream()); + handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(minor_comm_size); + tmp_buffer_size_per_loop_approximations = std::vector(minor_comm_size); + if constexpr (use_input_key) { local_key_list_sizes = std::vector(minor_comm_size); if constexpr (try_bitmap) { local_v_list_range_firsts = std::vector(minor_comm_size); @@ -1875,30 +1929,35 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, key_segment_offset_vectors = std::vector>{}; (*key_segment_offset_vectors).reserve(minor_comm_size); } - for (int i = 0; i < minor_comm_size; ++i) { - local_key_list_sizes[i] = h_aggregate_tmps[i * num_scalars]; + } + for (int i = 0; i < minor_comm_size; ++i) { + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * num_scalars]; + tmp_buffer_size_per_loop_approximations[i] = h_aggregate_tmps[i * num_scalars + 1]; + if constexpr (use_input_key) { + local_key_list_sizes[i] = h_aggregate_tmps[i * num_scalars + 2]; if constexpr (try_bitmap) { local_v_list_range_firsts[i] = - static_cast(h_aggregate_tmps[i * num_scalars + 1]); + static_cast(h_aggregate_tmps[i * num_scalars + 3]); local_v_list_range_lasts[i] = - static_cast(h_aggregate_tmps[i * num_scalars + 2]); + static_cast(h_aggregate_tmps[i * num_scalars + 4]); } if constexpr (filter_input_key) { if (graph_view.use_dcs()) { (*local_key_list_deg1_sizes)[i] = - static_cast(h_aggregate_tmps[i * num_scalars + (try_bitmap ? 3 : 1)]); + static_cast(h_aggregate_tmps[i * num_scalars + (try_bitmap ? 5 : 3)]); } } if (key_segment_offsets) { (*key_segment_offset_vectors) - .emplace_back(h_aggregate_tmps.begin() + i * num_scalars + (try_bitmap ? 3 : 1) + - ((filter_input_key && graph_view.use_dcs()) ? 1 : 0), - h_aggregate_tmps.begin() + i * num_scalars + (try_bitmap ? 3 : 1) + - ((filter_input_key && graph_view.use_dcs()) ? 
1 : 0) + - (*key_segment_offsets).size()); + .emplace_back( + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets, + h_aggregate_tmps.begin() + i * num_scalars + num_scalars_less_key_segment_offsets + + (*key_segment_offsets).size()); } } - } else { + } + } else { + if constexpr (use_input_key) { local_key_list_sizes = std::vector{ static_cast(thrust::distance(sorted_unique_key_first, sorted_unique_nzd_key_last))}; if (key_segment_offsets) { @@ -2008,63 +2067,17 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { - auto max_tmp_buffer_size = static_cast( - static_cast(handle.get_device_properties().totalGlobalMem) * 0.2); - size_t tmp_buffer_size_per_loop{0}; - if constexpr (update_major) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - - size_t key_size{0}; - if constexpr (use_input_key) { - if constexpr (std::is_arithmetic_v) { - if (v_compressible) { - key_size = sizeof(uint32_t); - } else { - key_size = sizeof(key_t); - } - } else { - key_size = sum_thrust_tuple_element_sizes(); - } - } - size_t value_size{0}; - if constexpr (std::is_arithmetic_v) { - value_size = sizeof(T); - } else { - value_size = sum_thrust_tuple_element_sizes(); - } - - size_t aggregate_major_range_size{}; - if constexpr (use_input_key) { - aggregate_major_range_size = - std::reduce(local_key_list_sizes.begin(), local_key_list_sizes.end()); - } else { - for (size_t i = 0; i < graph_view.number_of_local_edge_partitions(); ++i) { - if constexpr (GraphViewType::is_storage_transposed) { - aggregate_major_range_size += graph_view.local_edge_partition_dst_range_size(i); - } else { - aggregate_major_range_size += graph_view.local_edge_partition_src_range_size(i); - } - } - } - size_t size_per_key{}; - if constexpr (filter_input_key) { - size_per_key = - key_size + - value_size / 2; // to reflect that many keys will be filtered out, note that this is a - // simple approximation, memory requirement in this case is much more - // complex as we store additional temporary variables - - } else { - size_per_key = key_size + value_size; - } - tmp_buffer_size_per_loop = - (aggregate_major_range_size / graph_view.number_of_local_edge_partitions()) * - size_per_key; - } - + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, - tmp_buffer_size_per_loop, + approx_tmp_buffer_size_per_loop, graph_view.number_of_local_edge_partitions(), max_segments, handle.get_stream_pool_size()); diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 6efee71f5ac..bef61080f45 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -335,60 +335,74 @@ void fill_edge_minor_property(raft::handle_t const& handle, sizeof( uint32_t); // 128B cache line 
alignment (unaligned ncclBroadcast operations are slower) + std::vector max_tmp_buffer_sizes{}; std::vector local_v_list_sizes{}; std::vector local_v_list_range_firsts{}; std::vector local_v_list_range_lasts{}; { auto v_list_size = static_cast( thrust::distance(sorted_unique_vertex_first, sorted_unique_vertex_last)); - rmm::device_uvector d_aggregate_tmps(major_comm_size * size_t{3}, - handle.get_stream()); - thrust::tabulate(handle.get_thrust_policy(), - d_aggregate_tmps.begin() + major_comm_rank * size_t{3}, - d_aggregate_tmps.begin() + (major_comm_rank + 1) * size_t{3}, - [sorted_unique_vertex_first, - v_list_size, - vertex_partition_range_first = - graph_view.local_vertex_partition_range_first()] __device__(size_t i) { - if (i == 0) { - return v_list_size; - } else if (i == 1) { - if (v_list_size > 0) { - return *sorted_unique_vertex_first; - } else { - return vertex_partition_range_first; - } - } else { - if (v_list_size > 0) { - return *(sorted_unique_vertex_first + (v_list_size - 1)) + 1; - } else { - return vertex_partition_range_first; - } - } - }); + rmm::device_uvector d_aggregate_tmps(major_comm_size * size_t{4}, + handle.get_stream()); + thrust::tabulate( + handle.get_thrust_policy(), + d_aggregate_tmps.begin() + major_comm_rank * size_t{4}, + d_aggregate_tmps.begin() + (major_comm_rank + 1) * size_t{4}, + [max_tmp_buffer_size = static_cast( + static_cast(handle.get_device_properties().totalGlobalMem) * 0.05), + sorted_unique_vertex_first, + v_list_size, + vertex_partition_range_first = + graph_view.local_vertex_partition_range_first()] __device__(size_t i) { + if (i == 0) { + return max_tmp_buffer_size; + } else if (i == 1) { + return static_cast(v_list_size); + } else if (i == 2) { + vertex_t first{}; + if (v_list_size > 0) { + first = *sorted_unique_vertex_first; + } else { + first = vertex_partition_range_first; + } + assert(static_cast(static_cast(first)) == first); + return static_cast(first); + } else { + vertex_t last{}; + if (v_list_size > 0) { + last = *(sorted_unique_vertex_first + (v_list_size - 1)) + 1; + } else { + last = vertex_partition_range_first; + } + assert(static_cast(static_cast(last)) == last); + return static_cast(last); + } + }); - if (major_comm_size > 1) { // allgather v_list_size, v_list_range_first (inclusive), - // v_list_range_last (exclusive) + if (major_comm_size > 1) { // allgather max_tmp_buffer_size, v_list_size, v_list_range_first + // (inclusive), v_list_range_last (exclusive) device_allgather(major_comm, - d_aggregate_tmps.data() + major_comm_rank * size_t{3}, + d_aggregate_tmps.data() + major_comm_rank * size_t{4}, d_aggregate_tmps.data(), - size_t{3}, + size_t{4}, handle.get_stream()); } - std::vector h_aggregate_tmps(d_aggregate_tmps.size()); + std::vector h_aggregate_tmps(d_aggregate_tmps.size()); raft::update_host(h_aggregate_tmps.data(), d_aggregate_tmps.data(), d_aggregate_tmps.size(), handle.get_stream()); handle.sync_stream(); + max_tmp_buffer_sizes = std::vector(major_comm_size); local_v_list_sizes = std::vector(major_comm_size); local_v_list_range_firsts = std::vector(major_comm_size); local_v_list_range_lasts = std::vector(major_comm_size); for (int i = 0; i < major_comm_size; ++i) { - local_v_list_sizes[i] = h_aggregate_tmps[i * size_t{3}]; - local_v_list_range_firsts[i] = h_aggregate_tmps[i * size_t{3} + 1]; - local_v_list_range_lasts[i] = h_aggregate_tmps[i * size_t{3} + 2]; + max_tmp_buffer_sizes[i] = h_aggregate_tmps[i * size_t{4}]; + local_v_list_sizes[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 1]); + 
local_v_list_range_firsts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 2]); + local_v_list_range_lasts[i] = static_cast(h_aggregate_tmps[i * size_t{4} + 3]); } } @@ -546,8 +560,8 @@ void fill_edge_minor_property(raft::handle_t const& handle, } tmp_buffer_size_per_loop /= major_comm_size; stream_pool_indices = init_stream_pool_indices( - static_cast(static_cast(handle.get_device_properties().totalGlobalMem) * - 0.05), + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(major_comm_size), tmp_buffer_size_per_loop, major_comm_size, 1, From a51b708a49a77fc81d843138c87e0fdaf8d4bb60 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 30 Oct 2024 16:50:14 -0700 Subject: [PATCH 114/126] kernel fusion --- cpp/src/prims/fill_edge_src_dst_property.cuh | 294 +++++++++++++------ 1 file changed, 205 insertions(+), 89 deletions(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index bef61080f45..6b62bdf2045 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -774,76 +774,49 @@ void fill_edge_minor_property(raft::handle_t const& handle, } } device_group_end(major_comm); - if (stream_pool_indices) { handle.sync_stream(); } + bool kernel_fusion = + !edge_partition_keys && !v_list_bitmap && (loop_count > 1) && + (static_cast(std::reduce(local_v_list_sizes.begin() + i, + local_v_list_sizes.begin() + (i + loop_count))) < + size_t{64 * 1024} /* tuning parameter */ * + loop_count); // FIXME: kernle fusion can be useful even when + // edge_partition_keys.has_value() is true + + if (!kernel_fusion) { + if (stream_pool_indices) { handle.sync_stream(); } + } #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub2 = std::chrono::steady_clock::now(); #endif - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); + if (!kernel_fusion) { + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) + : handle.get_stream(); - if (v_list_bitmap) { - auto const& rx_bitmap = std::get<1>(edge_partition_v_buffers[j]); - rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], - loop_stream); - retrieve_vertex_list_from_bitmap( - raft::device_span(rx_bitmap.data(), rx_bitmap.size()), - rx_vertices.begin(), - raft::device_span(dummy_counters.data() + j, size_t{1}), - local_v_list_range_firsts[partition_idx], - local_v_list_range_lasts[partition_idx], - loop_stream); - edge_partition_v_buffers[j] = std::move(rx_vertices); - } + if (v_list_bitmap) { + auto const& rx_bitmap = std::get<1>(edge_partition_v_buffers[j]); + rmm::device_uvector rx_vertices(local_v_list_sizes[partition_idx], + loop_stream); + retrieve_vertex_list_from_bitmap( + raft::device_span(rx_bitmap.data(), rx_bitmap.size()), + rx_vertices.begin(), + raft::device_span(dummy_counters.data() + j, size_t{1}), + local_v_list_range_firsts[partition_idx], + local_v_list_range_lasts[partition_idx], + loop_stream); + edge_partition_v_buffers[j] = std::move(rx_vertices); + } - if (edge_partition_keys) { - thrust::for_each( - rmm::exec_policy_nosync(loop_stream), - thrust::make_counting_iterator(vertex_t{0}), - thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), - [rx_vertex_first = compressed_v_list - ? static_cast(nullptr) - : std::get<0>(edge_partition_v_buffers[j]).data(), - rx_compressed_vertex_first = compressed_v_list - ? std::get<1>(edge_partition_v_buffers[j]).data() - : static_cast(nullptr), - range_first = local_v_list_range_firsts[partition_idx], - input, - subrange_key_first = (*edge_partition_keys).begin() + (*key_offsets)[partition_idx], - subrange_key_last = - (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], - edge_partition_value_first = edge_partition_value_first, - subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { - vertex_t minor{}; - if (rx_vertex_first != nullptr) { - minor = *(rx_vertex_first + i); - } else { - minor = range_first + *(rx_compressed_vertex_first + i); - } - auto it = - thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); - if ((it != subrange_key_last) && (*it == minor)) { - auto subrange_offset = thrust::distance(subrange_key_first, it); - if constexpr (contains_packed_bool_element) { - fill_scalar_or_thrust_tuple( - edge_partition_value_first, subrange_start_offset + subrange_offset, input); - } else { - *(edge_partition_value_first + subrange_start_offset + subrange_offset) = input; - } - } - }); - } else { - if constexpr (contains_packed_bool_element) { + if (edge_partition_keys) { thrust::for_each( rmm::exec_policy_nosync(loop_stream), thrust::make_counting_iterator(vertex_t{0}), thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), - [minor_range_first, - rx_vertex_first = compressed_v_list + [rx_vertex_first = compressed_v_list ? 
static_cast(nullptr) : std::get<0>(edge_partition_v_buffers[j]).data(), rx_compressed_vertex_first = compressed_v_list @@ -851,48 +824,191 @@ void fill_edge_minor_property(raft::handle_t const& handle, : static_cast(nullptr), range_first = local_v_list_range_firsts[partition_idx], input, - output_value_first = edge_partition_value_first] __device__(auto i) { + subrange_key_first = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx], + subrange_key_last = + (*edge_partition_keys).begin() + (*key_offsets)[partition_idx + 1], + edge_partition_value_first = edge_partition_value_first, + subrange_start_offset = (*key_offsets)[partition_idx]] __device__(auto i) { vertex_t minor{}; if (rx_vertex_first != nullptr) { minor = *(rx_vertex_first + i); } else { minor = range_first + *(rx_compressed_vertex_first + i); } - auto minor_offset = minor - minor_range_first; - fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + auto it = + thrust::lower_bound(thrust::seq, subrange_key_first, subrange_key_last, minor); + if ((it != subrange_key_last) && (*it == minor)) { + auto subrange_offset = thrust::distance(subrange_key_first, it); + if constexpr (contains_packed_bool_element) { + fill_scalar_or_thrust_tuple( + edge_partition_value_first, subrange_start_offset + subrange_offset, input); + } else { + *(edge_partition_value_first + subrange_start_offset + subrange_offset) = + input; + } + } }); } else { - if (compressed_v_list) { - auto map_first = thrust::make_transform_iterator( - std::get<1>(edge_partition_v_buffers[j]).begin(), - cuda::proclaim_return_type( - [minor_range_first, - range_first = - local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { - return static_cast(v_offset + (range_first - minor_range_first)); - })); - auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(rmm::exec_policy_nosync(loop_stream), - val_first, - val_first + local_v_list_sizes[partition_idx], - map_first, - edge_partition_value_first); + if constexpr (contains_packed_bool_element) { + thrust::for_each( + rmm::exec_policy_nosync(loop_stream), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(local_v_list_sizes[partition_idx]), + [minor_range_first, + rx_vertex_first = compressed_v_list + ? static_cast(nullptr) + : std::get<0>(edge_partition_v_buffers[j]).data(), + rx_compressed_vertex_first = compressed_v_list + ? 
std::get<1>(edge_partition_v_buffers[j]).data() + : static_cast(nullptr), + range_first = local_v_list_range_firsts[partition_idx], + input, + output_value_first = edge_partition_value_first] __device__(auto i) { + vertex_t minor{}; + if (rx_vertex_first != nullptr) { + minor = *(rx_vertex_first + i); + } else { + minor = range_first + *(rx_compressed_vertex_first + i); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); } else { - auto map_first = thrust::make_transform_iterator( - std::get<0>(edge_partition_v_buffers[j]).begin(), - cuda::proclaim_return_type( - [minor_range_first] __device__(auto v) { return v - minor_range_first; })); - auto val_first = thrust::make_constant_iterator(input); - thrust::scatter(rmm::exec_policy_nosync(loop_stream), - val_first, - val_first + local_v_list_sizes[partition_idx], - map_first, - edge_partition_value_first); + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + std::get<1>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first, + range_first = + local_v_list_range_firsts[partition_idx]] __device__(auto v_offset) { + return static_cast(v_offset + (range_first - minor_range_first)); + })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + std::get<0>(edge_partition_v_buffers[j]).begin(), + cuda::proclaim_return_type( + [minor_range_first] __device__(auto v) { return v - minor_range_first; })); + auto val_first = thrust::make_constant_iterator(input); + thrust::scatter(rmm::exec_policy_nosync(loop_stream), + val_first, + val_first + local_v_list_sizes[partition_idx], + map_first, + edge_partition_value_first); + } } } } + if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } + } else { // kernel fusion + std::vector h_vertex_vars(loop_count /* range_first values */ + + (loop_count + 1) /* loop offsets */); + std::copy(local_v_list_range_firsts.begin() + i, + local_v_list_range_firsts.begin() + (i + loop_count), + h_vertex_vars.begin()); + h_vertex_vars[loop_count] = 0; + std::inclusive_scan(local_v_list_sizes.begin() + i, + local_v_list_sizes.begin() + (i + loop_count), + h_vertex_vars.begin() + (loop_count + 1)); + std::vector h_ptrs(loop_count); + if (compressed_v_list) { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<1>(edge_partition_v_buffers[j]).data()); + } + } else { + for (size_t j = 0; j < loop_count; ++j) { + h_ptrs[j] = static_cast(std::get<0>(edge_partition_v_buffers[j]).data()); + } + } + rmm::device_uvector d_vertex_vars(h_vertex_vars.size(), handle.get_stream()); + rmm::device_uvector d_ptrs(h_ptrs.size(), handle.get_stream()); + raft::update_device( + d_vertex_vars.data(), h_vertex_vars.data(), h_vertex_vars.size(), handle.get_stream()); + raft::update_device(d_ptrs.data(), h_ptrs.data(), h_ptrs.size(), handle.get_stream()); + + raft::device_span range_firsts(d_vertex_vars.data(), loop_count); + raft::device_span loop_offsets(d_vertex_vars.data() + loop_count, + loop_count + 1); + if constexpr (contains_packed_bool_element) { + thrust::for_each( + handle.get_thrust_policy(), + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(h_vertex_vars.back()), + [range_firsts, + loop_offsets, + 
minor_range_first, + input, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + output_value_first = edge_partition_value_first, + compressed = compressed_v_list.has_value()] __device__(auto i) { + auto loop_idx = + thrust::distance(loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto rx_first = rx_firsts[loop_idx]; + vertex_t minor{}; + if (compressed) { + minor = range_firsts[loop_idx] + + *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } else { + minor = *(static_cast(rx_first) + (i - loop_offsets[loop_idx])); + } + auto minor_offset = minor - minor_range_first; + fill_scalar_or_thrust_tuple(output_value_first, minor_offset, input); + }); + } else { + auto val_first = thrust::make_constant_iterator(input); + if (compressed_v_list) { + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [range_firsts, + loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = + range_firsts[loop_idx] + *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } else { + auto map_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type( + [loop_offsets, + rx_firsts = raft::device_span(d_ptrs.data(), d_ptrs.size()), + minor_range_first] __device__(auto i) { + auto loop_idx = thrust::distance( + loop_offsets.begin() + 1, + thrust::upper_bound( + thrust::seq, loop_offsets.begin() + 1, loop_offsets.end(), i)); + auto minor = *(static_cast(rx_firsts[loop_idx]) + + (i - loop_offsets[loop_idx])); + return minor - minor_range_first; + })); + thrust::scatter(handle.get_thrust_policy(), + val_first, + val_first + h_vertex_vars.back(), + map_first, + edge_partition_value_first); + } + } } - if (stream_pool_indices) { handle.sync_stream_pool(*stream_pool_indices); } #if FILL_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto sub3 = std::chrono::steady_clock::now(); @@ -900,7 +1016,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, std::chrono::duration subdur1 = sub2 - sub1; std::chrono::duration subdur2 = sub3 - sub2; std::cerr << "fill_edge_minor path B took (" << subdur0.count() << "," << subdur1.count() - << "," << subdur2.count() << ")" << std::endl; + << "," << subdur2.count() << ") kernel_fusion=" << kernel_fusion << std::endl; #endif } } From 1526dcf8824850f0bfaa5509870e174d7024323d Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 30 Oct 2024 17:28:22 -0700 Subject: [PATCH 115/126] kernel fusion --- cpp/src/prims/fill_edge_src_dst_property.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 6b62bdf2045..134ea61e470 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -778,7 +778,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, !edge_partition_keys && !v_list_bitmap && (loop_count > 1) && (static_cast(std::reduce(local_v_list_sizes.begin() + 
i, local_v_list_sizes.begin() + (i + loop_count))) < - size_t{64 * 1024} /* tuning parameter */ * + size_t{256 * 1024} /* tuning parameter (binary search vs kernel launch overhead) */ * loop_count); // FIXME: kernle fusion can be useful even when // edge_partition_keys.has_value() is true From a23653366b8b7479f5e26a65ecd6fd1b1cfc2915 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 31 Oct 2024 01:04:44 -0700 Subject: [PATCH 116/126] fix regarding concurrent streams --- .../detail/extract_transform_v_frontier_e.cuh | 96 +++++++++++------- .../prims/detail/per_v_transform_reduce_e.cuh | 99 ++++++++++++------- cpp/src/prims/fill_edge_src_dst_property.cuh | 20 +++- cpp/tests/utilities/mg_utilities.hpp | 2 +- 4 files changed, 136 insertions(+), 81 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 8ef7da3f022..787d19cc125 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -504,12 +504,16 @@ void extract_transform_v_frontier_e_edge_partition( std::optional> key_segment_offsets, std::optional> const& edge_partition_stream_pool_indices) { + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } if (key_segment_offsets) { if (((*key_segment_offsets)[1] > 0) && ((*high_segment_edge_count) > 0)) { - auto exec_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) - : handle.get_stream(); + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); raft::grid_1d_thread_t update_grid((*high_segment_edge_count), extract_transform_v_frontier_e_kernel_block_size, @@ -530,10 +534,10 @@ void extract_transform_v_frontier_e_edge_partition( e_op); } if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { - auto exec_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]) - : handle.get_stream(); + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); @@ -552,10 +556,10 @@ void extract_transform_v_frontier_e_edge_partition( e_op); } if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2] > 0) { - auto exec_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]) - : handle.get_stream(); + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); @@ -575,10 +579,10 @@ void extract_transform_v_frontier_e_edge_partition( } if (edge_partition.dcs_nzd_vertex_count() && ((*key_segment_offsets)[4] - (*key_segment_offsets)[3] > 0)) { - auto exec_stream = - edge_partition_stream_pool_indices - ? 
handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]) - : handle.get_stream(); + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); raft::grid_1d_thread_t update_grid((*key_segment_offsets)[4] - (*key_segment_offsets)[3], extract_transform_v_frontier_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); @@ -597,7 +601,11 @@ void extract_transform_v_frontier_e_edge_partition( e_op); } } else { - assert(!edge_partition_stream_pool_indices); + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + auto frontier_size = static_cast( thrust::distance(edge_partition_frontier_key_first, edge_partition_frontier_key_last)); if (frontier_size > 0) { @@ -606,7 +614,7 @@ void extract_transform_v_frontier_e_edge_partition( handle.get_device_properties().maxGridSize[0]); extract_transform_v_frontier_e_hypersparse_or_low_degree - <<>>( + <<>>( edge_partition, edge_partition_frontier_key_first, edge_partition_frontier_key_last, @@ -1018,31 +1026,39 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); auto const minor_comm_size = minor_comm.get_size(); - + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; if (graph_view.local_vertex_partition_segment_offsets() && (handle.get_stream_pool_size() >= max_segments)) { - auto max_tmp_buffer_size = - std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / - static_cast(minor_comm_size); - auto approx_tmp_buffer_size_per_loop = - std::reduce(tmp_buffer_size_per_loop_approximations.begin(), - tmp_buffer_size_per_loop_approximations.end()) / - static_cast(minor_comm_size); - stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, - approx_tmp_buffer_size_per_loop, - graph_view.number_of_local_edge_partitions(), - max_segments, - handle.get_stream_pool_size()); - if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + num_streams_per_loop = std::max( + std::min(size_t{8} / graph_view.number_of_local_edge_partitions(), max_segments), + size_t{ + 1}); // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets + // the number of queues, if the total number of streams exceeds this number, jobs on + // different streams can be sent to one queue leading to false dependency. Setting + // num_concurrent_loops above the number of queues has some benefits in NCCL + // communications but creating too many streams just for compute may not help. 
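+      // For example, with 8 local edge partitions this evaluates to max(min(8 / 8, max_segments),
+      // 1) = 1 stream per loop; with fewer local edge partitions, more streams per loop are used,
+      // up to max_segments.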
} + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + num_streams_per_loop, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } } size_t num_concurrent_loops{1}; std::optional> loop_stream_pool_indices{ std::nullopt}; // first num_concurrent_loopos streams from stream_pool_indices if (stream_pool_indices) { - assert(((*stream_pool_indices).size() % max_segments) == 0); - num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + num_concurrent_loops = + std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size()); loop_stream_pool_indices = std::vector(num_concurrent_loops); std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); } @@ -1413,10 +1429,16 @@ extract_transform_v_frontier_e(raft::handle_t const& handle, detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, partition_idx) : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; + } auto edge_partition_stream_pool_indices = - stream_pool_indices ? std::make_optional>( - (*stream_pool_indices).data() + j * max_segments, max_segments) - : std::nullopt; + stream_pool_indices + ? std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; edge_partition_src_input_device_view_t edge_partition_src_value_input{}; edge_partition_dst_input_device_view_t edge_partition_dst_value_input{}; diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 650a307d54c..a7892ce584f 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1234,14 +1234,18 @@ void per_v_transform_reduce_e_edge_partition( decltype(edge_partition_key_first), decltype(thrust::make_counting_iterator(vertex_t{0}))>; + size_t stream_pool_size{0}; + if (edge_partition_stream_pool_indices) { + stream_pool_size = (*edge_partition_stream_pool_indices).size(); + } if (key_segment_offsets) { static_assert(detail::num_sparse_segments_per_vertex_partition == 3); if (edge_partition.dcs_nzd_vertex_count()) { - auto exec_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[0]) - : handle.get_stream(); + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); if constexpr (update_major && !use_input_key) { // this is necessary as we don't visit // every vertex in the hypersparse segment @@ -1287,10 +1291,10 @@ void per_v_transform_reduce_e_edge_partition( } } if ((*key_segment_offsets)[3] - (*key_segment_offsets)[2]) { - auto exec_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[1]) - : handle.get_stream(); + auto exec_stream = edge_partition_stream_pool_indices + ? 
handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[1 % stream_pool_size]) + : handle.get_stream(); raft::grid_1d_thread_t update_grid((*key_segment_offsets)[3] - (*key_segment_offsets)[2], detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); @@ -1321,10 +1325,10 @@ void per_v_transform_reduce_e_edge_partition( pred_op); } if ((*key_segment_offsets)[2] - (*key_segment_offsets)[1] > 0) { - auto exec_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[2]) - : handle.get_stream(); + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[2 % stream_pool_size]) + : handle.get_stream(); raft::grid_1d_warp_t update_grid((*key_segment_offsets)[2] - (*key_segment_offsets)[1], detail::per_v_transform_reduce_e_kernel_block_size, handle.get_device_properties().maxGridSize[0]); @@ -1356,10 +1360,10 @@ void per_v_transform_reduce_e_edge_partition( pred_op); } if ((*key_segment_offsets)[1] > 0) { - auto exec_stream = - edge_partition_stream_pool_indices - ? handle.get_stream_from_stream_pool((*edge_partition_stream_pool_indices)[3]) - : handle.get_stream(); + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[3 % stream_pool_size]) + : handle.get_stream(); raft::grid_1d_block_t update_grid( (*key_segment_offsets)[1], std::is_same_v> @@ -1391,7 +1395,11 @@ void per_v_transform_reduce_e_edge_partition( pred_op); } } else { - assert(!edge_partition_stream_pools); + auto exec_stream = edge_partition_stream_pool_indices + ? handle.get_stream_from_stream_pool( + (*edge_partition_stream_pool_indices)[0 % stream_pool_size]) + : handle.get_stream(); + size_t num_keys{}; if constexpr (use_input_key) { num_keys = @@ -1413,7 +1421,7 @@ void per_v_transform_reduce_e_edge_partition( segment_key_first = thrust::make_counting_iterator(edge_partition.major_range_first()); } detail::per_v_transform_reduce_e_low_degree - <<>>( + <<>>( edge_partition, *segment_key_first, *segment_key_first + num_keys, @@ -2066,23 +2074,32 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + auto max_tmp_buffer_size = + std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / + static_cast(minor_comm_size); + auto approx_tmp_buffer_size_per_loop = + std::reduce(tmp_buffer_size_per_loop_approximations.begin(), + tmp_buffer_size_per_loop_approximations.end()) / + static_cast(minor_comm_size); + size_t num_streams_per_loop{1}; if (local_vertex_partition_segment_offsets && (handle.get_stream_pool_size() >= max_segments)) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); - auto max_tmp_buffer_size = - std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / - static_cast(minor_comm_size); - auto approx_tmp_buffer_size_per_loop = - std::reduce(tmp_buffer_size_per_loop_approximations.begin(), - tmp_buffer_size_per_loop_approximations.end()) / - static_cast(minor_comm_size); - stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, - approx_tmp_buffer_size_per_loop, - 
graph_view.number_of_local_edge_partitions(), - max_segments, - handle.get_stream_pool_size()); - if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } + num_streams_per_loop = std::max( + std::min(size_t{8} / graph_view.number_of_local_edge_partitions(), max_segments), + size_t{ + 1}); // Note that "CUDA_DEVICE_MAX_CONNECTIONS (default: 8, can be set to [1, 32])" sets + // the number of queues, if the total number of streams exceeds this number, jobs on + // different streams can be sent to one queue leading to false dependency. Setting + // num_concurrent_loops above the number of queues has some benefits in NCCL + // communications but creating too many streams just for compute may not help. } + stream_pool_indices = init_stream_pool_indices(max_tmp_buffer_size, + approx_tmp_buffer_size_per_loop, + graph_view.number_of_local_edge_partitions(), + num_streams_per_loop, + handle.get_stream_pool_size()); + if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } } // 8. set-up temporary buffers @@ -2091,8 +2108,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::optional> loop_stream_pool_indices{ std::nullopt}; // first num_concurrent_loops streams from stream_pool_indices if (stream_pool_indices) { - assert(((*stream_pool_indices).size() % max_segments) == 0); - num_concurrent_loops = (*stream_pool_indices).size() / max_segments; + num_concurrent_loops = + std::min(graph_view.number_of_local_edge_partitions(), (*stream_pool_indices).size()); loop_stream_pool_indices = std::vector(num_concurrent_loops); std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); } @@ -3120,10 +3137,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, detail::edge_partition_edge_property_device_view_t>( *edge_mask_view, partition_idx) : thrust::nullopt; + size_t num_streams_per_loop{1}; + if (stream_pool_indices) { + assert((*stream_pool_indices).size() >= num_concurrent_loops); + num_streams_per_loop = (*stream_pool_indices).size() / num_concurrent_loops; + } auto edge_partition_stream_pool_indices = - stream_pool_indices ? std::make_optional>( - (*stream_pool_indices).data() + j * max_segments, max_segments) - : std::nullopt; + stream_pool_indices + ? 
std::make_optional>( + (*stream_pool_indices).data() + j * num_streams_per_loop, num_streams_per_loop) + : std::nullopt; T major_init{}; T major_identity_element{};
diff --git a/cpp/src/prims/fill_edge_src_dst_property.cuh b/cpp/src/prims/fill_edge_src_dst_property.cuh index 134ea61e470..9a3dbe6d18d 100644 --- a/cpp/src/prims/fill_edge_src_dst_property.cuh +++ b/cpp/src/prims/fill_edge_src_dst_property.cuh @@ -542,6 +542,7 @@ void fill_edge_minor_property(raft::handle_t const& handle, } std::optional> stream_pool_indices{std::nullopt}; + size_t num_concurrent_bcasts{1}; { size_t tmp_buffer_size_per_loop{}; for (int i = 0; i < major_comm_size; ++i) { @@ -559,16 +560,22 @@ void fill_edge_minor_property(raft::handle_t const& handle, } } tmp_buffer_size_per_loop /= major_comm_size; + size_t max_streams = + static_cast(major_comm_size); // to allow setting num_concurrent_bcasts above + // handle.get_stream_pool_size() stream_pool_indices = init_stream_pool_indices( std::reduce(max_tmp_buffer_sizes.begin(), max_tmp_buffer_sizes.end()) / static_cast(major_comm_size), tmp_buffer_size_per_loop, major_comm_size, 1, - handle.get_stream_pool_size()); + max_streams); + num_concurrent_bcasts = (*stream_pool_indices).size(); + if ((*stream_pool_indices).size() > handle.get_stream_pool_size()) { + (*stream_pool_indices).resize(handle.get_stream_pool_size()); + } if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } } - size_t num_concurrent_bcasts = stream_pool_indices ? (*stream_pool_indices).size() : size_t{1}; #if FILL_PERFORMANCE_MEASUREMENT std::cerr << "v_list_size=" << local_v_list_sizes[major_comm_rank] << " v_list_range=(" @@ -791,11 +798,14 @@ void fill_edge_minor_property(raft::handle_t const& handle, #endif if (!kernel_fusion) { + size_t stream_pool_size{0}; + if (stream_pool_indices) { stream_pool_size = (*stream_pool_indices).size(); } for (size_t j = 0; j < loop_count; ++j) { auto partition_idx = i + j; - auto loop_stream = stream_pool_indices - ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j]) - : handle.get_stream(); + auto loop_stream = + stream_pool_indices + ? handle.get_stream_from_stream_pool((*stream_pool_indices)[j % stream_pool_size]) + : handle.get_stream(); if (v_list_bitmap) { auto const& rx_bitmap = std::get<1>(edge_partition_v_buffers[j]);
diff --git a/cpp/tests/utilities/mg_utilities.hpp b/cpp/tests/utilities/mg_utilities.hpp index a9a1d12417e..a21ee2bc525 100644 --- a/cpp/tests/utilities/mg_utilities.hpp +++ b/cpp/tests/utilities/mg_utilities.hpp @@ -29,7 +29,7 @@ void finalize_mpi(); int query_mpi_comm_world_rank(); int query_mpi_comm_world_size(); -std::unique_ptr initialize_mg_handle(size_t pool_size = 128); +std::unique_ptr initialize_mg_handle(size_t pool_size = 8 /* default value of CUDA_DEVICE_MAX_CONNECTIONS */); // NCCL lazily initializes for P2P, and this enforces P2P initialization for better performance // measurements
From 2ab3232c5ac3df051ffecc5e243c9512a0a955b0 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 31 Oct 2024 01:07:09 -0700 Subject: [PATCH 117/126] tune direction optimizing alpha --- cpp/src/traversal/bfs_impl.cuh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index cdfa3422e2d..5bdb6dbce32 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -226,7 +226,7 @@ void bfs(raft::handle_t const& handle, (graph_view.number_of_vertices() > 0) ?
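// A rough reading of this tuning (illustrative average degree; the switch condition appears
// further below in this file): with |E| / |V| = 16, alpha becomes 16 * (1.0 / 4.25) ~= 3.8
// versus ~5.3 with the previous 1.0 / 3.0 factor, so the top-down to bottom-up switch
// (aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) now triggers later, i.e. only
// once the frontier's edge estimate m_f is larger relative to the unvisited estimate m_u.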
((static_cast(graph_view.compute_number_of_edges(handle)) / static_cast(graph_view.number_of_vertices())) * - (1.0 / 3.0) /* tuning parametger */) + (1.0 / 4.25) /* tuning parametger */) : double{1.0}; constexpr vertex_t direction_optimizing_beta = 24; // tuning parameter From 65e36e16c4591b7c1af3344d08e8b5aee4f1c922 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Thu, 31 Oct 2024 11:28:31 -0700 Subject: [PATCH 118/126] update comments --- cpp/src/prims/detail/per_v_transform_reduce_e.cuh | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index a7892ce584f..90952ff8e3c 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -2070,7 +2070,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, (max_key_offset_size <= static_cast(std::numeric_limits::max())); } - // 7. set-up stream pool + // 7. set-up stream pool & events std::optional> stream_pool_indices{std::nullopt}; if constexpr (GraphViewType::is_multi_gpu) { @@ -2102,8 +2102,6 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, if ((*stream_pool_indices).size() <= 1) { stream_pool_indices = std::nullopt; } } - // 8. set-up temporary buffers - size_t num_concurrent_loops{1}; std::optional> loop_stream_pool_indices{ std::nullopt}; // first num_concurrent_loops streams from stream_pool_indices @@ -2114,6 +2112,8 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::iota((*loop_stream_pool_indices).begin(), (*loop_stream_pool_indices).end(), size_t{0}); } + // 8. set-up temporary buffers + using minor_tmp_buffer_type = std::conditional_t, edge_dst_property_t>; @@ -4450,6 +4450,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } } + #if PER_V_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); auto time5 = std::chrono::steady_clock::now(); From d6904358507069400ee626ae96addcc3fbd95206 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 1 Nov 2024 01:26:11 -0700 Subject: [PATCH 119/126] perf optimize direction optimizing aux info routines --- cpp/src/traversal/bfs_impl.cuh | 318 ++++++++++++++++++++------------- 1 file changed, 198 insertions(+), 120 deletions(-) diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index 5bdb6dbce32..e2092a1db46 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -52,6 +52,24 @@ namespace cugraph { namespace { +template +struct direction_optimizing_info_t { + rmm::device_uvector + approx_out_degrees; // if graph_view.local_vertex_partition_segment_offsets().has_value() is + // true, holds approximate degrees only for the high and mid degree + // segments; otherwise, exact + rmm::device_uvector visited_bitmap; + std::optional> nzd_unvisited_vertices{ + std::nullopt}; // valid only during bottom-up iterations + std::optional num_nzd_unvisited_low_degree_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() is true + std::optional num_nzd_unvisited_hypersparse_vertices{ + std::nullopt}; // to decide between topdown vs bottomup, relevant only when + // graph_view.local_vertex_partition_segment_offsets().has_value() && + // graph_view.use_dcs() are both true +}; + template struct topdown_e_op_t { detail::edge_partition_endpoint_property_device_view_t @@ -222,6 +240,8 @@ void bfs(raft::handle_t const& 
handle, auto prep1 = std::chrono::steady_clock::now(); #endif + auto segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + double direction_optimizing_alpha = (graph_view.number_of_vertices() > 0) ? ((static_cast(graph_view.compute_number_of_edges(handle)) / @@ -230,23 +250,22 @@ void bfs(raft::handle_t const& handle, : double{1.0}; constexpr vertex_t direction_optimizing_beta = 24; // tuning parameter - std::optional> approx_out_degrees{std::nullopt}; - std::optional> nzd_unvisited_vertices{std::nullopt}; + std::optional> aux_info{std::nullopt}; if (direction_optimizing) { - size_t partition_idx{0}; - size_t partition_size{1}; - if constexpr (GraphViewType::is_multi_gpu) { - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); - auto const minor_comm_size = minor_comm.get_size(); - partition_idx = static_cast(minor_comm_rank); - partition_size = static_cast(minor_comm_size); - } - - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + rmm::device_uvector approx_out_degrees(0, handle.get_stream()); if (segment_offsets) { // exploit internal knowedge for exhaustive performance optimization for // large-scale benchmarking (the else path is sufficient for small // clusters with few tens of GPUs) + size_t partition_idx{0}; + size_t partition_size{1}; + if constexpr (GraphViewType::is_multi_gpu) { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_rank = minor_comm.get_rank(); + auto const minor_comm_size = minor_comm.get_size(); + partition_idx = static_cast(minor_comm_rank); + partition_size = static_cast(minor_comm_size); + } + auto edge_partition = edge_partition_device_view_t( graph_view.local_edge_partition_view(partition_idx)); @@ -277,56 +296,63 @@ void bfs(raft::handle_t const& handle, handle.get_stream()); } thrust::transform(handle.get_thrust_policy(), - (*approx_out_degrees).begin(), - (*approx_out_degrees).end(), - (*approx_out_degrees).begin(), + approx_out_degrees.begin(), + approx_out_degrees.end(), + approx_out_degrees.begin(), multiplier_t{static_cast( partition_size)}); // local_degrees => approximate global degrees } else { approx_out_degrees = graph_view.compute_out_degrees(handle); // exact } + + rmm::device_uvector visited_bitmap( + packed_bool_size(graph_view.local_vertex_partition_range_size()), handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + visited_bitmap.begin(), + visited_bitmap.end(), + packed_bool_empty_mask()); + thrust::for_each( + handle.get_thrust_policy(), + sources, + sources + n_sources, + [bitmap = raft::device_span(visited_bitmap.data(), visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + + std::optional num_nzd_unvisited_low_degree_vertices{std::nullopt}; + std::optional num_nzd_unvisited_hypersparse_vertices{std::nullopt}; if (segment_offsets) { - auto num_visited_nzd_vertices = static_cast( - thrust::count_if(handle.get_thrust_policy(), - sources, - sources + n_sources, - [nzd_v_last = graph_view.local_vertex_partition_range_first() + - *((*segment_offsets).rbegin() + 1)] __device__(auto v) { - return (v < nzd_v_last) ? 
true : false; - })); - nzd_unvisited_vertices = rmm::device_uvector( - *((*segment_offsets).rbegin() + 1) - num_visited_nzd_vertices, handle.get_stream()); - thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()) + - *((*segment_offsets).rbegin() + 1), - (*nzd_unvisited_vertices).begin(), - [vertex_partition, - sources = raft::device_span(sources, n_sources)] __device__(vertex_t v) { - return !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v); - }); - } else { - nzd_unvisited_vertices = rmm::device_uvector( - graph_view.local_vertex_partition_range_size(), handle.get_stream()); - auto valid_last = thrust::copy_if( - handle.get_thrust_policy(), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), - thrust::make_counting_iterator(graph_view.local_vertex_partition_range_last()), - (*nzd_unvisited_vertices).begin(), - [vertex_partition, - sources = raft::device_span(sources, n_sources), - out_degrees /* exact */ = raft::device_span( - (*approx_out_degrees).data(), (*approx_out_degrees).size())] __device__(vertex_t v) { - auto v_offset = vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return (out_degrees[v_offset] > edge_t{0}) && - !thrust::binary_search(thrust::seq, sources.begin(), sources.end(), v); - }); - (*nzd_unvisited_vertices) - .resize(thrust::distance((*nzd_unvisited_vertices).begin(), valid_last), - handle.get_stream()); - (*nzd_unvisited_vertices).shrink_to_fit(handle.get_stream()); + num_nzd_unvisited_low_degree_vertices = (*segment_offsets)[3] - (*segment_offsets)[2]; + if (graph_view.use_dcs()) { + num_nzd_unvisited_hypersparse_vertices = (*segment_offsets)[4] - (*segment_offsets)[3]; + } + if (n_sources > 0) { + std::vector h_sources(n_sources); + raft::update_host(h_sources.data(), sources, n_sources, handle.get_stream()); + handle.sync_stream(); + for (size_t i = 0; i < h_sources.size(); ++i) { + auto v_offset = h_sources[i] - graph_view.local_vertex_partition_range_first(); + if ((v_offset >= (*segment_offsets)[2]) && (v_offset < (*segment_offsets)[3])) { + --(*num_nzd_unvisited_low_degree_vertices); + } else if (graph_view.use_dcs()) { + if ((v_offset >= (*segment_offsets)[3]) && (v_offset < (*segment_offsets)[4])) { + --(*num_nzd_unvisited_hypersparse_vertices); + } + } + } + } } + + aux_info = + direction_optimizing_info_t{std::move(approx_out_degrees), + std::move(visited_bitmap), + std::nullopt, + num_nzd_unvisited_low_degree_vertices, + num_nzd_unvisited_hypersparse_vertices}; } // 4. 
initialize BFS frontier @@ -461,18 +487,18 @@ void bfs(raft::handle_t const& handle, if (direction_optimizing) { if (vertex_frontier.bucket(bucket_idx_next).size() > 0) { - rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - vertex_frontier.bucket(bucket_idx_next).begin(), - vertex_frontier.bucket(bucket_idx_next).end(), - tmp_vertices.begin())), - handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + thrust::for_each( + handle.get_thrust_policy(), + vertex_frontier.bucket(bucket_idx_next).begin(), + vertex_frontier.bucket(bucket_idx_next).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); } #if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -482,21 +508,16 @@ void bfs(raft::handle_t const& handle, double m_f{0.0}; double m_u{0.0}; { - size_t partition_idx{0}; size_t partition_size{1}; if constexpr (GraphViewType::is_multi_gpu) { auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_rank = minor_comm.get_rank(); auto const minor_comm_size = minor_comm.get_size(); - partition_idx = static_cast(minor_comm_rank); partition_size = static_cast(minor_comm_size); } - auto f_vertex_first = vertex_frontier.bucket(bucket_idx_next).begin(); - auto f_vertex_last = vertex_frontier.bucket(bucket_idx_next).end(); - auto u_vertex_first = (*nzd_unvisited_vertices).begin(); - auto u_vertex_last = (*nzd_unvisited_vertices).end(); - auto segment_offsets = graph_view.local_edge_partition_segment_offsets(partition_idx); + auto f_vertex_first = vertex_frontier.bucket(bucket_idx_next).begin(); + auto f_vertex_last = vertex_frontier.bucket(bucket_idx_next).end(); + if (segment_offsets) { // FIXME: this actually over-estimates for graphs with power-law degree distribution auto approx_low_segment_degree = @@ -509,6 +530,12 @@ void bfs(raft::handle_t const& handle, raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), graph_view.local_vertex_partition_range_first(), handle.get_stream()); + *((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + (f_segment_offsets[3] - f_segment_offsets[2]); + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + (f_segment_offsets[4] - f_segment_offsets[3]); + } f_vertex_last = f_vertex_first + f_segment_offsets[2]; m_f = static_cast((f_segment_offsets[3] - f_segment_offsets[2])) * approx_low_segment_degree; @@ -517,17 +544,10 @@ void bfs(raft::handle_t const& handle, approx_hypersparse_segment_degree; } - auto u_segment_offsets = compute_key_segment_offsets( - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), - graph_view.local_vertex_partition_range_first(), - handle.get_stream()); - u_vertex_last = u_vertex_first + u_segment_offsets[2]; - m_u = static_cast((u_segment_offsets[3] - u_segment_offsets[2])) * + m_u = static_cast(*((*aux_info).num_nzd_unvisited_low_degree_vertices)) * 
approx_low_segment_degree; if (graph_view.use_dcs()) { - m_u += static_cast(u_segment_offsets[4] - u_segment_offsets[3]) * + m_u += static_cast(*((*aux_info).num_nzd_unvisited_hypersparse_vertices)) * approx_hypersparse_segment_degree; } } @@ -537,12 +557,10 @@ void bfs(raft::handle_t const& handle, f_vertex_first, f_vertex_last, cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*approx_out_degrees).data(), - (*approx_out_degrees).size())] __device__(vertex_t v) { - auto v_offset = - vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(vertex_t v) { + auto v_offset = v - v_first; return out_degrees[v_offset]; }), edge_t{0}, @@ -550,16 +568,22 @@ void bfs(raft::handle_t const& handle, m_u += static_cast(thrust::transform_reduce( handle.get_thrust_policy(), - u_vertex_first, - u_vertex_last, + thrust::make_counting_iterator(vertex_t{0}), + thrust::make_counting_iterator(segment_offsets + ? (*segment_offsets)[2] + : graph_view.local_vertex_partition_range_size()), cuda::proclaim_return_type( - [vertex_partition, - out_degrees = raft::device_span( - (*approx_out_degrees).data(), - (*approx_out_degrees).size())] __device__(vertex_t v) { - auto v_offset = - vertex_partition.local_vertex_partition_offset_from_vertex_nocheck(v); - return out_degrees[v_offset]; + [out_degrees = raft::device_span((*aux_info).approx_out_degrees.data(), + (*aux_info).approx_out_degrees.size()), + bitmap = raft::device_span( + (*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size())] __device__(vertex_t v_offset) { + auto word = bitmap[packed_bool_offset(v_offset)]; + if ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()) { // visited + return edge_t{0}; + } else { + return out_degrees[v_offset]; + } }), edge_t{0}, thrust::plus{})); @@ -586,7 +610,31 @@ void bfs(raft::handle_t const& handle, #endif if ((aggregate_m_f * direction_optimizing_alpha > aggregate_m_u) && (next_aggregate_frontier_size >= cur_aggregate_frontier_size)) { - topdown = false; + topdown = false; + (*aux_info).nzd_unvisited_vertices = rmm::device_uvector( + segment_offsets ? *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_size(), + handle.get_stream()); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + (*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::copy_if( + handle.get_thrust_policy(), + thrust::make_counting_iterator(graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator( + segment_offsets ? 
graph_view.local_vertex_partition_range_first() + + *((*segment_offsets).rbegin() + 1) + : graph_view.local_vertex_partition_range_last()), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) == packed_bool_empty_mask()); + })), + handle.get_stream()); } } #if BFS_PERFORMANCE_MEASUREMENT @@ -602,8 +650,8 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + (*((*aux_info).nzd_unvisited_vertices)).size())); vertex_frontier.bucket(bucket_idx_next) = key_bucket_t(handle); } @@ -681,18 +729,49 @@ void bfs(raft::handle_t const& handle, assert(direction_optimizing); - rmm::device_uvector tmp_vertices((*nzd_unvisited_vertices).size(), - handle.get_stream()); - tmp_vertices.resize( - thrust::distance(tmp_vertices.begin(), - thrust::set_difference(handle.get_thrust_policy(), - (*nzd_unvisited_vertices).begin(), - (*nzd_unvisited_vertices).end(), - new_frontier_vertex_buffer.begin(), - new_frontier_vertex_buffer.end(), - tmp_vertices.begin())), - handle.get_stream()); - nzd_unvisited_vertices = std::move(tmp_vertices); + thrust::for_each( + handle.get_thrust_policy(), + new_frontier_vertex_buffer.begin(), + new_frontier_vertex_buffer.end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + cuda::atomic_ref word( + bitmap[packed_bool_offset(v_offset)]); + word.fetch_or(packed_bool_mask(v_offset), cuda::std::memory_order_relaxed); + }); + (*((*aux_info).nzd_unvisited_vertices)) + .resize( + thrust::distance( + (*((*aux_info).nzd_unvisited_vertices)).begin(), + thrust::remove_if( + handle.get_thrust_policy(), + (*((*aux_info).nzd_unvisited_vertices)).begin(), + (*((*aux_info).nzd_unvisited_vertices)).end(), + [bitmap = raft::device_span((*aux_info).visited_bitmap.data(), + (*aux_info).visited_bitmap.size()), + v_first = graph_view.local_vertex_partition_range_first()] __device__(auto v) { + auto v_offset = v - v_first; + auto word = bitmap[packed_bool_offset(v_offset)]; + return ((word & packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + handle.get_stream()); + + if (segment_offsets) { + auto key_segment_offsets = compute_key_segment_offsets( + new_frontier_vertex_buffer.begin(), + new_frontier_vertex_buffer.end(), + raft::host_span((*segment_offsets).data(), (*segment_offsets).size()), + graph_view.local_vertex_partition_range_first(), + handle.get_stream()); + *((*aux_info).num_nzd_unvisited_low_degree_vertices) -= + key_segment_offsets[3] - key_segment_offsets[2]; + if (graph_view.use_dcs()) { + *((*aux_info).num_nzd_unvisited_hypersparse_vertices) -= + key_segment_offsets[4] - key_segment_offsets[3]; + } + } } #if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); @@ -701,12 +780,11 @@ void bfs(raft::handle_t const& handle, next_aggregate_frontier_size = static_cast(new_frontier_vertex_buffer.size()); auto aggregate_nzd_unvisited_vertices = - 
static_cast((*nzd_unvisited_vertices).size()); + static_cast((*((*aux_info).nzd_unvisited_vertices)).size()); if constexpr (GraphViewType::is_multi_gpu) { auto tmp = host_scalar_allreduce( handle.get_comms(), - thrust::make_tuple(static_cast(new_frontier_vertex_buffer.size()), - static_cast((*nzd_unvisited_vertices).size())), + thrust::make_tuple(next_aggregate_frontier_size, aggregate_nzd_unvisited_vertices), raft::comms::op_t::SUM, handle.get_stream()); next_aggregate_frontier_size = thrust::get<0>(tmp); @@ -753,8 +831,8 @@ void bfs(raft::handle_t const& handle, vertex_frontier.bucket(bucket_idx_cur) = key_bucket_t( handle, - raft::device_span((*nzd_unvisited_vertices).data(), - (*nzd_unvisited_vertices).size())); + raft::device_span((*((*aux_info).nzd_unvisited_vertices)).data(), + ((*(*aux_info).nzd_unvisited_vertices)).size())); } #if BFS_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); From 8ecc412243d1edc7ab895fff19be9fc1fd43ac85 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 1 Nov 2024 01:30:17 -0700 Subject: [PATCH 120/126] modify multi-stream kernel launching orders --- .../prims/detail/per_v_transform_reduce_e.cuh | 179 ++++++++++++++---- 1 file changed, 137 insertions(+), 42 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 90952ff8e3c..61a75bc32c6 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -2454,42 +2454,34 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } - for (size_t j = 0; j < loop_count; ++j) { - auto partition_idx = i + j; - auto loop_stream = - loop_stream_pool_indices - ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) - : handle.get_stream(); - - auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; - - auto& keys = edge_partition_key_buffers[j]; - std::variant, rmm::device_uvector> offsets = - rmm::device_uvector(0, loop_stream); - if (uint32_key_output_offset) { - std::get<0>(offsets).resize(process_local_edges[j] - ? (key_segment_offsets[4] - key_segment_offsets[3]) - : vertex_t{0}, - loop_stream); - } else { - offsets = rmm::device_uvector( - process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) - : vertex_t{0}, - loop_stream); - } - - if (process_local_edges[j]) { - auto edge_partition = - edge_partition_device_view_t( - graph_view.local_edge_partition_view(partition_idx)); - auto const& segment_offsets = - graph_view.local_edge_partition_segment_offsets(partition_idx); - - auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + if constexpr (try_bitmap) { // if we are using a bitmap buffer + if (v_list_bitmap) { + std::vector> input_count_offset_vectors{}; + input_count_offset_vectors.reserve(loop_count); + + std::vector> filtered_bitmap_vectors{}; + std::vector> output_count_offset_vectors{}; + filtered_bitmap_vectors.reserve(loop_count); + output_count_offset_vectors.reserve(loop_count); + + std::vector range_offset_firsts(loop_count, 0); + std::vector range_offset_lasts(loop_count, 0); + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector input_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); - if constexpr (try_bitmap) { - if (v_list_bitmap) { - auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; auto range_offset_first = std::min((edge_partition.major_range_first() + (*segment_offsets)[3] > local_v_list_range_firsts[partition_idx]) @@ -2507,6 +2499,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, local_v_list_range_lasts[partition_idx] - local_v_list_range_firsts[partition_idx]); if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; auto input_count_first = thrust::make_transform_iterator( thrust::make_counting_iterator(packed_bool_offset(range_offset_first)), cuda::proclaim_return_type( @@ -2521,7 +2514,7 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } return static_cast(__popc(word)); })); - rmm::device_uvector input_count_offsets( + input_count_offsets.resize( (rx_bitmap.size() - packed_bool_offset(range_offset_first)) + 1, loop_stream); input_count_offsets.set_element_to_zero_async(0, loop_stream); thrust::inclusive_scan( @@ -2530,7 +2523,34 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, input_count_first + (rx_bitmap.size() - packed_bool_offset(range_offset_first)), input_count_offsets.begin() + 1); - rmm::device_uvector filtered_bitmap( + } + range_offset_firsts[j] = range_offset_first; + range_offset_lasts[j] = range_offset_last; + } + input_count_offset_vectors.push_back(std::move(input_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + rmm::device_uvector filtered_bitmap(0, loop_stream); + rmm::device_uvector output_count_offsets(0, loop_stream); + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + filtered_bitmap.resize( rx_bitmap.size() - packed_bool_offset(range_offset_first), loop_stream); thrust::tabulate( rmm::exec_policy_nosync(loop_stream), @@ -2593,13 +2613,51 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, cuda::proclaim_return_type([] __device__(uint32_t word) { return static_cast(__popc(word)); })); - rmm::device_uvector output_count_offsets(filtered_bitmap.size() + 1, - loop_stream); + output_count_offsets.resize(filtered_bitmap.size() + 1, loop_stream); output_count_offsets.set_element_to_zero_async(0, loop_stream); thrust::inclusive_scan(rmm::exec_policy_nosync(loop_stream), output_count_first, output_count_first + filtered_bitmap.size(), output_count_offsets.begin() + 1); + } + } + filtered_bitmap_vectors.push_back(std::move(filtered_bitmap)); + output_count_offset_vectors.push_back(std::move(output_count_offsets)); + } + + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto range_offset_first = range_offset_firsts[j]; + auto range_offset_last = range_offset_lasts[j]; + if (range_offset_first < range_offset_last) { + auto const& rx_bitmap = (*edge_partition_bitmap_buffers)[j]; + auto const& input_count_offsets = input_count_offset_vectors[j]; + auto const& filtered_bitmap = filtered_bitmap_vectors[j]; + auto const& output_count_offsets = output_count_offset_vectors[j]; + if (keys.index() == 0) { if (offsets.index() == 0) { thrust::for_each( @@ -2806,8 +2864,45 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, size_t{0}); } } + + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); } - if (edge_partition_new_key_buffers) { + } + } + if (edge_partition_new_key_buffers) { // if there is no bitmap buffer + for (size_t j = 0; j < loop_count; ++j) { + auto partition_idx = i + j; + auto loop_stream = + loop_stream_pool_indices + ? 
handle.get_stream_from_stream_pool((*loop_stream_pool_indices)[j]) + : handle.get_stream(); + + auto const& key_segment_offsets = (*key_segment_offset_vectors)[partition_idx]; + + auto& keys = edge_partition_key_buffers[j]; + std::variant, rmm::device_uvector> offsets = + rmm::device_uvector(0, loop_stream); + if (uint32_key_output_offset) { + std::get<0>(offsets).resize(process_local_edges[j] + ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } else { + offsets = rmm::device_uvector( + process_local_edges[j] ? (key_segment_offsets[4] - key_segment_offsets[3]) + : vertex_t{0}, + loop_stream); + } + + if (process_local_edges[j]) { + auto edge_partition = + edge_partition_device_view_t( + graph_view.local_edge_partition_view(partition_idx)); + auto const& segment_offsets = + graph_view.local_edge_partition_segment_offsets(partition_idx); + + auto segment_bitmap = *(edge_partition.dcs_nzd_range_bitmap()); + auto& new_keys = (*edge_partition_new_key_buffers)[j]; if constexpr (try_bitmap) { assert(!v_list_bitmap); @@ -2945,9 +3040,9 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, } } } - } - (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + (*edge_partition_hypersparse_key_offset_vectors).push_back(std::move(offsets)); + } } if (loop_stream_pool_indices) { handle.sync_stream_pool(*loop_stream_pool_indices); } if (edge_partition_new_key_buffers) { From 34c243c9bfc0416712f467fd41144ea16380ba84 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Fri, 1 Nov 2024 01:30:57 -0700 Subject: [PATCH 121/126] perf. opt. compute_vertex_list_bitmap_info --- cpp/src/prims/vertex_frontier.cuh | 73 +++++++++++++++++++------------ 1 file changed, 46 insertions(+), 27 deletions(-) diff --git a/cpp/src/prims/vertex_frontier.cuh b/cpp/src/prims/vertex_frontier.cuh index 6da9b843095..6e7d8515beb 100644 --- a/cpp/src/prims/vertex_frontier.cuh +++ b/cpp/src/prims/vertex_frontier.cuh @@ -137,19 +137,39 @@ rmm::device_uvector compute_vertex_list_bitmap_info( auto bitmap = rmm::device_uvector( packed_bool_size(vertex_range_last - vertex_range_first), stream_view); - thrust::fill( - rmm::exec_policy_nosync(stream_view), bitmap.begin(), bitmap.end(), packed_bool_empty_mask()); - thrust::for_each(rmm::exec_policy_nosync(stream_view), - sorted_unique_vertex_first, - sorted_unique_vertex_last, - [bitmap = raft::device_span(bitmap.data(), bitmap.size()), - v_first = vertex_range_first] __device__(vertex_t v) { - auto v_offset = v - v_first; - cuda::atomic_ref word( - bitmap[packed_bool_offset(v_offset)]); - word.fetch_or(cugraph::packed_bool_mask(v_offset), - cuda::std::memory_order_relaxed); - }); + rmm::device_uvector lasts(bitmap.size(), stream_view); + auto bdry_first = thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{1}), + cuda::proclaim_return_type( + [vertex_range_first, + vertex_range_size = vertex_range_last - vertex_range_first] __device__(vertex_t i) { + return vertex_range_first + + static_cast( + std::min(packed_bools_per_word() * i, static_cast(vertex_range_size))); + })); + thrust::lower_bound(rmm::exec_policy_nosync(stream_view), + sorted_unique_vertex_first, + sorted_unique_vertex_last, + bdry_first, + bdry_first + bitmap.size(), + lasts.begin()); + thrust::tabulate( + rmm::exec_policy_nosync(stream_view), + bitmap.begin(), + bitmap.end(), + cuda::proclaim_return_type( + [sorted_unique_vertex_first, + vertex_range_first, + lasts = raft::device_span(lasts.data(), lasts.size())] __device__(size_t i) { + 
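+        // Each bitmap word i covers packed_bools_per_word() (32) consecutive vertices starting at
+        // vertex_range_first + 32 * i; lasts[i], computed by the lower_bound above, is the number of
+        // input vertices below that word's upper boundary, so [offset_first, offset_last) below are
+        // exactly the sorted input vertices falling into word i and each word is assembled locally,
+        // without the per-vertex global atomics of the previous implementation.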
auto offset_first = (i != 0) ? lasts[i - 1] : vertex_t{0}; + auto offset_last = lasts[i]; + auto ret = packed_bool_empty_mask(); + for (auto j = offset_first; j < offset_last; ++j) { + auto v_offset = *(sorted_unique_vertex_first + j) - vertex_range_first; + ret |= packed_bool_mask(v_offset); + } + return ret; + })); return bitmap; } @@ -207,20 +227,19 @@ void retrieve_vertex_list_from_bitmap( { using vertex_t = typename thrust::iterator_traits::value_type; - assert((comm.get_rank() != root) || (bitmap.size() >= packed_bool_size(vertex_range_last - vertex_ragne_first))); - detail::copy_if_nosync( - thrust::make_counting_iterator(vertex_range_first), - thrust::make_counting_iterator(vertex_range_last), - thrust::make_transform_iterator( - thrust::make_counting_iterator(vertex_t{0}), - cuda::proclaim_return_type( - [bitmap] __device__(vertex_t v_offset) { - return ((bitmap[packed_bool_offset(v_offset)] & packed_bool_mask(v_offset)) != - packed_bool_empty_mask()); - })), - output_v_first, - count, - stream_view); + assert((comm.get_rank() != root) || + (bitmap.size() >= packed_bool_size(vertex_range_last - vertex_ragne_first))); + detail::copy_if_nosync(thrust::make_counting_iterator(vertex_range_first), + thrust::make_counting_iterator(vertex_range_last), + thrust::make_transform_iterator( + thrust::make_counting_iterator(vertex_t{0}), + cuda::proclaim_return_type([bitmap] __device__(vertex_t v_offset) { + return ((bitmap[packed_bool_offset(v_offset)] & + packed_bool_mask(v_offset)) != packed_bool_empty_mask()); + })), + output_v_first, + count, + stream_view); } // key type is either vertex_t (tag_t == void) or thrust::tuple (tag_t != void) From afa53fcf02622413c2f0f02d50f5b4fdfe2c5456 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Mon, 4 Nov 2024 02:13:05 -0800 Subject: [PATCH 122/126] minor refactor --- .../detail/extract_transform_v_frontier_e.cuh | 2 +- .../prims/detail/per_v_transform_reduce_e.cuh | 79 +------------------ cpp/src/prims/fill_edge_src_dst_property.cuh | 8 +- 3 files changed, 10 insertions(+), 79 deletions(-) diff --git a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh index 787d19cc125..6cc410c0c8a 100644 --- a/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh +++ b/cpp/src/prims/detail/extract_transform_v_frontier_e.cuh @@ -630,7 +630,7 @@ void extract_transform_v_frontier_e_edge_partition( } } -#define EXTRACT_PERFORMANCE_MEASUREMENT 1 // FIXME: delete +#define EXTRACT_PERFORMANCE_MEASUREMENT 0 // FIXME: delete template -__host__ __device__ priority_t -rank_to_priority(int rank, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - int comm_size, - vertex_t offset /* to evenly distribute traffic */) -{ - static_assert(std::is_same_v || std::is_same_v); - using cast_t = std::conditional_t, - int16_t, - int64_t>; // to prevent overflow (assuming that comm_size <= - // std::numeric_limits::max()) no need - // for communication (priority 0) - if (rank == root) { - return priority_t{0}; - } else if (rank / subgroup_size == - root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in - // [1, subgroup_size) - auto rank_dist = - static_cast(((static_cast(rank) + subgroup_size) - root) % subgroup_size); - int modulo = subgroup_size - 1; - return static_cast(1 + (static_cast(rank_dist - 1) + (offset % modulo)) % - modulo); - } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) - auto subgroup_dist = - 
static_cast(((static_cast(rank / subgroup_size) + (comm_size / subgroup_size)) - - (root / subgroup_size)) % - (comm_size / subgroup_size)); - auto intra_subgroup_rank_dist = static_cast( - ((static_cast(rank % subgroup_size) + subgroup_size) - (root % subgroup_size)) % - subgroup_size); - auto rank_dist = subgroup_dist * subgroup_size + intra_subgroup_rank_dist; - int modulo = comm_size - subgroup_size; - return static_cast( - subgroup_size + - (static_cast(rank_dist - subgroup_size) + (offset % modulo)) % modulo); - } -} - -template -__host__ __device__ int priority_to_rank( - priority_t priority, - int root, - int subgroup_size /* faster interconnect within a subgroup */, - int comm_size, - vertex_t offset /* to evenly distribute traffict */) -{ - static_assert(std::is_same_v || std::is_same_v); - using cast_t = std::conditional_t, - int16_t, - int64_t>; // to prevent overflow (assuming that comm_size <= - // std::numeric_limits::max()) - if (priority == priority_t{0}) { - return root; - } else if (priority < static_cast(subgroup_size)) { - int modulo = subgroup_size - 1; - auto rank_dist = static_cast( - 1 + ((static_cast(priority - 1) + modulo) - (offset % modulo)) % modulo); - return static_cast((root - (root % subgroup_size)) + - ((static_cast(root) + rank_dist) % subgroup_size)); - } else { - int modulo = comm_size - subgroup_size; - auto rank_dist = static_cast( - subgroup_size + - ((static_cast(priority) - subgroup_size) + (modulo - (offset % modulo))) % modulo); - auto subgroup_dist = rank_dist / subgroup_size; - auto intra_subgroup_rank_dist = rank_dist % subgroup_size; - return static_cast( - ((static_cast((root / subgroup_size) * subgroup_size) + - subgroup_dist * subgroup_size) + - (static_cast(root) + intra_subgroup_rank_dist) % subgroup_size) % - comm_size); - } -} - template void compute_priorities( raft::comms::comms_t const& comm, @@ -1438,7 +1363,7 @@ void per_v_transform_reduce_e_edge_partition( } } -#define PER_V_PERFORMANCE_MEASUREMENT 1 // FIXME: delete performance logging code +#define PER_V_PERFORMANCE_MEASUREMENT 0 // FIXME: delete performance logging code template (h_aggregate_tmps[i * size_t{4} + 3]); } } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif auto edge_partition_keys = edge_minor_property_output.keys(); @@ -540,6 +543,9 @@ void fill_edge_minor_property(raft::handle_t const& handle, compressed_v_list = std::move(tmps); } } +#if FILL_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); +#endif std::optional> stream_pool_indices{std::nullopt}; size_t num_concurrent_bcasts{1}; From 6265f138ffe4a6a9a7404166af8fd19ac52528de Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 6 Nov 2024 15:17:06 -0800 Subject: [PATCH 123/126] additional performance optimizations --- .../prims/detail/per_v_transform_reduce_e.cuh | 6 +- ...rm_reduce_v_frontier_outgoing_e_by_dst.cuh | 331 ++++++++++++++---- cpp/src/traversal/bfs_impl.cuh | 4 +- 3 files changed, 268 insertions(+), 73 deletions(-) diff --git a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh index 14fd716f4c5..c15ce02a985 100644 --- a/cpp/src/prims/detail/per_v_transform_reduce_e.cuh +++ b/cpp/src/prims/detail/per_v_transform_reduce_e.cuh @@ -1641,16 +1641,16 @@ void per_v_transform_reduce_e(raft::handle_t const& handle, std::is_same_v>) { auto& comm = handle.get_comms(); auto const comm_size = comm.get_size(); + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto 
const minor_comm_size = minor_comm.get_size(); int num_gpus_per_node{}; RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); if (comm_size <= num_gpus_per_node) { - subgroup_size = comm_size; + subgroup_size = minor_comm_size; } else { auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); auto const major_comm_size = major_comm.get_size(); - auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); - auto const minor_comm_size = minor_comm.get_size(); subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm ? std::max(num_gpus_per_node / major_comm_size, int{1}) : std::min(minor_comm_size, num_gpus_per_node); diff --git a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh index ff70ae21951..eaa328a0309 100644 --- a/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh +++ b/cpp/src/prims/transform_reduce_v_frontier_outgoing_e_by_dst.cuh @@ -17,6 +17,7 @@ #include "detail/graph_partition_utils.cuh" #include "prims/detail/extract_transform_v_frontier_e.cuh" +#include "prims/detail/prim_utils.cuh" #include "prims/property_op_utils.cuh" #include "prims/reduce_op.cuh" @@ -145,6 +146,125 @@ struct update_keep_flag_t { } }; +template +std::tuple, optional_dataframe_buffer_type_t> +filter_buffer_elements( + raft::handle_t const& handle, + rmm::device_uvector&& + unique_v_buffer, // assumes that buffer elements are locally reduced first and unique + optional_dataframe_buffer_type_t&& payload_buffer, + raft::device_span vertex_range_offsets, + vertex_t allreduce_count_per_rank, + int subgroup_size) +{ + auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); + + rmm::device_uvector priorities(allreduce_count_per_rank * major_comm_size, + handle.get_stream()); + thrust::fill(handle.get_thrust_policy(), + priorities.begin(), + priorities.end(), + std::numeric_limits::max()); + thrust::for_each( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + [offsets = vertex_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = + thrust::distance(offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + priorities[allreduce_count_per_rank * root + v_offset] = + rank_to_priority( + major_comm_rank, root, subgroup_size, major_comm_size, v_offset); + } + }); + device_allreduce(major_comm, + priorities.data(), + priorities.data(), + priorities.size(), + raft::comms::op_t::MIN, + handle.get_stream()); + if constexpr (std::is_same_v) { + unique_v_buffer.resize( + thrust::distance( + unique_v_buffer.begin(), + thrust::remove_if( + handle.get_thrust_policy(), + unique_v_buffer.begin(), + unique_v_buffer.end(), + unique_v_buffer.begin(), + [offsets = vertex_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < 
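+          // The MIN-allreduce over priorities (below) elects, for each vertex offset in the leading
+          // allreduce_count_per_rank range, a single rank to keep its locally reduced element:
+          // priority 0 goes to the owning (root) rank, [1, subgroup_size) to its subgroup peers, and
+          // the rest to other subgroups. Illustrative numbers (assuming major_comm_size = 4,
+          // subgroup_size = 2, root = 1, offset = 0): rank_to_priority() returns 0 for rank 1, 1 for
+          // rank 0, 2 for rank 3, and 3 for rank 2; priority_to_rank() inverts this after the allreduce.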
allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + } else { + auto kv_pair_first = thrust::make_zip_iterator(unique_v_buffer.begin(), + get_dataframe_buffer_begin(payload_buffer)); + unique_v_buffer.resize( + thrust::distance( + kv_pair_first, + thrust::remove_if( + handle.get_thrust_policy(), + kv_pair_first, + kv_pair_first + unique_v_buffer.size(), + unique_v_buffer.begin(), + [offsets = vertex_range_offsets, + priorities = raft::device_span(priorities.data(), priorities.size()), + allreduce_count_per_rank, + subgroup_size, + major_comm_rank, + major_comm_size] __device__(auto v) { + auto root = thrust::distance( + offsets.begin() + 1, + thrust::upper_bound(thrust::seq, offsets.begin() + 1, offsets.end(), v)); + auto v_offset = v - offsets[root]; + if (v_offset < allreduce_count_per_rank) { + auto selected_rank = priority_to_rank( + priorities[allreduce_count_per_rank * root + v_offset], + root, + subgroup_size, + major_comm_size, + v_offset); + return major_comm_rank != selected_rank; + } else { + return false; + } + })), + handle.get_stream()); + resize_dataframe_buffer(payload_buffer, unique_v_buffer.size(), handle.get_stream()); + } + + return std::make_tuple(std::move(unique_v_buffer), std::move(payload_buffer)); +} + template 1) { - constexpr bool try_compression = (sizeof(vertex_t) == 8) && std::is_same_v; + size_t local_key_buffer_size = size_dataframe_buffer(key_buffer); + auto avg_key_buffer_size = + host_scalar_allreduce( + major_comm, local_key_buffer_size, raft::comms::op_t::SUM, handle.get_stream()) / + major_comm_size; + + rmm::device_uvector d_vertex_range_offsets(vertex_range_offsets.size(), + handle.get_stream()); + raft::update_device(d_vertex_range_offsets.data(), + vertex_range_offsets.data(), + vertex_range_offsets.size(), + handle.get_stream()); + constexpr bool try_compression = (sizeof(vertex_t) == 8) && std::is_same_v; std::conditional_t - max_vertex_partition_size{0}; - std::conditional_t, std::byte /* dummy */> - h_vertex_firsts{}; + max_vertex_partition_size{}; if constexpr (try_compression) { - h_vertex_firsts = std::vector(vertex_range_offsets.begin(), - vertex_range_offsets.begin() + major_comm_size); - } - std::vector h_vertex_lasts(vertex_range_offsets.begin() + 1, - vertex_range_offsets.end()); - for (size_t i = 0; i < h_vertex_lasts.size(); ++i) { - if constexpr (try_compression) { + for (int i = 0; i < major_comm_size; ++i) { max_vertex_partition_size = std::max( vertex_range_offsets[i + 1] - vertex_range_offsets[i], max_vertex_partition_size); } } - std::conditional_t>, - std::byte /* dummy */> - d_vertex_firsts{}; - rmm::device_uvector d_vertex_lasts(h_vertex_lasts.size(), handle.get_stream()); - if constexpr (try_compression) { - if (max_vertex_partition_size <= std::numeric_limits::max()) { - d_vertex_firsts = - rmm::device_uvector(h_vertex_firsts.size(), handle.get_stream()); - raft::update_device((*d_vertex_firsts).data(), - h_vertex_firsts.data(), - h_vertex_firsts.size(), - handle.get_stream()); + if constexpr (std::is_same_v && + std::is_same_v>) { + vertex_t min_vertex_partition_size = std::numeric_limits::max(); + for (int i = 0; i < major_comm_size; ++i) { + min_vertex_partition_size = std::min( + vertex_range_offsets[i + 1] - vertex_range_offsets[i], min_vertex_partition_size); } + + auto 
segment_offsets = graph_view.local_vertex_partition_segment_offsets(); + auto& comm = handle.get_comms(); + auto const comm_size = comm.get_size(); + if (segment_offsets && + (static_cast(avg_key_buffer_size) > + static_cast(graph_view.number_of_vertices() / comm_size) * + double{0.2})) { // duplicates expected for high in-degree vertices (and we assume + // correlation between in-degrees & out-degrees) // FIXME: we need + // a better criterion + size_t key_size{0}; + size_t payload_size{0}; + if constexpr (try_compression) { + if (max_vertex_partition_size <= std::numeric_limits::max()) { + key_size = sizeof(uint32_t); + } else { + key_size = sizeof(key_t); + } + } else { + if constexpr (std::is_arithmetic_v) { + key_size = sizeof(key_t); + } else { + key_size = sum_thrust_tuple_element_sizes(); + } + } + if constexpr (!std::is_same_v) { + if constexpr (std::is_arithmetic_v) { + payload_size = sizeof(payload_t); + } else { + payload_size = sum_thrust_tuple_element_sizes(); + } + } + + int subgroup_size{}; + int num_gpus_per_node{}; + RAFT_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); + if (comm_size <= num_gpus_per_node) { + subgroup_size = major_comm_size; + } else { + auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + subgroup_size = partition_manager::map_major_comm_to_gpu_row_comm + ? std::min(major_comm_size, num_gpus_per_node) + : std::max(num_gpus_per_node / minor_comm_size, int{1}); + } + + auto p2p_size_per_rank = avg_key_buffer_size * (key_size + payload_size); + auto p2p_size_per_node = p2p_size_per_rank * std::min(num_gpus_per_node, comm_size); + auto allreduce_size_per_node = p2p_size_per_node / 16 /* tuning parameter */; + auto allreduce_size_per_rank = + allreduce_size_per_node / (major_comm_size * (num_gpus_per_node / subgroup_size)); +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + std::cerr << "p2p_size_per_rank=" << p2p_size_per_rank + << " p2p_size_per_node=" << p2p_size_per_node + << " allreduce_size_per_node=" << allreduce_size_per_node + << " allreduce_size_per_rank=" << allreduce_size_per_rank << std::endl; +#endif + + if (major_comm_size <= std::numeric_limits::max()) { // priority = uint8_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_range_offsets.data(), + d_vertex_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint8_t)), + min_vertex_partition_size), + subgroup_size); + } else { // priority = uint32_t + std::tie(key_buffer, payload_buffer) = + filter_buffer_elements( + handle, + std::move(key_buffer), + std::move(payload_buffer), + raft::device_span(d_vertex_range_offsets.data(), + d_vertex_range_offsets.size()), + std::min(static_cast(allreduce_size_per_rank / sizeof(uint32_t)), + min_vertex_partition_size), + subgroup_size); + } + } +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + size_after_filter = size_dataframe_buffer(key_buffer); +#endif } - raft::update_device( - d_vertex_lasts.data(), h_vertex_lasts.data(), h_vertex_lasts.size(), handle.get_stream()); - rmm::device_uvector d_tx_buffer_last_boundaries(d_vertex_lasts.size(), - handle.get_stream()); +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + time3 = std::chrono::steady_clock::now(); +#endif + + rmm::device_uvector d_tx_buffer_last_boundaries(major_comm_size, handle.get_stream()); auto key_v_first = 
thrust_tuple_get_or_identity( get_dataframe_buffer_begin(key_buffer)); thrust::lower_bound(handle.get_thrust_policy(), key_v_first, key_v_first + size_dataframe_buffer(key_buffer), - d_vertex_lasts.begin(), - d_vertex_lasts.end(), + d_vertex_range_offsets.begin() + 1, + d_vertex_range_offsets.end(), d_tx_buffer_last_boundaries.begin()); std::conditional_t>, std::byte /* dummy */> compressed_v_buffer{}; if constexpr (try_compression) { - if (d_vertex_firsts) { + if (max_vertex_partition_size <= std::numeric_limits::max()) { compressed_v_buffer = rmm::device_uvector(size_dataframe_buffer(key_buffer), handle.get_stream()); - thrust::transform(handle.get_thrust_policy(), - get_dataframe_buffer_begin(key_buffer), - get_dataframe_buffer_end(key_buffer), - (*compressed_v_buffer).begin(), - cuda::proclaim_return_type( - [firsts = raft::device_span( - (*d_vertex_firsts).data(), (*d_vertex_firsts).size()), - lasts = raft::device_span( - d_vertex_lasts.data(), d_vertex_lasts.size())] __device__(auto v) { - auto major_comm_rank = thrust::distance( - lasts.begin(), - thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), v)); - return static_cast(v - firsts[major_comm_rank]); - })); + thrust::transform( + handle.get_thrust_policy(), + get_dataframe_buffer_begin(key_buffer), + get_dataframe_buffer_end(key_buffer), + (*compressed_v_buffer).begin(), + cuda::proclaim_return_type( + [firsts = raft::device_span(d_vertex_range_offsets.data(), + static_cast(major_comm_size)), + lasts = raft::device_span( + d_vertex_range_offsets.data() + 1, + static_cast(major_comm_size))] __device__(auto v) { + auto major_comm_rank = thrust::distance( + lasts.begin(), thrust::upper_bound(thrust::seq, lasts.begin(), lasts.end(), v)); + return static_cast(v - firsts[major_comm_rank]); + })); resize_dataframe_buffer(key_buffer, 0, handle.get_stream()); shrink_to_fit_dataframe_buffer(key_buffer, handle.get_stream()); } @@ -659,10 +866,6 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, std::adjacent_difference( h_tx_buffer_last_boundaries.begin(), h_tx_buffer_last_boundaries.end(), tx_counts.begin()); -#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT - RAFT_CUDA_TRY(cudaDeviceSynchronize()); - time3 = std::chrono::steady_clock::now(); -#endif size_t min_element_size{cache_line_size}; if constexpr (std::is_same_v) { if constexpr (try_compression) { @@ -693,20 +896,10 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, std::optional, key_t>> invalid_key{std::nullopt}; - size_t local_key_buffer_size{}; - if constexpr (try_compression) { - if (compressed_v_buffer) { - local_key_buffer_size = size_dataframe_buffer(*compressed_v_buffer); - } else { - local_key_buffer_size = size_dataframe_buffer(key_buffer); - } - } else { - local_key_buffer_size = size_dataframe_buffer(key_buffer); - } - auto avg_key_buffer_size = - host_scalar_allreduce( - major_comm, local_key_buffer_size, raft::comms::op_t::SUM, handle.get_stream()) / - major_comm_size; +#if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + time4 = std::chrono::steady_clock::now(); +#endif if (avg_key_buffer_size >= alignment * size_t{128} /* 128 tuning parameter */) { aligned_path = true; // FIXME: delete if constexpr (std::is_same_v) { @@ -824,7 +1017,7 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, } #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); - time4 = std::chrono::steady_clock::now(); + time5 = 
std::chrono::steady_clock::now(); #endif if constexpr (std::is_integral_v) { @@ -888,16 +1081,18 @@ transform_reduce_v_frontier_outgoing_e_by_dst(raft::handle_t const& handle, } #if TRANSFORM_REDUCE_PERFORMANCE_MEASUREMENT RAFT_CUDA_TRY(cudaDeviceSynchronize()); - auto time5 = std::chrono::steady_clock::now(); + auto time6 = std::chrono::steady_clock::now(); auto size_after_greduce = size_dataframe_buffer(key_buffer); std::chrono::duration dur0 = time1 - time0; std::chrono::duration dur1 = time2 - time1; std::chrono::duration dur2 = time3 - time2; std::chrono::duration dur3 = time4 - time3; std::chrono::duration dur4 = time5 - time4; - std::cerr << "\tprim (fill,lreduce,g-prep,g-shuffle,g-s&r) took (" << dur0.count() << "," + std::chrono::duration dur5 = time6 - time5; + std::cerr << "\tprim (fill,lreduce,filter,g-prep,g-shuffle,g-s&r) took (" << dur0.count() << "," << dur1.count() << "," << dur2.count() << "," << dur3.count() << "," << dur4.count() - << ") l_size=(" << size_before_lreduce << "," << size_after_lreduce << ") g_size=(" + << "," << dur5.count() << ") l_size=(" << size_before_lreduce << "," + << size_after_lreduce << ") f_size=" << size_after_filter << " g_size=(" << size_before_greduce << "," << size_after_greduce << ")" << " aligned_path=" << aligned_path << " fill_ratio=" << fill_ratio << std::endl; #endif diff --git a/cpp/src/traversal/bfs_impl.cuh b/cpp/src/traversal/bfs_impl.cuh index e2092a1db46..7f8b3e075df 100644 --- a/cpp/src/traversal/bfs_impl.cuh +++ b/cpp/src/traversal/bfs_impl.cuh @@ -115,7 +115,7 @@ struct bottomup_pred_op_t { namespace detail { #if 1 // FIXME: delete -#define BFS_PERFORMANCE_MEASUREMENT 1 +#define BFS_PERFORMANCE_MEASUREMENT 0 #endif template @@ -246,7 +246,7 @@ void bfs(raft::handle_t const& handle, (graph_view.number_of_vertices() > 0) ? 
((static_cast(graph_view.compute_number_of_edges(handle)) /
            static_cast(graph_view.number_of_vertices())) *
-          (1.0 / 4.25) /* tuning parameter */)
+          (1.0 / 3.75) /* tuning parameter */)
       : double{1.0};
 
   constexpr vertex_t direction_optimizing_beta = 24;  // tuning parameter

From 6961fcdfb35ddedeeaf065ca1e1902adcd25e055 Mon Sep 17 00:00:00 2001
From: Seunghwa Kang 
Date: Wed, 6 Nov 2024 15:19:29 -0800
Subject: [PATCH 124/126] Graph 500 benchmark specific parameter tuning

---
 cpp/include/cugraph/partition_manager.hpp |  6 +++++
 cpp/tests/utilities/base_fixture.hpp      | 29 +++++++++++++++++------
 cpp/tests/utilities/mg_utilities.cpp      | 18 ++++++++++----
 3 files changed, 42 insertions(+), 11 deletions(-)

diff --git a/cpp/include/cugraph/partition_manager.hpp b/cpp/include/cugraph/partition_manager.hpp
index 13ab2980737..377fd0a4de9 100644
--- a/cpp/include/cugraph/partition_manager.hpp
+++ b/cpp/include/cugraph/partition_manager.hpp
@@ -166,20 +166,26 @@ class partition_manager {
 #if 1  // FIXME: a trick to use InfiniBand SHARP in a sub-communicator (currently, a GPU can
        // participate in only one SHARP accelerated communicator)
     comm.barrier();  // to enforce initialization in comm
+    std::cerr << "start initializing node_comm" << std::endl;
+    std::cerr << "start initializing major_comm" << std::endl;
     handle.set_subcomm("gpu_row_comm",
                        std::make_shared(comm.comm_split(row_idx, col_idx)));
     auto& major_comm = handle.get_subcomm(cugraph::partition_manager::major_comm_name());
     major_comm.barrier();  /// to enforce initialization in major_comm
+    std::cerr << "major_comm initialized" << std::endl;
+#if 1  // for EOS
     auto ret = setenv("NCCL_COLLNET_ENABLE", "1", 1);
     if (ret != 0)
       std::cerr << "setenv(\"NCCL_COLLNET_ENABLE\", \"1\", 1) returned " << ret << std::endl;
     ret = setenv("NCCL_SHARP_DISABLE", "0", 1);
     if (ret != 0)
       std::cerr << "setenv(\"NCCL_SHARP_DISABLE\", \"0\", 1) returned " << ret << std::endl;
+#endif
     handle.set_subcomm("gpu_col_comm",
                        std::make_shared(comm.comm_split(col_idx, row_idx)));
     auto& minor_comm = handle.get_subcomm(cugraph::partition_manager::minor_comm_name());
     minor_comm.barrier();  /// to enforce initialization in minor_comm
+    std::cerr << "minor_comm initialized" << std::endl;
 #else
     handle.set_subcomm("gpu_row_comm",
                        std::make_shared(comm.comm_split(row_idx, col_idx)));
diff --git a/cpp/tests/utilities/base_fixture.hpp b/cpp/tests/utilities/base_fixture.hpp
index dade2af57c0..7a76739f32f 100644
--- a/cpp/tests/utilities/base_fixture.hpp
+++ b/cpp/tests/utilities/base_fixture.hpp
@@ -70,16 +70,28 @@ inline auto make_managed() { return std::make_shared(total * 0.93)), rmm::CUDA_ALLOCATION_ALIGNMENT)
+    use_max ? 
rmm::align_down(std::min(free, static_cast(total * init_alloc_ratio)), rmm::CUDA_ALLOCATION_ALIGNMENT) : rmm::align_down(std::min(free, total / 10), rmm::CUDA_ALLOCATION_ALIGNMENT); std::optional max_alloc{}; if (use_max) { @@ -114,12 +126,12 @@ inline auto make_binning() * @return Memory resource instance */ inline std::shared_ptr create_memory_resource( - std::string const& allocation_mode) + std::string const& allocation_mode, int comm_size) { if (allocation_mode == "binning") return make_binning(); if (allocation_mode == "cuda") return make_cuda(); if (allocation_mode == "pool") return make_pool(); - if (allocation_mode == "maxpool") return make_pool(true); + if (allocation_mode == "maxpool") return make_pool(true, comm_size); if (allocation_mode == "managed") return make_managed(); CUGRAPH_FAIL("Invalid RMM allocation mode"); } @@ -216,7 +228,7 @@ inline auto parse_test_options(int argc, char** argv) ::testing::InitGoogleTest(&argc, argv); \ auto const cmd_opts = parse_test_options(argc, argv); \ auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cugraph::test::create_memory_resource(rmm_mode); \ + auto resource = cugraph::test::create_memory_resource(rmm_mode, 1); \ rmm::mr::set_current_device_resource(resource.get()); \ cugraph::test::g_perf = cmd_opts["perf"].as(); \ cugraph::test::g_rmat_scale = \ @@ -238,6 +250,9 @@ inline auto parse_test_options(int argc, char** argv) #define CUGRAPH_MG_TEST_PROGRAM_MAIN() \ int main(int argc, char** argv) \ { \ + if (setenv("CUDA_DEVICE_MAX_CONNECTIONS", "18", 1) != 0) { \ + std::cerr << "setenv() returned ret" << std::endl; \ + } \ cugraph::test::initialize_mpi(argc, argv); \ auto comm_rank = cugraph::test::query_mpi_comm_world_rank(); \ auto comm_size = cugraph::test::query_mpi_comm_world_size(); \ @@ -247,7 +262,7 @@ inline auto parse_test_options(int argc, char** argv) ::testing::InitGoogleTest(&argc, argv); \ auto const cmd_opts = parse_test_options(argc, argv); \ auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cugraph::test::create_memory_resource(rmm_mode); \ + auto resource = cugraph::test::create_memory_resource(rmm_mode, comm_size); \ rmm::mr::set_current_device_resource(resource.get()); \ cugraph::test::g_perf = cmd_opts["perf"].as(); \ cugraph::test::g_rmat_scale = \ diff --git a/cpp/tests/utilities/mg_utilities.cpp b/cpp/tests/utilities/mg_utilities.cpp index d22a9956f17..ee2a4740b97 100644 --- a/cpp/tests/utilities/mg_utilities.cpp +++ b/cpp/tests/utilities/mg_utilities.cpp @@ -51,16 +51,26 @@ std::unique_ptr initialize_mg_handle(size_t pool_size) handle = std::make_unique(rmm::cuda_stream_per_thread, std::make_shared(pool_size)); - raft::comms::initialize_mpi_comms(handle.get(), MPI_COMM_WORLD); - auto& comm = handle->get_comms(); - auto const comm_size = comm.get_size(); + auto comm_rank = query_mpi_comm_world_rank(); + auto comm_size = query_mpi_comm_world_size(); + ncclUniqueId id{}; + if (comm_rank == 0) { + RAFT_NCCL_TRY(ncclGetUniqueId(&id)); + } + RAFT_MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); + ncclComm_t nccl_comm{}; + ncclConfig_t nccl_config = NCCL_CONFIG_INITIALIZER; + nccl_config.splitShare = 1; + RAFT_NCCL_TRY(ncclCommInitRankConfig(&nccl_comm, comm_size, id, comm_rank, &nccl_config)); + + raft::comms::initialize_mpi_comms(handle.get(), MPI_COMM_WORLD, nccl_comm); auto gpu_row_comm_size = static_cast(sqrt(static_cast(comm_size))); while (comm_size % gpu_row_comm_size != 0) { --gpu_row_comm_size; } - 
cugraph::partition_manager::init_subcomm(*handle, std::max(comm_size / 8, 1)); + cugraph::partition_manager::init_subcomm(*handle, std::max(comm_size / 16, 1)); return std::move(handle); } From f57003479af73c70ccd27c84ba84fbb14a130b2f Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 6 Nov 2024 15:21:07 -0800 Subject: [PATCH 125/126] misc updates --- cpp/src/prims/detail/prim_utils.cuh | 104 ++++++++++++++++++++++++++++ cpp/tests/c_api/mg_test_utils.cpp | 9 ++- cpp/tests/c_api/mg_test_utils.h | 12 ++++ 3 files changed, 124 insertions(+), 1 deletion(-) create mode 100644 cpp/src/prims/detail/prim_utils.cuh diff --git a/cpp/src/prims/detail/prim_utils.cuh b/cpp/src/prims/detail/prim_utils.cuh new file mode 100644 index 00000000000..3d8f5626042 --- /dev/null +++ b/cpp/src/prims/detail/prim_utils.cuh @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cugraph { + +namespace detail { + +template +__host__ __device__ priority_t +rank_to_priority(int rank, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffic */) +{ + static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || sizeof(priority_t) == 4); + using cast_t = std::conditional_t< + sizeof(priority_t) == 1, + int16_t, + std::conditional_t>; // to prevent overflow + + if (rank == root) { + return priority_t{0}; + } else if (rank / subgroup_size == + root / subgroup_size) { // intra-subgroup communication is sufficient (priorities in + // [1, subgroup_size) + auto rank_dist = + static_cast(((static_cast(rank) + subgroup_size) - root) % subgroup_size); + int modulo = subgroup_size - 1; + return static_cast(1 + (static_cast(rank_dist - 1) + (offset % modulo)) % + modulo); + } else { // inter-subgroup communication is necessary (priorities in [subgroup_size, comm_size) + auto subgroup_dist = + static_cast(((static_cast(rank / subgroup_size) + (comm_size / subgroup_size)) - + (root / subgroup_size)) % + (comm_size / subgroup_size)); + auto intra_subgroup_rank_dist = static_cast( + ((static_cast(rank % subgroup_size) + subgroup_size) - (root % subgroup_size)) % + subgroup_size); + auto rank_dist = subgroup_dist * subgroup_size + intra_subgroup_rank_dist; + int modulo = comm_size - subgroup_size; + return static_cast( + subgroup_size + + (static_cast(rank_dist - subgroup_size) + (offset % modulo)) % modulo); + } +} + +template +__host__ __device__ int priority_to_rank( + priority_t priority, + int root, + int subgroup_size /* faster interconnect within a subgroup */, + int comm_size, + vertex_t offset /* to evenly distribute traffict */) +{ + static_assert(sizeof(priority_t) == 1 || sizeof(priority_t) == 2 || sizeof(priority_t) == 4); + using cast_t = std::conditional_t< + sizeof(priority_t) == 1, + int16_t, + std::conditional_t>; // to prevent overflow + + if (priority == priority_t{0}) { + return root; + } else if 
(priority < static_cast(subgroup_size)) { + int modulo = subgroup_size - 1; + auto rank_dist = static_cast( + 1 + ((static_cast(priority - 1) + modulo) - (offset % modulo)) % modulo); + return static_cast((root - (root % subgroup_size)) + + ((static_cast(root) + rank_dist) % subgroup_size)); + } else { + int modulo = comm_size - subgroup_size; + auto rank_dist = static_cast( + subgroup_size + + ((static_cast(priority) - subgroup_size) + (modulo - (offset % modulo))) % modulo); + auto subgroup_dist = rank_dist / subgroup_size; + auto intra_subgroup_rank_dist = rank_dist % subgroup_size; + return static_cast( + ((static_cast((root / subgroup_size) * subgroup_size) + + subgroup_dist * subgroup_size) + + (static_cast(root) + intra_subgroup_rank_dist) % subgroup_size) % + comm_size); + } +} + +} // namespace detail + +} // namespace cugraph diff --git a/cpp/tests/c_api/mg_test_utils.cpp b/cpp/tests/c_api/mg_test_utils.cpp index 58c5e59c16f..18807b00a6b 100644 --- a/cpp/tests/c_api/mg_test_utils.cpp +++ b/cpp/tests/c_api/mg_test_utils.cpp @@ -95,9 +95,16 @@ extern "C" void* create_mg_raft_handle(int argc, char** argv) C_MPI_TRY(MPI_Comm_size(MPI_COMM_WORLD, &comm_size)); C_CUDA_TRY(cudaGetDeviceCount(&num_gpus_per_node)); C_CUDA_TRY(cudaSetDevice(comm_rank % num_gpus_per_node)); + ncclUniqueId id{}; + if (comm_rank == 0) { + C_NCCL_TRY(ncclGetUniqueId(&id)); + } + C_MPI_TRY(MPI_Bcast((void*)&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD)); + ncclComm_t nccl_comm{}; + C_NCCL_TRY(ncclCommInitRank(&nccl_comm, comm_size, id, comm_rank)); raft::handle_t* handle = new raft::handle_t{}; - raft::comms::initialize_mpi_comms(handle, MPI_COMM_WORLD); + raft::comms::initialize_mpi_comms(handle, MPI_COMM_WORLD, nccl_comm); #if 1 int gpu_row_comm_size = 1; diff --git a/cpp/tests/c_api/mg_test_utils.h b/cpp/tests/c_api/mg_test_utils.h index 7461d402b5b..a79c74675d2 100644 --- a/cpp/tests/c_api/mg_test_utils.h +++ b/cpp/tests/c_api/mg_test_utils.h @@ -36,6 +36,18 @@ } \ } while (0) +#define C_NCCL_TRY(call) \ + do { \ + ncclResult_t status = call; \ + if (ncclSuccess != status) { \ + printf("NCCL call='%s' at file=%s line=%d failed.", \ + #call, \ + __FILE__, \ + __LINE__); \ + exit(1); \ + } \ + } while (0) + #define C_CUDA_TRY(call) \ do { \ cudaError_t const status = call; \ From c22f95c244ccef1d250e4f9c966fbfeba6110443 Mon Sep 17 00:00:00 2001 From: Seunghwa Kang Date: Wed, 6 Nov 2024 15:26:20 -0800 Subject: [PATCH 126/126] Graph 500 benchmark driver --- cpp/tests/traversal/mg_graph500_bfs_test.cu | 916 ++++++++++++++++++++ 1 file changed, 916 insertions(+) create mode 100644 cpp/tests/traversal/mg_graph500_bfs_test.cu diff --git a/cpp/tests/traversal/mg_graph500_bfs_test.cu b/cpp/tests/traversal/mg_graph500_bfs_test.cu new file mode 100644 index 00000000000..21205c5ad64 --- /dev/null +++ b/cpp/tests/traversal/mg_graph500_bfs_test.cu @@ -0,0 +1,916 @@ +/* + * Copyright (c) 2021-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "detail/graph_partition_utils.cuh" +#include "prims/count_if_e.cuh" +#include "prims/extract_transform_e.cuh" +#include "prims/fill_edge_src_dst_property.cuh" +#include "prims/kv_store.cuh" +#include "prims/update_edge_src_dst_property.cuh" +#include "utilities/base_fixture.hpp" +#include "utilities/collect_comm.cuh" +#include "utilities/conversion_utilities.hpp" +#include "utilities/device_comm_wrapper.hpp" +#include "utilities/mg_utilities.hpp" +#include "utilities/property_generator_utilities.hpp" +#include "utilities/test_graphs.hpp" +#include "utilities/thrust_wrapper.hpp" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include +#include +#include +#include +#include +#include + +#include + +#include + +// FIXME: replace std::cerr with std::cout + +struct Graph500_BFS_Usecase { + bool unrenumber_predecessors{true}; + bool validate{true}; +}; + +template +class Tests_GRAPH500_MGBFS + : public ::testing::TestWithParam> { + public: + Tests_GRAPH500_MGBFS() {} + + static void SetUpTestCase() + { +#if 1 + auto ret = setenv("NCCL_DEBUG", "WARN", 1); + if (ret != 0) std::cerr << "setenv(\"NCCL_DEBUG\", \"TRACE\", 1) returned " << ret << std::endl; +#endif +#if 0 // workstation + // nothing +#else +#if 0 // for CW + ret = setenv("NCCL_NET", "IB", 1); + if (ret != 0) std::cerr << "setenv(\"NCCL_NET\", \"IB\", 1) returned " << ret << std::endl; + ret = setenv("NCCL_SOCKET_IFNAME", "enp90s0f0np0", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_SOCKET_IFNAME\", \"enp90s0f0np0\", 1) returned " << ret + << std::endl; +#else // for EOS + ret = setenv("NCCL_COLLNET_ENABLE", "0", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_COLLNET_ENABLE\", \"0\", 1) returned " << ret << std::endl; + ret = setenv("NCCL_SHARP_DISABLE", "1", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_SHARP_DISABLE\", \"1\", 1) returned " << ret << std::endl; + ret = setenv("NCCL_SHARP_GROUP_SIZE_THRESH", "8", 1); + if (ret != 0) + std::cerr << "setenv(\"NCCL_SHARP_GROUP_SIZE_THRESH\", \"8\", 1) returned " << ret + << std::endl; +#endif +#endif + size_t pool_size = + 16; // note that CUDA_DEVICE_MAX_CONNECTIONS (default: 8) should be set to a value larger + // than pool_size to avoid false dependency among different streams + handle_ = cugraph::test::initialize_mg_handle(pool_size); + } + + static void TearDownTestCase() { handle_.reset(); } + + virtual void SetUp() {} + virtual void TearDown() {} + + template + void run_current_test(Graph500_BFS_Usecase const& bfs_usecase, + input_usecase_t const& input_usecase) + { + using weight_t = float; + using edge_type_t = int32_t; // dummy + + bool constexpr store_transposed = false; + bool constexpr multi_gpu = true; + bool constexpr renumber = true; + bool constexpr test_weighted = false; + bool constexpr shuffle = false; // Graph 500 requirement (edges can't be pre-shuffled, edges + // should be shuffled in Kernel 1) + size_t constexpr num_warmup_starting_vertices = + 1; // to enforce all CUDA & NCCL initializations + size_t constexpr num_timed_starting_vertices = 64; // Graph 500 requirement (64) + + HighResTimer hr_timer{}; + + auto& comm = handle_->get_comms(); + auto const comm_rank = comm.get_rank(); + auto const comm_size = comm.get_size(); + auto& major_comm = handle_->get_subcomm(cugraph::partition_manager::major_comm_name()); + auto const major_comm_rank = major_comm.get_rank(); + auto const major_comm_size = major_comm.get_size(); 
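+    // major_comm/minor_comm are the two sub-communicators ("gpu_row_comm"/"gpu_col_comm") that
+    // partition_manager::init_subcomm() splits the global communicator into for cuGraph's 2D
+    // graph partitioning (see the partition_manager.hpp change above)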
+ auto& minor_comm = handle_->get_subcomm(cugraph::partition_manager::minor_comm_name()); + auto const minor_comm_size = minor_comm.get_size(); + + std::cerr << "comm_size=" << comm_size << " major_comm_size=" << major_comm_size + << " minor_comm_size=" << minor_comm_size << std::endl; + + constexpr auto invalid_distance = std::numeric_limits::max(); + constexpr auto invalid_vertex = cugraph::invalid_vertex_id::value; + + // 1. force NCCL P2P initialization + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("NCCL P2P buffer initialization"); + } + + cugraph::test::enforce_p2p_initialization(comm, handle_->get_stream()); + cugraph::test::enforce_p2p_initialization(major_comm, handle_->get_stream()); + cugraph::test::enforce_p2p_initialization(minor_comm, handle_->get_stream()); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + // 2. create an edge list + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("MG Construct edge list"); + } + + std::vector> src_chunks{}; + std::vector> dst_chunks{}; + std::tie(src_chunks, dst_chunks, std::ignore, std::ignore, std::ignore) = + input_usecase.template construct_edgelist( + *handle_, test_weighted, store_transposed, multi_gpu, shuffle); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + // 3. create an MG graph + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("MG Construct graph (Kernel 1)"); + } + + for (size_t i = 0; i < src_chunks.size(); ++i) { // shuffle edges +#if 1 // FIXME: delete + std::cerr << "i=" << i << " start shuffling external edges sizes=(" << src_chunks[i].size() + << "," << dst_chunks[i].size() << ")" << std::endl; + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto start = std::chrono::steady_clock::now(); +#endif + std::tie(src_chunks[i], dst_chunks[i], std::ignore, std::ignore, std::ignore, std::ignore) = + cugraph::shuffle_external_edges( + *handle_, + std::move(src_chunks[i]), + std::move(dst_chunks[i]), + std::nullopt, + std::nullopt, + std::nullopt); +#if 1 // FIXME: delete + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + auto end = std::chrono::steady_clock::now(); + std::chrono::duration dur = end - start; + std::cerr << "i=" << i << " shuffle_external_edges took " << dur.count() << " s." 
+ << std::endl; +#endif + } + + cugraph::graph_t mg_graph(*handle_); + std::optional> mg_renumber_map{std::nullopt}; + std::tie(mg_graph, std::ignore, std::ignore, std::ignore, mg_renumber_map) = + cugraph::create_graph_from_edgelist( + *handle_, + std::nullopt, + std::move(src_chunks), + std::move(dst_chunks), + std::nullopt, + std::nullopt, + std::nullopt, + cugraph::graph_properties_t{input_usecase.undirected() /* symmetric */, + true /* multi-graph */}, + renumber); + + auto mg_graph_view = mg_graph.view(); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + auto num_vertices = mg_graph_view.number_of_vertices(); + { + auto num_self_loops = mg_graph_view.count_self_loops(*handle_); + auto number_of_edges = mg_graph_view.compute_number_of_edges(*handle_); + if (mg_graph_view.is_symmetric()) { + std::cerr << "V=" << mg_graph_view.number_of_vertices() << " E=" << number_of_edges + << " num_self_loops=" << num_self_loops + << " undirected E=" << ((number_of_edges - num_self_loops) / 2 + num_self_loops) + << std::endl; + } + } + + // 4. randomly select starting vertices + + rmm::device_uvector d_starting_vertices(0, handle_->get_stream()); + { + raft::random::RngState rng_state(comm_size + comm_rank /* seed */); + auto tot_vertices = num_warmup_starting_vertices + num_timed_starting_vertices; + auto out_degrees = mg_graph_view.compute_out_degrees(*handle_); + + size_t num_generated{0}; + while (num_generated < tot_vertices) { + auto candidates = + cugraph::select_random_vertices( + *handle_, + mg_graph_view, + std::nullopt, + rng_state, + tot_vertices - num_generated, + true /* with_replacement */, + false /* sort_vertices */); + candidates.resize( + thrust::distance( + candidates.begin(), + thrust::remove_if(handle_->get_thrust_policy(), + candidates.begin(), + candidates.end(), + [v_first = mg_graph_view.local_vertex_partition_range_first(), + out_degrees = raft::device_span( + out_degrees.data(), out_degrees.size())] __device__(auto v) { + auto out_degree = out_degrees[v - v_first]; + return out_degree == 0; // remove isolated vertices + })), + handle_->get_stream()); + auto num_valids = cugraph::host_scalar_allreduce( + comm, candidates.size(), raft::comms::op_t::SUM, handle_->get_stream()); + num_generated += num_valids; + auto old_size = d_starting_vertices.size(); + d_starting_vertices.resize(old_size + candidates.size(), handle_->get_stream()); + thrust::copy(handle_->get_thrust_policy(), + candidates.begin(), + candidates.end(), + d_starting_vertices.begin() + old_size); + } +#if 1 // FIXME: delete + raft::print_device_vector( + "d_starting_vertices", d_starting_vertices.data(), d_starting_vertices.size(), std::cerr); + rmm::device_uvector d_starting_vertex_out_degrees(d_starting_vertices.size(), + handle_->get_stream()); + auto map_first = thrust::make_transform_iterator( + d_starting_vertices.begin(), + cugraph::detail::shift_left_t{mg_graph_view.local_vertex_partition_range_first()}); + thrust::gather(handle_->get_thrust_policy(), + map_first, + map_first + d_starting_vertex_out_degrees.size(), + out_degrees.begin(), + d_starting_vertex_out_degrees.begin()); + raft::print_device_vector( + "d_starting_vertex_out_degrees", d_starting_vertex_out_degrees.data(), d_starting_vertex_out_degrees.size(), std::cerr); +#endif + } + auto starting_vertex_counts = + cugraph::host_scalar_allgather(comm, d_starting_vertices.size(), 
handle_->get_stream()); + auto starting_vertex_offsets = std::vector(starting_vertex_counts.size() + 1); + starting_vertex_offsets[0] = 0; + std::inclusive_scan(starting_vertex_counts.begin(), + starting_vertex_counts.end(), + starting_vertex_offsets.begin() + 1); + + // 5. run MG BFS + + // FIXME: Graph500 doesn't require computing distances. + rmm::device_uvector d_mg_distances(mg_graph_view.local_vertex_partition_range_size(), + handle_->get_stream()); + rmm::device_uvector d_mg_predecessors( + mg_graph_view.local_vertex_partition_range_size(), handle_->get_stream()); + + double total_elapsed{0.0}; + for (size_t i = 0; i < (num_warmup_starting_vertices + num_timed_starting_vertices); ++i) { + auto starting_vertex_comm_rank = static_cast(std::distance( + starting_vertex_offsets.begin() + 1, + std::upper_bound(starting_vertex_offsets.begin() + 1, starting_vertex_offsets.end(), i))); + raft::device_span d_starting_vertex(static_cast(nullptr), + size_t{0}); + if (comm_rank == starting_vertex_comm_rank) { + d_starting_vertex = raft::device_span( + d_starting_vertices.data() + (i - starting_vertex_offsets[comm_rank]), 1); + } + std::cerr << "start running BFS i=" << i << std::endl; + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("MG BFS (Kernel 2)"); + } + + cugraph::bfs(*handle_, + mg_graph_view, + d_mg_distances.data(), + d_mg_predecessors.data(), + d_starting_vertex.data(), + d_starting_vertex.size(), + mg_graph_view.is_symmetric() ? true : false, + std::numeric_limits::max()); + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + auto elapsed = hr_timer.stop(); + if (i >= num_warmup_starting_vertices) { total_elapsed += elapsed; } + hr_timer.display_and_clear(std::cerr); + } +#if 1 + { + size_t free{}; + size_t total{}; + RAFT_CUDA_TRY(cudaMemGetInfo(&free, &total)); + std::cerr << "After BFS CUDA memory free=" << (free / (1024.0 * 1024.0 * 1024.0)) + << " total=" << (total / (1024.0 * 1024.0 * 1024.0)) << std::endl; + } +#endif + + /* compute the number of visisted edges */ + + { + rmm::device_uvector flags(mg_graph_view.local_vertex_partition_range_size(), + handle_->get_stream()); + thrust::transform(handle_->get_thrust_policy(), + d_mg_distances.begin(), + d_mg_distances.end(), + flags.begin(), + cuda::proclaim_return_type([invalid_distance] __device__(auto d) { + return d != invalid_distance; + })); + cugraph::edge_src_property_t edge_src_flags(*handle_, + mg_graph_view); + cugraph::update_edge_src_property( + *handle_, mg_graph_view, flags.begin(), edge_src_flags.mutable_view()); + auto m = cugraph::count_if_e( + *handle_, + mg_graph_view, + edge_src_flags.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + [] __device__(auto, auto, auto src_flag, auto, auto) { return src_flag; }) / + edge_t{2}; + std::cerr << "# visited undirected edges=" << m << std::endl; + } + + if (bfs_usecase.validate) { + /* check starting vertex's predecessor */ + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (starting vertex's predecessor)"); + } + + { + size_t num_invalids{0}; + if (d_starting_vertex.size() > 0) { + assert(d_starting_vertex.size() == 1); + num_invalids = thrust::count_if( + handle_->get_thrust_policy(), + d_starting_vertex.begin(), + 
d_starting_vertex.end(), + [v_first = mg_graph_view.local_vertex_partition_range_first(), + predecessors = raft::device_span( + d_mg_predecessors.data(), d_mg_predecessors.size())] __device__(auto v) { + return predecessors[v - v_first] != invalid_vertex; + }); + } + num_invalids = cugraph::host_scalar_allreduce( + comm, num_invalids, raft::comms::op_t::SUM, handle_->get_stream()); + ASSERT_EQ(num_invalids, 0) + << "predecessor of a starting vertex should be invalid_vertex"; // Graph 500 requires + // the predecessor of a + // starting vertex to + // be itself (cuGraph + // API specifies that + // the predecessor of a + // starting vertex is + // an invalid vertex, + // but this really + // doesn't impact + // perforamnce) + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + /* check for cycles (update predecessor to predecessor's predecessor till reaching the + * starting vertex, if there exists a cycle, this won't finish) */ + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (cycle)"); + } + + { + vertex_t h_starting_vertex{}; + if (comm_rank == starting_vertex_comm_rank) { + raft::update_host( + &h_starting_vertex, d_starting_vertex.data(), 1, handle_->get_stream()); + handle_->sync_stream(); + } + h_starting_vertex = cugraph::host_scalar_bcast( + comm, h_starting_vertex, starting_vertex_comm_rank, handle_->get_stream()); + + rmm::device_uvector ancestors(d_mg_predecessors.size(), handle_->get_stream()); + ancestors.resize( + thrust::distance( + ancestors.begin(), + thrust::copy_if(handle_->get_thrust_policy(), + d_mg_predecessors.begin(), + d_mg_predecessors.end(), + ancestors.begin(), + cugraph::detail::is_not_equal_t{invalid_vertex})), + handle_->get_stream()); + + cugraph::kv_store_t kv_store( + thrust::make_counting_iterator(mg_graph_view.local_vertex_partition_range_first()), + thrust::make_counting_iterator(mg_graph_view.local_vertex_partition_range_last()), + d_mg_predecessors.begin(), + invalid_vertex, + true /* key_sorted */, + handle_->get_stream()); + auto kv_store_view = kv_store.view(); + auto h_vertex_partition_range_lasts = mg_graph_view.vertex_partition_range_lasts(); + auto d_vertex_partition_range_lasts = + cugraph::test::to_device(*handle_, h_vertex_partition_range_lasts); + size_t level{0}; + auto aggregate_size = cugraph::host_scalar_allreduce( + comm, ancestors.size(), raft::comms::op_t::SUM, handle_->get_stream()); + while (aggregate_size > 0) { + ASSERT_TRUE(level < mg_graph_view.number_of_vertices() - 1) + << "BFS predecessor tree has a cycle."; + ancestors.resize( + thrust::distance( + ancestors.begin(), + thrust::remove_if(handle_->get_thrust_policy(), + ancestors.begin(), + ancestors.end(), + cugraph::detail::is_equal_t{h_starting_vertex})), + handle_->get_stream()); + ancestors = cugraph::collect_values_for_keys( + comm, + kv_store_view, + ancestors.begin(), + ancestors.end(), + cugraph::detail::compute_gpu_id_from_int_vertex_t{ + raft::device_span(d_vertex_partition_range_lasts.data(), + d_vertex_partition_range_lasts.size()), + major_comm_size, + minor_comm_size}, + handle_->get_stream()); + aggregate_size = cugraph::host_scalar_allreduce( + comm, ancestors.size(), raft::comms::op_t::SUM, handle_->get_stream()); + ++level; + } + } + + if (cugraph::test::g_perf) { + 
RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + /* check that distance(src) = distance(predecssor(v)) + 1 */ + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (predecessor tree distances)"); + } + + { + rmm::device_uvector tree_srcs(mg_graph_view.local_vertex_partition_range_size(), + handle_->get_stream()); + tree_srcs.resize( + thrust::distance( + tree_srcs.begin(), + thrust::copy_if(handle_->get_thrust_policy(), + d_mg_predecessors.begin(), + d_mg_predecessors.end(), + tree_srcs.begin(), + cugraph::detail::is_not_equal_t{invalid_vertex})), + handle_->get_stream()); + + auto tree_src_dists = cugraph::collect_values_for_int_vertices( + comm, + tree_srcs.begin(), + tree_srcs.end(), + d_mg_distances.begin(), + mg_graph_view.vertex_partition_range_lasts(), + mg_graph_view.local_vertex_partition_range_first(), + handle_->get_stream()); + + rmm::device_uvector tree_dst_dists(tree_src_dists.size(), + handle_->get_stream()); + thrust::copy_if(handle_->get_thrust_policy(), + d_mg_distances.begin(), + d_mg_distances.end(), + d_mg_predecessors.begin(), + tree_dst_dists.begin(), + cugraph::detail::is_not_equal_t{invalid_vertex}); + + auto input_pair_first = + thrust::make_zip_iterator(tree_src_dists.begin(), tree_dst_dists.begin()); + auto num_invalids = thrust::count_if(handle_->get_thrust_policy(), + input_pair_first, + input_pair_first + tree_src_dists.size(), + [] __device__(auto pair) { + auto src_dist = thrust::get<0>(pair); + auto dst_dist = thrust::get<1>(pair); + return (src_dist + 1) != dst_dist; + }); + num_invalids = cugraph::host_scalar_allreduce( + comm, num_invalids, raft::comms::op_t::SUM, handle_->get_stream()); + + ASSERT_EQ(num_invalids, 0) + << " source and destination vertices in the BFS predecessor tree are not one hop away."; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (graph distances & connected components)"); + } + + /* check distances and connect component coverage in the input graph */ + + { + constexpr size_t num_rounds = 24; // to cut peak memory usage + + rmm::device_uvector d_mg_typecasted_distances(d_mg_distances.size(), + handle_->get_stream()); + auto max_distance = thrust::transform_reduce( + handle_->get_thrust_policy(), + d_mg_distances.begin(), + d_mg_distances.end(), + cuda::proclaim_return_type([invalid_distance] __device__(auto d) { + return d == invalid_distance ? 
vertex_t{0} : d; + }), + vertex_t{0}, + thrust::maximum{}); + max_distance = cugraph::host_scalar_allreduce( + comm, max_distance, raft::comms::op_t::MAX, handle_->get_stream()); + ASSERT_TRUE(max_distance <= std::numeric_limits::max()) + << "the input graph diameter exceeds std::numeric_limits::max(), so we " + "can't use uint8_t to store distances in validation."; + thrust::transform(handle_->get_thrust_policy(), + d_mg_distances.begin(), + d_mg_distances.end(), + d_mg_typecasted_distances.begin(), + cugraph::detail::typecast_t{}); + cugraph::edge_src_property_t edge_src_dist( + *handle_, mg_graph_view); + cugraph::update_edge_src_property(*handle_, + mg_graph_view, + d_mg_typecasted_distances.begin(), + edge_src_dist.mutable_view()); + + size_t num_invalids{0}; + for (size_t r = 0; r < num_rounds; ++r) { + auto dst_first = mg_graph_view.local_edge_partition_dst_range_first(); + auto dst_range_size = mg_graph_view.local_edge_partition_dst_range_size(); + auto num_this_round_dsts = + dst_range_size / num_rounds + + (r < (dst_range_size % num_rounds) ? vertex_t{1} : vertex_t{0}); + rmm::device_uvector this_round_dsts(num_this_round_dsts, + handle_->get_stream()); + thrust::tabulate(handle_->get_thrust_policy(), + this_round_dsts.begin(), + this_round_dsts.end(), + [dst_first, r, num_rounds] __device__(size_t i) { + return dst_first + static_cast(r + i * num_rounds); + }); + + auto this_round_dst_dists = cugraph::collect_values_for_sorted_unique_int_vertices( + comm, + raft::device_span(this_round_dsts.data(), this_round_dsts.size()), + d_mg_typecasted_distances.begin(), + mg_graph_view.vertex_partition_range_lasts(), + mg_graph_view.local_vertex_partition_range_first(), + handle_->get_stream()); + + num_invalids += cugraph::count_if_e( + *handle_, + mg_graph_view, + edge_src_dist.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + [invalid_distance, + num_rounds, + r, + dst_first, + this_round_dst_dists = raft::device_span( + this_round_dst_dists.data(), + this_round_dst_dists + .size())] __device__(auto src, auto dst, auto src_dist, auto, auto) { + auto dst_offset = dst - dst_first; + if ((dst_offset % num_rounds) == r) { + auto dst_dist = this_round_dst_dists[dst_offset / num_rounds]; + if (src_dist != invalid_distance) { + return (dst_dist == invalid_distance) || + (((src_dist >= dst_dist) ? 
(src_dist - dst_dist) + : (dst_dist - src_dist)) > 1); + } else { + return (dst_dist != invalid_distance); + } + } else { + return false; + } + }); + } + + num_invalids = cugraph::host_scalar_allreduce( + comm, num_invalids, raft::comms::op_t::SUM, handle_->get_stream()); + + ASSERT_EQ(num_invalids, 0) + << "only one of the two connected vertices is reachable from the starting vertex or " + "the distances from the starting vertex differ by more than one."; + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + + /* check that predecessor->v edges exist in the input graph */ + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.start("validate (predecessor->v edge existence)"); + } + + { + rmm::device_uvector query_srcs(d_mg_predecessors.size(), handle_->get_stream()); + rmm::device_uvector query_dsts(query_srcs.size(), handle_->get_stream()); + auto input_edge_first = thrust::make_zip_iterator( + d_mg_predecessors.begin(), + thrust::make_counting_iterator(mg_graph_view.local_vertex_partition_range_first())); + auto output_edge_first = + thrust::make_zip_iterator(query_srcs.begin(), query_dsts.begin()); + query_srcs.resize( + thrust::distance( + output_edge_first, + thrust::copy_if(handle_->get_thrust_policy(), + input_edge_first, + input_edge_first + d_mg_predecessors.size(), + d_mg_predecessors.begin(), + output_edge_first, + cugraph::detail::is_not_equal_t{invalid_vertex})), + handle_->get_stream()); + query_dsts.resize(query_srcs.size(), handle_->get_stream()); + +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "start shuffling edges" << std::endl; +#endif + std::tie(query_srcs, query_dsts, std::ignore, std::ignore, std::ignore, std::ignore) = + cugraph::detail::shuffle_int_vertex_pairs_with_values_to_local_gpu_by_edge_partitioning< + vertex_t, + edge_t, + weight_t, + edge_type_t>(*handle_, + std::move(query_srcs), + std::move(query_dsts), + std::nullopt, + std::nullopt, + std::nullopt, + mg_graph_view.vertex_partition_range_lasts()); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "edges shuffled, calling has_edge()" << std::endl; +#endif + + auto flags = mg_graph_view.has_edge( + *handle_, + raft::device_span(query_srcs.data(), query_srcs.size()), + raft::device_span(query_dsts.data(), query_dsts.size()), + true /* FIXME: remove */); + auto num_invalids = + thrust::count(handle_->get_thrust_policy(), flags.begin(), flags.end(), false); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "local # invalids=" << num_invalids << std::endl; +#endif + num_invalids = cugraph::host_scalar_allreduce( + comm, num_invalids, raft::comms::op_t::SUM, handle_->get_stream()); +#if 1 + RAFT_CUDA_TRY(cudaDeviceSynchronize()); + std::cerr << "global # invalids=" << num_invalids << std::endl; + if (num_invalids > 0) { + rmm::device_uvector d_pair(2, handle_->get_stream()); + thrust::fill( + handle_->get_thrust_policy(), d_pair.begin(), d_pair.end(), invalid_vertex); + auto triplet_first = + thrust::make_zip_iterator(query_srcs.begin(), query_dsts.begin(), flags.begin()); + thrust::for_each(handle_->get_thrust_policy(), + triplet_first, + triplet_first + query_srcs.size(), + [pair = raft::device_span( + d_pair.data(), d_pair.size())] __device__(auto triplet) { + if (thrust::get<2>(triplet) == false) { + auto src = 
thrust::get<0>(triplet); + auto dst = thrust::get<1>(triplet); + printf("missing edge from src=%lld to dst=%lld\n", + (long long)src, + (long long)dst); + pair[0] = src; + pair[1] = dst; + } + }); + std::vector h_pair(2); + raft::update_host(h_pair.data(), d_pair.data(), d_pair.size(), handle_->get_stream()); + handle_->sync_stream(); + auto min_comm_rank = + cugraph::host_scalar_allreduce(comm, + h_pair[0] == invalid_vertex ? comm_size : comm_rank, + raft::comms::op_t::MIN, + handle_->get_stream()); + if (min_comm_rank != comm_size) { + if (comm_rank == min_comm_rank) { + std::cerr << "comm_rank=" << comm_rank << " has an invalid pair (" << h_pair[0] + << "," << h_pair[1] << ")" << std::endl; + } + auto tup = cugraph::host_scalar_bcast(comm, + thrust::make_tuple(h_pair[0], h_pair[1]), + min_comm_rank, + handle_->get_stream()); + std::cerr << "tup=(" << thrust::get<0>(tup) << "," << thrust::get<1>(tup) << ")" + << std::endl; + auto num_appears = cugraph::count_if_e( + *handle_, + mg_graph_view, + cugraph::edge_src_dummy_property_t{}.view(), + cugraph::edge_dst_dummy_property_t{}.view(), + cugraph::edge_dummy_property_t{}.view(), + [missing_src = thrust::get<0>(tup), missing_dst = thrust::get<1>(tup)] __device__( + auto src, auto dst, auto src_dist, auto, auto) { + if (src == missing_src && dst == missing_dst) { + printf("edge %lld, %lld actually exists.\n", (long long)src, (long long)dst); + return true; + } + return false; + }); + std::cerr << "num_appears=" << num_appears << std::endl; + if (thrust::get<0>(tup) >= mg_graph_view.local_vertex_partition_range_first() && + thrust::get<0>(tup) < mg_graph_view.local_vertex_partition_range_last()) { + auto v_offset = + thrust::get<0>(tup) - mg_graph_view.local_vertex_partition_range_first(); + std::cerr << "thrust::get<0>(tup) v_offset=" << v_offset << std::endl; + raft::print_device_vector( + "thrust::get<0>(tup) dist", d_mg_distances.data() + v_offset, 1, std::cerr); + raft::print_device_vector( + "thrust::get<0>(tup) pred", d_mg_predecessors.data() + v_offset, 1, std::cerr); + } + if (thrust::get<1>(tup) >= mg_graph_view.local_vertex_partition_range_first() && + thrust::get<1>(tup) < mg_graph_view.local_vertex_partition_range_last()) { + auto v_offset = + thrust::get<1>(tup) - mg_graph_view.local_vertex_partition_range_first(); + std::cerr << "thrust::get<1>(tup) v_offset=" << v_offset << std::endl; + raft::print_device_vector( + "thrust::get<1>(tup) dist", d_mg_distances.data() + v_offset, 1, std::cerr); + raft::print_device_vector( + "thrust::get<1>(tup) pred", d_mg_predecessors.data() + v_offset, 1, std::cerr); + } + } + comm.barrier(); + } +#else + ASSERT_EQ(num_invalids, 0) << "predecessor->v missing in the input graph."; +#endif + } + + if (cugraph::test::g_perf) { + RAFT_CUDA_TRY(cudaDeviceSynchronize()); // for consistent performance measurement + comm.barrier(); + hr_timer.stop(); + hr_timer.display_and_clear(std::cerr); + } + } + } + + std::cerr << "average MG BFS (Kernel 2) time: " << (total_elapsed / num_timed_starting_vertices) + << std::endl; + } + + private: + static std::unique_ptr handle_; +}; + +template +std::unique_ptr Tests_GRAPH500_MGBFS::handle_ = nullptr; + +using Tests_GRAPH500_MGBFS_Rmat = Tests_GRAPH500_MGBFS; + +TEST_P(Tests_GRAPH500_MGBFS_Rmat, CheckInt64Int64) +{ + auto param = GetParam(); + run_current_test( + std::get<0>(param), override_Rmat_Usecase_with_cmd_line_arguments(std::get<1>(param))); +} + +INSTANTIATE_TEST_SUITE_P( + rmat_small_test, + Tests_GRAPH500_MGBFS_Rmat, + ::testing::Values( + // enable 
correctness checks + std::make_tuple(Graph500_BFS_Usecase{false, true}, + cugraph::test::Rmat_Usecase(10, + 16, + 0.57, + 0.19, + 0.19, + 0 /* base RNG seed */, + true /* undirected */, + true /* scramble vertex ID */)))); + +INSTANTIATE_TEST_SUITE_P( + rmat_benchmark_test, /* note that scale & edge factor can be overridden in benchmarking (with + --gtest_filter to select only the rmat_benchmark_test with a specific + vertex & edge type combination) by command line arguments and do not + include more than one Rmat_Usecase that differ only in scale or edge + factor (to avoid running same benchmarks more than once) */ + Tests_GRAPH500_MGBFS_Rmat, + ::testing::Values( + // disable correctness checks for large graphs + std::make_tuple(Graph500_BFS_Usecase{false, false}, + cugraph::test::Rmat_Usecase(20, + 16, + 0.57, + 0.19, + 0.19, + 0 /* base RNG seed */, + true /* undirected */, + true /* scramble vertex IDs */)))); + +CUGRAPH_MG_TEST_PROGRAM_MAIN()
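The NCCL bootstrap added above in initialize_mg_handle() (cpp/tests/utilities/mg_utilities.cpp) and create_mg_raft_handle() (cpp/tests/c_api/mg_test_utils.cpp) follows the usual MPI + NCCL handshake: rank 0 creates an ncclUniqueId, MPI broadcasts it, and every rank then joins the communicator with ncclCommInitRank before handing it to raft::comms::initialize_mpi_comms(). A minimal standalone sketch of that handshake (a hypothetical illustration only, assuming one GPU per MPI rank and collapsing error handling into MPI_Abort):

#include <cuda_runtime.h>
#include <mpi.h>
#include <nccl.h>

int main(int argc, char** argv)
{
  MPI_Init(&argc, &argv);
  int rank{};
  int size{};
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &size);

  // pick a device the same way create_mg_raft_handle() does
  int num_gpus_per_node{};
  cudaGetDeviceCount(&num_gpus_per_node);
  cudaSetDevice(rank % num_gpus_per_node);

  // rank 0 creates the unique id; everyone else receives it over MPI
  ncclUniqueId id{};
  if (rank == 0 && ncclGetUniqueId(&id) != ncclSuccess) { MPI_Abort(MPI_COMM_WORLD, 1); }
  MPI_Bcast(&id, sizeof(id), MPI_BYTE, 0, MPI_COMM_WORLD);

  // every rank joins the same NCCL communicator
  ncclComm_t nccl_comm{};
  if (ncclCommInitRank(&nccl_comm, size, id, rank) != ncclSuccess) { MPI_Abort(MPI_COMM_WORLD, 1); }

  // ... pass nccl_comm to raft::comms::initialize_mpi_comms(handle, MPI_COMM_WORLD, nccl_comm) ...

  ncclCommDestroy(nccl_comm);
  MPI_Finalize();
  return 0;
}

Creating the NCCL communicator explicitly, rather than letting RAFT create one internally, is also what lets initialize_mg_handle() pass an ncclConfig_t with splitShare = 1 via ncclCommInitRankConfig, presumably so that the row/column sub-communicators split off later can share resources with the parent communicator.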