From ea0e66d7136aadbdbc292517e20e826286e86df8 Mon Sep 17 00:00:00 2001 From: jnke2016 Date: Mon, 24 Jun 2024 15:49:13 -0700 Subject: [PATCH] add plc implementation of all-pairs similarity leveraging the capi --- .../pylibcugraph/pylibcugraph/CMakeLists.txt | 3 + python/pylibcugraph/pylibcugraph/__init__.py | 6 + .../_cugraph_c/similarity_algorithms.pxd | 44 ++++- .../all_pairs_jaccard_coefficients.pyx | 158 ++++++++++++++++++ .../all_pairs_overlap_coefficients.pyx | 158 ++++++++++++++++++ .../all_pairs_sorensen_coefficients.pyx | 158 ++++++++++++++++++ 6 files changed, 526 insertions(+), 1 deletion(-) create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx create mode 100644 python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx diff --git a/python/pylibcugraph/pylibcugraph/CMakeLists.txt b/python/pylibcugraph/pylibcugraph/CMakeLists.txt index 7cc90145949..53fbb00f1c1 100644 --- a/python/pylibcugraph/pylibcugraph/CMakeLists.txt +++ b/python/pylibcugraph/pylibcugraph/CMakeLists.txt @@ -58,6 +58,9 @@ set(cython_sources weakly_connected_components.pyx replicate_edgelist.pyx degrees.pyx + all_pairs_jaccard_coefficients.pyx + all_pairs_sorensen_coefficients.pyx + all_pairs_overlap_coefficients.pyx ) set(linked_libraries cugraph::cugraph;cugraph::cugraph_c) diff --git a/python/pylibcugraph/pylibcugraph/__init__.py b/python/pylibcugraph/pylibcugraph/__init__.py index dcdef05e106..99ed3b509e8 100644 --- a/python/pylibcugraph/pylibcugraph/__init__.py +++ b/python/pylibcugraph/pylibcugraph/__init__.py @@ -95,6 +95,12 @@ from pylibcugraph.sorensen_coefficients import sorensen_coefficients +from pylibcugraph.all_pairs_jaccard_coefficients import all_pairs_jaccard_coefficients + +from pylibcugraph.all_pairs_overlap_coefficients import all_pairs_overlap_coefficients + +from pylibcugraph.all_pairs_sorensen_coefficients import 
all_pairs_sorensen_coefficients + from pylibcugraph.degrees import in_degrees, out_degrees, degrees diff --git a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd index 406094f18d5..e343b35c069 100644 --- a/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd +++ b/python/pylibcugraph/pylibcugraph/_cugraph_c/similarity_algorithms.pxd @@ -50,7 +50,7 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_free( cugraph_similarity_result_t* result ) - + ########################################################################### # jaccard coefficients cdef cugraph_error_code_t \ @@ -63,6 +63,20 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # all-pairs jaccard coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_jaccard_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) ########################################################################### # sorensen coefficients @@ -76,6 +90,20 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # all-pairs sorensen coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_sorensen_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) 
########################################################################### # overlap coefficients @@ -89,3 +117,17 @@ cdef extern from "cugraph_c/similarity_algorithms.h": cugraph_similarity_result_t** result, cugraph_error_t** error ) + + ########################################################################### + # all-pairs overlap coefficients + cdef cugraph_error_code_t \ + cugraph_all_pairs_overlap_coefficients( + const cugraph_resource_handle_t* handle, + cugraph_graph_t* graph, + const cugraph_type_erased_device_array_view_t* vertices, + bool_t use_weight, + size_t topk, + bool_t do_expensive_check, + cugraph_similarity_result_t** result, + cugraph_error_t** error + ) diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx new file mode 100644 index 00000000000..f4d188ed9e2 --- /dev/null +++ b/python/pylibcugraph/pylibcugraph/all_pairs_jaccard_coefficients.pyx @@ -0,0 +1,158 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from libc.stdio cimport printf
+from cython.operator cimport dereference
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    bool_t,
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_free
+)
+from pylibcugraph._cugraph_c.graph_functions cimport (
+    cugraph_vertex_pairs_t,
+    cugraph_vertex_pairs_get_first,
+    cugraph_vertex_pairs_get_second,
+    cugraph_vertex_pairs_free,
+    cugraph_create_vertex_pairs
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+# NOTE(review): cugraph_similarity_result_get_vertex_pairs must be declared in
+# _cugraph_c/similarity_algorithms.pxd for this cimport to resolve - confirm.
+from pylibcugraph._cugraph_c.similarity_algorithms cimport (
+    cugraph_all_pairs_jaccard_coefficients,
+    cugraph_similarity_result_t,
+    cugraph_similarity_result_get_similarity,
+    cugraph_similarity_result_get_vertex_pairs,
+    cugraph_similarity_result_free
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    copy_to_cupy_array,
+    create_cugraph_type_erased_device_array_view_from_py_obj
+)
+
+
+def all_pairs_jaccard_coefficients(ResourceHandle resource_handle,
+                                   _GPUGraph graph,
+                                   vertices,
+                                   bool_t use_weight,
+                                   size_t topk,
+                                   bool_t do_expensive_check):
+    """
+    Perform All-Pairs Jaccard similarity computation.
+
+    Note that Jaccard similarity must run on a symmetric graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    vertices : cudf.Series or None
+        Vertex list to compute all-pairs. If None, then compute based
+        on all vertices in the graph.
+
+    use_weight : bool
+        If set to True, compute weighted Jaccard coefficients (the input
+        graph must be weighted in that case). Otherwise, compute
+        un-weighted Jaccard coefficients.
+
+    topk : size_t
+        Specify how many answers to return; otherwise all values are
+        returned.
+
+    do_expensive_check : bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple of device arrays ``(first, second, similarity)`` containing the
+    vertex pairs with their corresponding Jaccard coefficient scores.
+
+    Examples
+    --------
+    # FIXME: No example yet
+
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_similarity_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    # Wrap ``vertices`` in a type-erased device array view for the C API.
+    # NOTE(review): the docstring allows None; presumably the helper then
+    # yields a NULL view - confirm against pylibcugraph.utils.
+    cdef cugraph_type_erased_device_array_view_t* \
+        vertices_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                vertices)
+
+    error_code = cugraph_all_pairs_jaccard_coefficients(
+        c_resource_handle_ptr,
+        c_graph_ptr,
+        vertices_view_ptr,
+        use_weight,
+        topk,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_all_pairs_jaccard_coefficients")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \
+        cugraph_similarity_result_get_similarity(result_ptr)
+
+    cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr)
+
+    # The vertex pairs have to be fetched from the result before they can be
+    # read. Previously this pointer was declared but never assigned, so the
+    # accessors and the free below operated on an uninitialized pointer.
+    cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \
+        cugraph_similarity_result_get_vertex_pairs(result_ptr)
+
+    cdef cugraph_type_erased_device_array_view_t* first_ptr = \
+        cugraph_vertex_pairs_get_first(vertex_pairs_ptr)
+
+    cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr)
+
+    cdef cugraph_type_erased_device_array_view_t* second_ptr = \
+        cugraph_vertex_pairs_get_second(vertex_pairs_ptr)
+
+    cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr)
+
+    # Free all pointers
+    # NOTE(review): the result and its vertex pairs are both freed, exactly
+    # as before; confirm the C API does not already release the vertex pairs
+    # when the result itself is freed.
+    cugraph_similarity_result_free(result_ptr)
+    cugraph_vertex_pairs_free(vertex_pairs_ptr)
+
+    cugraph_type_erased_device_array_view_free(vertices_view_ptr)
+
+    return cupy_first, cupy_second, cupy_similarity
diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx
new file mode 100644
index 00000000000..12c163a759e
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/all_pairs_overlap_coefficients.pyx
@@ -0,0 +1,158 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from libc.stdio cimport printf
+from cython.operator cimport dereference
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    bool_t,
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_free
+)
+from pylibcugraph._cugraph_c.graph_functions cimport (
+    cugraph_vertex_pairs_t,
+    cugraph_vertex_pairs_get_first,
+    cugraph_vertex_pairs_get_second,
+    cugraph_vertex_pairs_free,
+    cugraph_create_vertex_pairs
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+# NOTE(review): cugraph_similarity_result_get_vertex_pairs must be declared in
+# _cugraph_c/similarity_algorithms.pxd for this cimport to resolve - confirm.
+from pylibcugraph._cugraph_c.similarity_algorithms cimport (
+    cugraph_all_pairs_overlap_coefficients,
+    cugraph_similarity_result_t,
+    cugraph_similarity_result_get_similarity,
+    cugraph_similarity_result_get_vertex_pairs,
+    cugraph_similarity_result_free
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    copy_to_cupy_array,
+    create_cugraph_type_erased_device_array_view_from_py_obj
+)
+
+
+def all_pairs_overlap_coefficients(ResourceHandle resource_handle,
+                                   _GPUGraph graph,
+                                   vertices,
+                                   bool_t use_weight,
+                                   size_t topk,
+                                   bool_t do_expensive_check):
+    """
+    Perform All-Pairs overlap similarity computation.
+
+    Note that overlap similarity must run on a symmetric graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    vertices : cudf.Series or None
+        Vertex list to compute all-pairs. If None, then compute based
+        on all vertices in the graph.
+
+    use_weight : bool
+        If set to True, compute weighted overlap coefficients (the input
+        graph must be weighted in that case). Otherwise, compute
+        un-weighted overlap coefficients.
+
+    topk : size_t
+        Specify how many answers to return; otherwise all values are
+        returned.
+
+    do_expensive_check : bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple of device arrays ``(first, second, similarity)`` containing the
+    vertex pairs with their corresponding overlap coefficient scores.
+
+    Examples
+    --------
+    # FIXME: No example yet
+
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_similarity_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    # Wrap ``vertices`` in a type-erased device array view for the C API.
+    # NOTE(review): the docstring allows None; presumably the helper then
+    # yields a NULL view - confirm against pylibcugraph.utils.
+    cdef cugraph_type_erased_device_array_view_t* \
+        vertices_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                vertices)
+
+    error_code = cugraph_all_pairs_overlap_coefficients(
+        c_resource_handle_ptr,
+        c_graph_ptr,
+        vertices_view_ptr,
+        use_weight,
+        topk,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_all_pairs_overlap_coefficients")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \
+        cugraph_similarity_result_get_similarity(result_ptr)
+
+    cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr)
+
+    # The vertex pairs have to be fetched from the result before they can be
+    # read. Previously this pointer was declared but never assigned, so the
+    # accessors and the free below operated on an uninitialized pointer.
+    cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \
+        cugraph_similarity_result_get_vertex_pairs(result_ptr)
+
+    cdef cugraph_type_erased_device_array_view_t* first_ptr = \
+        cugraph_vertex_pairs_get_first(vertex_pairs_ptr)
+
+    cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr)
+
+    cdef cugraph_type_erased_device_array_view_t* second_ptr = \
+        cugraph_vertex_pairs_get_second(vertex_pairs_ptr)
+
+    cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr)
+
+    # Free all pointers
+    # NOTE(review): the result and its vertex pairs are both freed, exactly
+    # as before; confirm the C API does not already release the vertex pairs
+    # when the result itself is freed.
+    cugraph_similarity_result_free(result_ptr)
+    cugraph_vertex_pairs_free(vertex_pairs_ptr)
+
+    cugraph_type_erased_device_array_view_free(vertices_view_ptr)
+
+    return cupy_first, cupy_second, cupy_similarity
diff --git a/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx
new file mode 100644
index 00000000000..be769381be6
--- /dev/null
+++ b/python/pylibcugraph/pylibcugraph/all_pairs_sorensen_coefficients.pyx
@@ -0,0 +1,158 @@
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Have cython use python 3 syntax
+# cython: language_level = 3
+
+from libc.stdint cimport uintptr_t
+from libc.stdio cimport printf
+from cython.operator cimport dereference
+
+from pylibcugraph._cugraph_c.resource_handle cimport (
+    bool_t,
+    cugraph_resource_handle_t,
+)
+from pylibcugraph._cugraph_c.error cimport (
+    cugraph_error_code_t,
+    cugraph_error_t,
+)
+from pylibcugraph._cugraph_c.array cimport (
+    cugraph_type_erased_device_array_view_t,
+    cugraph_type_erased_device_array_view_free
+)
+from pylibcugraph._cugraph_c.graph_functions cimport (
+    cugraph_vertex_pairs_t,
+    cugraph_vertex_pairs_get_first,
+    cugraph_vertex_pairs_get_second,
+    cugraph_vertex_pairs_free,
+    cugraph_create_vertex_pairs
+)
+from pylibcugraph._cugraph_c.graph cimport (
+    cugraph_graph_t,
+)
+# NOTE(review): cugraph_similarity_result_get_vertex_pairs must be declared in
+# _cugraph_c/similarity_algorithms.pxd for this cimport to resolve - confirm.
+from pylibcugraph._cugraph_c.similarity_algorithms cimport (
+    cugraph_all_pairs_sorensen_coefficients,
+    cugraph_similarity_result_t,
+    cugraph_similarity_result_get_similarity,
+    cugraph_similarity_result_get_vertex_pairs,
+    cugraph_similarity_result_free
+)
+from pylibcugraph.resource_handle cimport (
+    ResourceHandle,
+)
+from pylibcugraph.graphs cimport (
+    _GPUGraph,
+)
+from pylibcugraph.utils cimport (
+    assert_success,
+    copy_to_cupy_array,
+    create_cugraph_type_erased_device_array_view_from_py_obj
+)
+
+
+def all_pairs_sorensen_coefficients(ResourceHandle resource_handle,
+                                    _GPUGraph graph,
+                                    vertices,
+                                    bool_t use_weight,
+                                    size_t topk,
+                                    bool_t do_expensive_check):
+    """
+    Perform All-Pairs Sorensen similarity computation.
+
+    Note that Sorensen similarity must run on a symmetric graph.
+
+    Parameters
+    ----------
+    resource_handle : ResourceHandle
+        Handle to the underlying device resources needed for referencing data
+        and running algorithms.
+
+    graph : SGGraph or MGGraph
+        The input graph, for either Single or Multi-GPU operations.
+
+    vertices : cudf.Series or None
+        Vertex list to compute all-pairs. If None, then compute based
+        on all vertices in the graph.
+
+    use_weight : bool
+        If set to True, compute weighted Sorensen coefficients (the input
+        graph must be weighted in that case). Otherwise, compute
+        un-weighted Sorensen coefficients.
+
+    topk : size_t
+        Specify how many answers to return; otherwise all values are
+        returned.
+
+    do_expensive_check : bool
+        If True, performs more extensive tests on the inputs to ensure
+        validity, at the expense of increased run time.
+
+    Returns
+    -------
+    A tuple of device arrays ``(first, second, similarity)`` containing the
+    vertex pairs with their corresponding Sorensen coefficient scores.
+
+    Examples
+    --------
+    # FIXME: No example yet
+
+    """
+
+    cdef cugraph_resource_handle_t* c_resource_handle_ptr = \
+        resource_handle.c_resource_handle_ptr
+    cdef cugraph_graph_t* c_graph_ptr = graph.c_graph_ptr
+
+    cdef cugraph_similarity_result_t* result_ptr
+    cdef cugraph_error_code_t error_code
+    cdef cugraph_error_t* error_ptr
+
+    # Wrap ``vertices`` in a type-erased device array view for the C API.
+    # NOTE(review): the docstring allows None; presumably the helper then
+    # yields a NULL view - confirm against pylibcugraph.utils.
+    cdef cugraph_type_erased_device_array_view_t* \
+        vertices_view_ptr = \
+            create_cugraph_type_erased_device_array_view_from_py_obj(
+                vertices)
+
+    error_code = cugraph_all_pairs_sorensen_coefficients(
+        c_resource_handle_ptr,
+        c_graph_ptr,
+        vertices_view_ptr,
+        use_weight,
+        topk,
+        do_expensive_check,
+        &result_ptr,
+        &error_ptr)
+    assert_success(error_code, error_ptr, "cugraph_all_pairs_sorensen_coefficients")
+
+    # Extract individual device array pointers from result and copy to cupy
+    # arrays for returning.
+    cdef cugraph_type_erased_device_array_view_t* similarity_ptr = \
+        cugraph_similarity_result_get_similarity(result_ptr)
+
+    cupy_similarity = copy_to_cupy_array(c_resource_handle_ptr, similarity_ptr)
+
+    # The vertex pairs have to be fetched from the result before they can be
+    # read. Previously this pointer was declared but never assigned, so the
+    # accessors and the free below operated on an uninitialized pointer.
+    cdef cugraph_vertex_pairs_t* vertex_pairs_ptr = \
+        cugraph_similarity_result_get_vertex_pairs(result_ptr)
+
+    cdef cugraph_type_erased_device_array_view_t* first_ptr = \
+        cugraph_vertex_pairs_get_first(vertex_pairs_ptr)
+
+    cupy_first = copy_to_cupy_array(c_resource_handle_ptr, first_ptr)
+
+    cdef cugraph_type_erased_device_array_view_t* second_ptr = \
+        cugraph_vertex_pairs_get_second(vertex_pairs_ptr)
+
+    cupy_second = copy_to_cupy_array(c_resource_handle_ptr, second_ptr)
+
+    # Free all pointers
+    # NOTE(review): the result and its vertex pairs are both freed, exactly
+    # as before; confirm the C API does not already release the vertex pairs
+    # when the result itself is freed.
+    cugraph_similarity_result_free(result_ptr)
+    cugraph_vertex_pairs_free(vertex_pairs_ptr)
+
+    cugraph_type_erased_device_array_view_free(vertices_view_ptr)
+
+    return cupy_first, cupy_second, cupy_similarity