From 3aa461f0c80606cef29e52858a2a733bdf131ed7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 30 Aug 2023 20:31:59 -0700 Subject: [PATCH 01/54] Initial version of scalar --- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 4 ++- python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 25 ++++++++++++++++++ python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 26 +++++++++++++++++++ 3 files changed, 54 insertions(+), 1 deletion(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 0ce42dc43ff..cfca4add290 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx) +set(cython_sources column.pyx copying.pyx gpumemoryview.pyx scalar.pyx table.pyx types.pyx + utils.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd new file mode 100644 index 00000000000..410d4206555 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -0,0 +1,25 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp cimport bool +from libcpp.memory cimport unique_ptr + +from rmm._lib.memory_resource cimport DeviceMemoryResource + +from cudf._lib.cpp.scalar.scalar cimport scalar + +from .types cimport DataType + + +cdef class Scalar: + cdef unique_ptr[scalar] c_obj + cdef DataType _data_type + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is + # needed for deallocation + cdef DeviceMemoryResource mr + + cdef const scalar* get(self) except * + + cpdef DataType type(self) + cpdef bool is_valid(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx new file mode 100644 index 00000000000..640877a9e27 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -0,0 +1,26 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from rmm._lib.memory_resource cimport get_current_device_resource + +from .types cimport DataType + + +# The DeviceMemoryResource attribute could be released prematurely +# by the gc if the DeviceScalar is in a reference cycle. Removing +# the tp_clear function with the no_gc_clear decoration prevents that. +# See https://github.com/rapidsai/rmm/pull/931 for details. +cdef class Scalar: + + def __cinit__(self, *args, **kwargs): + self.mr = get_current_device_resource() + + cdef const scalar* get(self) except *: + return self.c_obj.get() + + cpdef DataType type(self): + """The type of data in the column.""" + return self._data_type + + cpdef bool is_valid(self): + """True if the scalar is valid, false if not""" + return self.get().is_valid() From 1c4f99049c0f45d1b74c0fffe678a4d92943b706 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 30 Aug 2023 20:55:24 -0700 Subject: [PATCH 02/54] Implement basic scatter algorithm with Scalar --- .../cudf/cudf/_lib/cpp/libcpp/functional.pxd | 4 +- python/cudf/cudf/_lib/pylibcudf/copying.pxd | 6 +++ python/cudf/cudf/_lib/pylibcudf/copying.pyx | 53 +++++++++++++++++++ 3 files changed, 62 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd index f3e2d6d0878..c38db036119 100644 --- a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd +++ b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd @@ -1,6 +1,8 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# TODO: Can be replaced once https://github.com/cython/cython/pull/5671 is +# merged and released cdef extern from "" namespace "std" nogil: cdef cppclass reference_wrapper[T]: reference_wrapper() diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index d57be650710..a2232fc5d81 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -13,3 +13,9 @@ cpdef Table gather( Column gather_map, out_of_bounds_policy bounds_policy ) + +cpdef Table scatter ( + list source_scalars, + Column indices, + Table target, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index a27b44b3107..dff3e95328d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -2,6 +2,7 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move +from libcpp.vector cimport vector # TODO: We want to make cpp a more full-featured package so that we can access # directly from that. It will make namespacing much cleaner in pylibcudf. What @@ -9,13 +10,16 @@ from libcpp.utility cimport move # cimport libcudf... libcudf.copying.algo(...) from cudf._lib.cpp cimport copying as cpp_copying from cudf._lib.cpp.copying cimport out_of_bounds_policy +from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.copying import \ out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from .column cimport Column +from .scalar cimport Scalar from .table cimport Table @@ -55,3 +59,52 @@ cpdef Table gather( ) ) return Table.from_libcudf(move(c_result)) + + +ctypedef const scalar constscalar + +cpdef Table scatter ( + list source_scalars, + Column indices, + Table target, +): + """Scatter source_scalars into target according to the indices. + + For details on the implementation, see cudf::scatter in libcudf. + + Parameters + ---------- + source_scalars : List[Scalar] + A list containing one scalar for each column in target. + indices : Column + The rows of the target into which the source_scalars should be written. + target : Table + The table into which data should be written. + + Returns + ------- + pylibcudf.Table + The result of the scatter + """ + cdef unique_ptr[table] c_result + # TODO: This doesn't require the constscalar ctypedef + cdef vector[reference_wrapper[const scalar]] c_scalars + c_scalars.reserve(len(source_scalars)) + cdef Scalar d_slr + for d_slr in source_scalars: + c_scalars.push_back( + # TODO: This requires the constscalar ctypedef + # Possibly the same as https://github.com/cython/cython/issues/4180 + reference_wrapper[constscalar](d_slr.get()[0]) + ) + + with nogil: + c_result = move( + cpp_copying.scatter( + c_scalars, + indices.view(), + target.view(), + ) + ) + + return Table.from_libcudf(move(c_result)) From 8ad836715cf3e49415479a1810f9deaf18e33e1b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 1 Sep 2023 12:18:11 -0700 Subject: [PATCH 03/54] Build DeviceScalar around pylibcudf.Scalar --- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 2 + python/cudf/cudf/_lib/pylibcudf/__init__.py | 2 + python/cudf/cudf/_lib/scalar.pxd | 5 ++- python/cudf/cudf/_lib/scalar.pyx | 39 ++++++++++---------- 4 files changed, 27 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index ba7822b0a54..5bcf234adea 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -4,6 +4,7 @@ from . cimport copying from .column cimport Column from .gpumemoryview cimport gpumemoryview +from .scalar cimport Scalar from .table cimport Table # TODO: cimport type_id once # https://github.com/cython/cython/issues/5609 is resolved @@ -12,6 +13,7 @@ from .types cimport DataType __all__ = [ "Column", "DataType", + "Scalar", "Table", "copying", "gpumemoryview", diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 3edff9a53e8..7bcf024d5c1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -3,12 +3,14 @@ from . import copying from .column import Column from .gpumemoryview import gpumemoryview +from .scalar import Scalar from .table import Table from .types import DataType, TypeId __all__ = [ "Column", "DataType", + "Scalar", "Table", "TypeId", "copying", diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 1deed60d67d..09a91493723 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -1,15 +1,16 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport DeviceMemoryResource +from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: - cdef unique_ptr[scalar] c_value + cdef pylibcudf.Scalar c_value # Holds a reference to the DeviceMemoryResource used for allocation. # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 0407785b2d8..54509802558 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -20,7 +20,6 @@ from libc.stdint cimport ( ) from libcpp cimport bool from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move from rmm._lib.memory_resource cimport get_current_device_resource @@ -41,6 +40,7 @@ from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id from cudf._lib.interop import from_arrow, to_arrow cimport cudf._lib.cpp.types as libcudf_types +from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, fixed_point_scalar, @@ -81,6 +81,7 @@ cdef class DeviceScalar: def __cinit__(self, *args, **kwargs): self.mr = get_current_device_resource() + self.c_value = pylibcudf.Scalar() def __init__(self, value, dtype): """ @@ -105,26 +106,26 @@ cdef class DeviceScalar: if isinstance(dtype, cudf.core.dtypes.DecimalDtype): _set_decimal_from_scalar( - self.c_value, value, dtype, valid) + self.c_value.c_obj, value, dtype, valid) elif isinstance(dtype, cudf.ListDtype): _set_list_from_pylist( - self.c_value, value, dtype, valid) + self.c_value.c_obj, value, dtype, valid) elif isinstance(dtype, cudf.StructDtype): - _set_struct_from_pydict(self.c_value, value, dtype, valid) + _set_struct_from_pydict(self.c_value.c_obj, value, dtype, valid) elif pd.api.types.is_string_dtype(dtype): - _set_string_from_np_string(self.c_value, value, valid) + _set_string_from_np_string(self.c_value.c_obj, value, valid) elif pd.api.types.is_numeric_dtype(dtype): - _set_numeric_from_np_scalar(self.c_value, + _set_numeric_from_np_scalar(self.c_value.c_obj, value, dtype, valid) elif pd.api.types.is_datetime64_dtype(dtype): _set_datetime64_from_np_scalar( - self.c_value, value, dtype, valid + self.c_value.c_obj, value, dtype, valid ) elif pd.api.types.is_timedelta64_dtype(dtype): _set_timedelta64_from_np_scalar( - self.c_value, value, dtype, valid + self.c_value.c_obj, value, dtype, valid ) else: raise ValueError( @@ -134,19 +135,19 @@ cdef class DeviceScalar: def _to_host_scalar(self): if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): - result = _get_py_decimal_from_fixed_point(self.c_value) + result = _get_py_decimal_from_fixed_point(self.c_value.c_obj) elif cudf.api.types.is_struct_dtype(self.dtype): - result = _get_py_dict_from_struct(self.c_value, self.dtype) + result = _get_py_dict_from_struct(self.c_value.c_obj, self.dtype) elif cudf.api.types.is_list_dtype(self.dtype): - result = _get_py_list_from_list(self.c_value, self.dtype) + result = _get_py_list_from_list(self.c_value.c_obj, self.dtype) elif pd.api.types.is_string_dtype(self.dtype): - result = _get_py_string_from_string(self.c_value) + result = _get_py_string_from_string(self.c_value.c_obj) elif pd.api.types.is_numeric_dtype(self.dtype): - result = _get_np_scalar_from_numeric(self.c_value) + result = _get_np_scalar_from_numeric(self.c_value.c_obj) elif pd.api.types.is_datetime64_dtype(self.dtype): - result = _get_np_scalar_from_timestamp64(self.c_value) + result = _get_np_scalar_from_timestamp64(self.c_value.c_obj) elif pd.api.types.is_timedelta64_dtype(self.dtype): - result = _get_np_scalar_from_timedelta64(self.c_value) + result = _get_np_scalar_from_timedelta64(self.c_value.c_obj) else: raise ValueError( "Could not convert cudf::scalar to a Python value" @@ -169,7 +170,7 @@ cdef class DeviceScalar: return self._to_host_scalar() cdef const scalar* get_raw_ptr(self) except *: - return self.c_value.get() + return self.c_value.c_obj.get() cpdef bool is_valid(self): """ @@ -194,7 +195,7 @@ cdef class DeviceScalar: cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) cdef libcudf_types.data_type cdtype - s.c_value = move(ptr) + s.c_value.c_obj.swap(ptr) cdtype = s.get_raw_ptr()[0].type() if dtype is not None: @@ -601,9 +602,9 @@ def _create_proxy_nat_scalar(dtype): if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar(result.c_value, nat, dtype, True) + _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar(result.c_value, nat, dtype, True) + _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) return result else: raise TypeError('NAT only valid for datetime and timedelta') From c1479725ddc3c2e7d4c192039fe750ab20947a89 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 1 Sep 2023 14:28:38 -0700 Subject: [PATCH 04/54] Fix some initialization --- python/cudf/cudf/_lib/datetime.pyx | 6 ++++-- python/cudf/cudf/_lib/scalar.pyx | 3 +++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 81949dbaa20..3d96f59c4d6 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -10,6 +10,7 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.filling cimport calendrical_month_sequence +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar @@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset): + offset.kwds.get("months", 0) ) + cdef const scalar* c_start = start.c_value.get() with nogil: c_result = move(calendrical_month_sequence( n, - start.c_value.get()[0], + c_start[0], months )) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 54509802558..ca0d08c4d60 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -132,6 +132,8 @@ cdef class DeviceScalar: f"Cannot convert value of type " f"{type(value).__name__} to cudf scalar" ) + cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type() + self.c_value._data_type = pylibcudf.DataType.from_libcudf(cdtype) def _to_host_scalar(self): if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): @@ -197,6 +199,7 @@ cdef class DeviceScalar: s.c_value.c_obj.swap(ptr) cdtype = s.get_raw_ptr()[0].type() + s.c_value._data_type = pylibcudf.DataType.from_libcudf(cdtype) if dtype is not None: s._dtype = dtype From 0e5e5d38c0bf360d441be47008248afbed3a6c54 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 1 Sep 2023 14:28:51 -0700 Subject: [PATCH 05/54] Add missing no_gc_clear --- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 640877a9e27..df35ff92e72 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -1,5 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. +cimport cython + from rmm._lib.memory_resource cimport get_current_device_resource from .types cimport DataType @@ -9,6 +11,7 @@ from .types cimport DataType # by the gc if the DeviceScalar is in a reference cycle. Removing # the tp_clear function with the no_gc_clear decoration prevents that. # See https://github.com/rapidsai/rmm/pull/931 for details. +@cython.no_gc_clear cdef class Scalar: def __cinit__(self, *args, **kwargs): From e3008840ebc7821d1504b9d4921df0afe03359db Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 1 Sep 2023 15:11:19 -0700 Subject: [PATCH 06/54] Implement from_libcudf --- python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 3 +++ python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 21 +++++++++++++++++++++ python/cudf/cudf/_lib/scalar.pyx | 7 ++++--- 3 files changed, 28 insertions(+), 3 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd index 410d4206555..85d57305622 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -23,3 +23,6 @@ cdef class Scalar: cpdef DataType type(self) cpdef bool is_valid(self) + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index df35ff92e72..3886c5a5290 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -4,6 +4,8 @@ cimport cython from rmm._lib.memory_resource cimport get_current_device_resource +from cudf._lib.cpp.scalar.scalar cimport scalar + from .types cimport DataType @@ -13,6 +15,12 @@ from .types cimport DataType # See https://github.com/rapidsai/rmm/pull/931 for details. @cython.no_gc_clear cdef class Scalar: + """A scalar value in device memory.""" + # Unlike for columns, libcudf does not support scalar views. All APIs that + # accept scalar values accept references to the owning object rather than a + # special view type. As a result, pylibcudf.Scalar has a simpler structure + # than pylibcudf.Column because it can be a true wrapper around a libcudf + # column def __cinit__(self, *args, **kwargs): self.mr = get_current_device_resource() @@ -27,3 +35,16 @@ cdef class Scalar: cpdef bool is_valid(self): """True if the scalar is valid, false if not""" return self.get().is_valid() + + @staticmethod + cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None): + """Construct a Scalar object from a libcudf scalar. + + This method is for pylibcudf's functions to use to ingest outputs of + calling libcudf algorithms, and should generally not be needed by users + (even direct pylibcudf Cython users). + """ + cdef Scalar s = Scalar.__new__(Scalar) + s.c_obj.swap(libcudf_scalar) + s._data_type = DataType.from_libcudf(s.get().type()) + return s diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index ca0d08c4d60..f6350c1cb94 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -20,10 +20,12 @@ from libc.stdint cimport ( ) from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move from rmm._lib.memory_resource cimport get_current_device_resource import cudf +from cudf._lib cimport pylibcudf from cudf._lib.types import ( LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, datetime_unit_map, @@ -178,7 +180,7 @@ cdef class DeviceScalar: """ Returns if the Scalar is valid or not(i.e., ). """ - return self.get_raw_ptr()[0].is_valid() + return self.c_value.is_valid() def __repr__(self): if cudf.utils.utils.is_na_like(self.value): @@ -197,9 +199,8 @@ cdef class DeviceScalar: cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) cdef libcudf_types.data_type cdtype - s.c_value.c_obj.swap(ptr) + s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) cdtype = s.get_raw_ptr()[0].type() - s.c_value._data_type = pylibcudf.DataType.from_libcudf(cdtype) if dtype is not None: s._dtype = dtype From 4a2a436e67195ce2ddd87b1f7e2295b96c6c0a78 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 1 Sep 2023 15:14:08 -0700 Subject: [PATCH 07/54] Remove mr from DeviceScalar --- python/cudf/cudf/_lib/scalar.pxd | 5 ----- python/cudf/cudf/_lib/scalar.pyx | 10 ---------- 2 files changed, 15 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index 09a91493723..ae1d350edc6 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -12,11 +12,6 @@ from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: cdef pylibcudf.Scalar c_value - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr - cdef object _dtype cdef const scalar* get_raw_ptr(self) except * diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index f6350c1cb94..ff1a09a9c9e 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,7 +1,5 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -cimport cython - import decimal import numpy as np @@ -22,8 +20,6 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from rmm._lib.memory_resource cimport get_current_device_resource - import cudf from cudf._lib cimport pylibcudf from cudf._lib.types import ( @@ -74,15 +70,9 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns -# The DeviceMemoryResource attribute could be released prematurely -# by the gc if the DeviceScalar is in a reference cycle. Removing -# the tp_clear function with the no_gc_clear decoration prevents that. -# See https://github.com/rapidsai/rmm/pull/931 for details. -@cython.no_gc_clear cdef class DeviceScalar: def __cinit__(self, *args, **kwargs): - self.mr = get_current_device_resource() self.c_value = pylibcudf.Scalar() def __init__(self, value, dtype): From abd5550be0e7727621838d207dc8146d6a8a4218 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 1 Sep 2023 16:11:36 -0700 Subject: [PATCH 08/54] Inline _set_value --- python/cudf/cudf/_lib/scalar.pyx | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index ff1a09a9c9e..32a7daae098 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -90,34 +90,30 @@ cdef class DeviceScalar: A NumPy dtype. """ self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') - self._set_value(value, self._dtype) - - def _set_value(self, value, dtype): - # IMPORTANT: this should only ever be called from __init__ valid = not _is_null_host_scalar(value) - if isinstance(dtype, cudf.core.dtypes.DecimalDtype): + if isinstance(self._dtype, cudf.core.dtypes.DecimalDtype): _set_decimal_from_scalar( - self.c_value.c_obj, value, dtype, valid) - elif isinstance(dtype, cudf.ListDtype): + self.c_value.c_obj, value, self._dtype, valid) + elif isinstance(self._dtype, cudf.ListDtype): _set_list_from_pylist( - self.c_value.c_obj, value, dtype, valid) - elif isinstance(dtype, cudf.StructDtype): - _set_struct_from_pydict(self.c_value.c_obj, value, dtype, valid) - elif pd.api.types.is_string_dtype(dtype): + self.c_value.c_obj, value, self._dtype, valid) + elif isinstance(self._dtype, cudf.StructDtype): + _set_struct_from_pydict(self.c_value.c_obj, value, self._dtype, valid) + elif pd.api.types.is_string_dtype(self._dtype): _set_string_from_np_string(self.c_value.c_obj, value, valid) - elif pd.api.types.is_numeric_dtype(dtype): + elif pd.api.types.is_numeric_dtype(self._dtype): _set_numeric_from_np_scalar(self.c_value.c_obj, value, - dtype, + self._dtype, valid) - elif pd.api.types.is_datetime64_dtype(dtype): + elif pd.api.types.is_datetime64_dtype(self._dtype): _set_datetime64_from_np_scalar( - self.c_value.c_obj, value, dtype, valid + self.c_value.c_obj, value, self._dtype, valid ) - elif pd.api.types.is_timedelta64_dtype(dtype): + elif pd.api.types.is_timedelta64_dtype(self._dtype): _set_timedelta64_from_np_scalar( - self.c_value.c_obj, value, dtype, valid + self.c_value.c_obj, value, self._dtype, valid ) else: raise ValueError( From e8f6847085aa26bb851b56e771b38dbb021ee49c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 5 Sep 2023 10:03:45 -0700 Subject: [PATCH 09/54] Add a comment --- python/cudf/cudf/_lib/scalar.pyx | 3 +++ 1 file changed, 3 insertions(+) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 32a7daae098..9b2ede1b55d 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -72,6 +72,9 @@ from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns cdef class DeviceScalar: + # I think this should be removable, except that currently the way that + # from_unique_ptr is implemented is probably dereferencing this in an + # invalid state. See what the best way to fix that is. def __cinit__(self, *args, **kwargs): self.c_value = pylibcudf.Scalar() From 8360f8098ed022c9c74df08c8194baec6fb0016c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 5 Sep 2023 21:32:42 -0700 Subject: [PATCH 10/54] Implement from_arrow --- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 27 ++++++++++++++++-- python/cudf/cudf/_lib/pylibcudf/interop.pxd | 10 +++++++ python/cudf/cudf/_lib/pylibcudf/interop.pyx | 28 +++++++++++++++++++ 3 files changed, 63 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pxd create mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pyx diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index cfca4add290..6d01199cfc8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,8 +12,8 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx gpumemoryview.pyx scalar.pyx table.pyx types.pyx - utils.pyx +set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx + types.pyx utils.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( @@ -21,3 +21,26 @@ rapids_cython_create_modules( SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +set(targets_using_arrow_headers pylibcudf_interop) +foreach(target IN LISTS targets_using_arrow_headers) + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() + +# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for +# more info +find_package(NumPy REQUIRED) +set(targets_using_numpy pylibcudf_interop) +foreach(target IN LISTS targets_using_numpy) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. + # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") +endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd new file mode 100644 index 00000000000..dd518aeea52 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from pyarrow.lib cimport Table as pa_Table + +from .table cimport Table + + +cpdef Table from_arrow( + pa_Table pyarrow_table, +) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx new file mode 100644 index 00000000000..f5161074b15 --- /dev/null +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -0,0 +1,28 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.utility cimport move +from pyarrow.lib cimport ( + CTable as pa_CTable, + Table as pa_Table, + pyarrow_unwrap_table, +) + +from cudf._lib.cpp.interop cimport from_arrow as cpp_from_arrow +from cudf._lib.cpp.table.table cimport table + +from .table cimport Table + + +cpdef Table from_arrow( + pa_Table pyarrow_table, +): + cdef shared_ptr[pa_CTable] ctable = ( + pyarrow_unwrap_table(pyarrow_table) + ) + cdef unique_ptr[table] c_result + + with nogil: + c_result = move(cpp_from_arrow(ctable.get()[0])) + + return Table.from_libcudf(move(c_result)) From 88147728b705d20ea51a6c0006e2653ab95f2c3e Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Sep 2023 17:02:52 -0700 Subject: [PATCH 11/54] Add to package --- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 6 +++++- python/cudf/cudf/_lib/pylibcudf/__init__.py | 3 ++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index 5bcf234adea..cf0007b9303 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,7 +1,10 @@ # Copyright (c) 2023, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -from . cimport copying +# TODO: Cannot import interop because it introduces a build-time pyarrow header +# dependency for everything that cimports pylibcudf. See if there's a way to +# avoid that before polluting the whole package. +from . cimport copying # , interop from .column cimport Column from .gpumemoryview cimport gpumemoryview from .scalar cimport Scalar @@ -17,4 +20,5 @@ __all__ = [ "Table", "copying", "gpumemoryview", + # "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 7bcf024d5c1..72b74a57b87 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,6 +1,6 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from . import copying +from . import copying, interop from .column import Column from .gpumemoryview import gpumemoryview from .scalar import Scalar @@ -15,4 +15,5 @@ "TypeId", "copying", "gpumemoryview", + "interop", ] From 3a04947dee18fc7506b80ea1be219514cbe45dd8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Sep 2023 17:04:29 -0700 Subject: [PATCH 12/54] Implement release for gpumemoryview --- python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd | 4 +++- python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx | 14 +++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd index 713697bd139..2c40348ec4f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd @@ -6,4 +6,6 @@ cdef class gpumemoryview: # to treat this object as something like a POD struct cdef readonly: Py_ssize_t ptr - object obj + object _obj + + cpdef release(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx index fc98f087a1b..9121092ecdd 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx @@ -19,9 +19,21 @@ cdef class gpumemoryview: "gpumemoryview must be constructed from an object supporting " "the CUDA array interface" ) - self.obj = obj + self._obj = obj + self._released = False # TODO: Need to respect readonly self.ptr = cai["data"][0] def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ + + @property + def obj(self): + if not self._released: + return self._obj + else: + raise ValueError("operation forbidden on released gpumemoryview object") + + cpdef release(self): + self._obj = None + self._released = True From 032adbb6a91dbc2fcc70daef99bd5402093bfec6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Sep 2023 17:35:59 -0700 Subject: [PATCH 13/54] Add proper constructor for scalar from pyarrow scalar --- python/cudf/cudf/_lib/cpp/reduce.pxd | 5 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 4 +- .../cudf/_lib/pylibcudf/gpumemoryview.pxd | 3 ++ python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 49 +++++++++++++++++-- 4 files changed, 53 insertions(+), 8 deletions(-) diff --git a/python/cudf/cudf/_lib/cpp/reduce.pxd b/python/cudf/cudf/_lib/cpp/reduce.pxd index 7952c717916..997782dec6c 100644 --- a/python/cudf/cudf/_lib/cpp/reduce.pxd +++ b/python/cudf/cudf/_lib/cpp/reduce.pxd @@ -1,14 +1,13 @@ -# Copyright (c) 2020-2022, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport pair -from cudf._lib.aggregation cimport reduce_aggregation, scan_aggregation +from cudf._lib.cpp.aggregation cimport reduce_aggregation, scan_aggregation from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport data_type -from cudf._lib.scalar cimport DeviceScalar cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 6d01199cfc8..64adb38ace3 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -30,7 +30,7 @@ execute_process( OUTPUT_STRIP_TRAILING_WHITESPACE ) -set(targets_using_arrow_headers pylibcudf_interop) +set(targets_using_arrow_headers pylibcudf_interop pylibcudf_scalar) foreach(target IN LISTS targets_using_arrow_headers) target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") endforeach() @@ -38,7 +38,7 @@ endforeach() # TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for # more info find_package(NumPy REQUIRED) -set(targets_using_numpy pylibcudf_interop) +set(targets_using_numpy pylibcudf_interop pylibcudf_scalar) foreach(target IN LISTS targets_using_numpy) target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd index 2c40348ec4f..93449fd02d1 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd @@ -1,5 +1,7 @@ # Copyright (c) 2023, NVIDIA CORPORATION. +from libcpp cimport bool + cdef class gpumemoryview: # TODO: Eventually probably want to make this opaque, but for now it's fine @@ -7,5 +9,6 @@ cdef class gpumemoryview: cdef readonly: Py_ssize_t ptr object _obj + bool _released cpdef release(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 3886c5a5290..292e6e9d51f 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -1,11 +1,23 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -cimport cython +cimport pyarrow.lib +from cython cimport no_gc_clear +from cython.operator cimport dereference +from libcpp.utility cimport move + +import pyarrow.lib from rmm._lib.memory_resource cimport get_current_device_resource +from cudf._lib.cpp cimport aggregation +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.reduce cimport cpp_reduce from cudf._lib.cpp.scalar.scalar cimport scalar +from cudf._lib.cpp.types cimport data_type +from .column cimport Column +from .interop cimport from_arrow +from .table cimport Table from .types cimport DataType @@ -13,7 +25,7 @@ from .types cimport DataType # by the gc if the DeviceScalar is in a reference cycle. Removing # the tp_clear function with the no_gc_clear decoration prevents that. # See https://github.com/rapidsai/rmm/pull/931 for details. -@cython.no_gc_clear +@no_gc_clear cdef class Scalar: """A scalar value in device memory.""" # Unlike for columns, libcudf does not support scalar views. All APIs that @@ -22,9 +34,40 @@ cdef class Scalar: # than pylibcudf.Column because it can be a true wrapper around a libcudf # column - def __cinit__(self, *args, **kwargs): + def __init__(self, pyarrow.lib.Scalar value): self.mr = get_current_device_resource() + # Convert the value to a cudf object via pyarrow + arr = pyarrow.lib.array([value.as_py()], type=value.type) + cdef pyarrow.lib.Table pa_tbl = pyarrow.lib.Table.from_arrays( + [arr], names=["scalar"] + ) + cdef Table tbl = from_arrow(pa_tbl) + + # TODO: The code below is a pretty hacky way to get a scalar from a + # single row of a column, but for now want to see if we can write a + # generic solution like this that works. If it does, we can consider + # implementing a better approach natively in libcudf. + cdef Column col = tbl.columns()[0] + cdef column_view cv = col.view() + + cdef unique_ptr[scalar] c_result + cdef unique_ptr[aggregation.reduce_aggregation] c_agg = ( + aggregation.make_min_aggregation[aggregation.reduce_aggregation]() + ) + cdef data_type c_type = col.type().c_obj + + with nogil: + c_result = move(cpp_reduce( + cv, + dereference(c_agg), + c_type + )) + + cdef Scalar s = Scalar.from_libcudf(move(c_result)) + self._data_type = DataType.from_libcudf(s.get().type()) + self.c_obj.swap(s.c_obj) + cdef const scalar* get(self) except *: return self.c_obj.get() From 16f45284ed1eddf07fa4ed4f1087cc75bc4e9b72 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Sep 2023 20:43:25 -0700 Subject: [PATCH 14/54] Construct pylibcudf.Scalar for numeric types --- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 8 +++- python/cudf/cudf/_lib/scalar.pyx | 44 ++++------------------ 2 files changed, 15 insertions(+), 37 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 292e6e9d51f..ef98a82a44e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -34,9 +34,15 @@ cdef class Scalar: # than pylibcudf.Column because it can be a true wrapper around a libcudf # column - def __init__(self, pyarrow.lib.Scalar value): + def __init__(self, pyarrow.lib.Scalar value=None): self.mr = get_current_device_resource() + if value is None: + # TODO: This early return is not something we really want to + # support, but it here for now to ease the transition of + # DeviceScalar. + return + # Convert the value to a cudf object via pyarrow arr = pyarrow.lib.array([value.as_py()], type=value.type) cdef pyarrow.lib.Table pa_tbl = pyarrow.lib.Table.from_arrays( diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 9b2ede1b55d..5c9a6eeaf02 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -94,6 +94,7 @@ cdef class DeviceScalar: """ self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') valid = not _is_null_host_scalar(value) + set_dtype = True if isinstance(self._dtype, cudf.core.dtypes.DecimalDtype): _set_decimal_from_scalar( @@ -106,10 +107,10 @@ cdef class DeviceScalar: elif pd.api.types.is_string_dtype(self._dtype): _set_string_from_np_string(self.c_value.c_obj, value, valid) elif pd.api.types.is_numeric_dtype(self._dtype): - _set_numeric_from_np_scalar(self.c_value.c_obj, - value, - self._dtype, - valid) + if value is NA: + value = None + pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) + self.c_value = pylibcudf.Scalar(pa_scalar) elif pd.api.types.is_datetime64_dtype(self._dtype): _set_datetime64_from_np_scalar( self.c_value.c_obj, value, self._dtype, valid @@ -123,8 +124,10 @@ cdef class DeviceScalar: f"Cannot convert value of type " f"{type(value).__name__} to cudf scalar" ) + cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type() - self.c_value._data_type = pylibcudf.DataType.from_libcudf(cdtype) + if set_dtype: + self.c_value._data_type = pylibcudf.DataType.from_libcudf(cdtype) def _to_host_scalar(self): if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): @@ -235,37 +238,6 @@ cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): s.reset(new string_scalar(value.encode(), valid)) -cdef _set_numeric_from_np_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - value = value if valid else 0 - if dtype == "int8": - s.reset(new numeric_scalar[int8_t](value, valid)) - elif dtype == "int16": - s.reset(new numeric_scalar[int16_t](value, valid)) - elif dtype == "int32": - s.reset(new numeric_scalar[int32_t](value, valid)) - elif dtype == "int64": - s.reset(new numeric_scalar[int64_t](value, valid)) - elif dtype == "uint8": - s.reset(new numeric_scalar[uint8_t](value, valid)) - elif dtype == "uint16": - s.reset(new numeric_scalar[uint16_t](value, valid)) - elif dtype == "uint32": - s.reset(new numeric_scalar[uint32_t](value, valid)) - elif dtype == "uint64": - s.reset(new numeric_scalar[uint64_t](value, valid)) - elif dtype == "float32": - s.reset(new numeric_scalar[float](value, valid)) - elif dtype == "float64": - s.reset(new numeric_scalar[double](value, valid)) - elif dtype == "bool": - s.reset(new numeric_scalar[bool](value, valid)) - else: - raise ValueError(f"dtype not supported: {dtype}") - - cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, object value, object dtype, From b75949aa6a42421296b392fcb5e86cfbb9d3a339 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Sep 2023 20:45:48 -0700 Subject: [PATCH 15/54] Construct pylibcudf.Scalar for string types --- python/cudf/cudf/_lib/scalar.pyx | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 5c9a6eeaf02..388508c447b 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -105,12 +105,17 @@ cdef class DeviceScalar: elif isinstance(self._dtype, cudf.StructDtype): _set_struct_from_pydict(self.c_value.c_obj, value, self._dtype, valid) elif pd.api.types.is_string_dtype(self._dtype): - _set_string_from_np_string(self.c_value.c_obj, value, valid) + if value is NA: + value = None + pa_scalar = pa.scalar(value, type=pa.string()) + self.c_value = pylibcudf.Scalar(pa_scalar) + set_dtype = False elif pd.api.types.is_numeric_dtype(self._dtype): if value is NA: value = None pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) self.c_value = pylibcudf.Scalar(pa_scalar) + set_dtype = False elif pd.api.types.is_datetime64_dtype(self._dtype): _set_datetime64_from_np_scalar( self.c_value.c_obj, value, self._dtype, valid @@ -233,11 +238,6 @@ cdef class DeviceScalar: return s -cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): - value = value if valid else "" - s.reset(new string_scalar(value.encode(), valid)) - - cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, object value, object dtype, From 8d460469ab05c091c7d7a2cb7f9a1107a92b2adf Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Sep 2023 21:08:55 -0700 Subject: [PATCH 16/54] Add comment --- python/cudf/cudf/_lib/scalar.pyx | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 388508c447b..fc31ac1062d 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -116,6 +116,11 @@ cdef class DeviceScalar: pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) self.c_value = pylibcudf.Scalar(pa_scalar) set_dtype = False + # TODO: For datetime and timedeltas need to find a way to not overflow + # for large values. The problem is that numpy supports larger values, + # but Python's built-in objects do not, and currently in + # pylibcudf.Scalar.__init__ we rely on as_py to toss the scalar into an + # array because pa.array doesn't understand pa.Scalar instances. elif pd.api.types.is_datetime64_dtype(self._dtype): _set_datetime64_from_np_scalar( self.c_value.c_obj, value, self._dtype, valid From ce164a002d06c11e5cb3dbfe4d160de576b52213 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 7 Sep 2023 12:12:25 -0700 Subject: [PATCH 17/54] Stop using reduction in favor of get_element --- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 32 ++++++++-------------- 1 file changed, 11 insertions(+), 21 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index ef98a82a44e..60803c1c71a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -2,18 +2,15 @@ cimport pyarrow.lib from cython cimport no_gc_clear -from cython.operator cimport dereference from libcpp.utility cimport move import pyarrow.lib from rmm._lib.memory_resource cimport get_current_device_resource -from cudf._lib.cpp cimport aggregation from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.reduce cimport cpp_reduce +from cudf._lib.cpp.copying cimport get_element from cudf._lib.cpp.scalar.scalar cimport scalar -from cudf._lib.cpp.types cimport data_type from .column cimport Column from .interop cimport from_arrow @@ -43,36 +40,29 @@ cdef class Scalar: # DeviceScalar. return - # Convert the value to a cudf object via pyarrow + cdef Scalar s = Scalar.from_pyarrow_scalar(value) + self._data_type = DataType.from_libcudf(s.get().type()) + self.c_obj.swap(s.c_obj) + + @staticmethod + def from_pyarrow_scalar(self, pyarrow.lib.Scalar value): + # Put the scalar into a column so that we can use from_arrow (no scalar + # implementation), then extract the zeroth element. arr = pyarrow.lib.array([value.as_py()], type=value.type) cdef pyarrow.lib.Table pa_tbl = pyarrow.lib.Table.from_arrays( [arr], names=["scalar"] ) cdef Table tbl = from_arrow(pa_tbl) - # TODO: The code below is a pretty hacky way to get a scalar from a - # single row of a column, but for now want to see if we can write a - # generic solution like this that works. If it does, we can consider - # implementing a better approach natively in libcudf. cdef Column col = tbl.columns()[0] cdef column_view cv = col.view() cdef unique_ptr[scalar] c_result - cdef unique_ptr[aggregation.reduce_aggregation] c_agg = ( - aggregation.make_min_aggregation[aggregation.reduce_aggregation]() - ) - cdef data_type c_type = col.type().c_obj with nogil: - c_result = move(cpp_reduce( - cv, - dereference(c_agg), - c_type - )) + c_result = move(get_element(cv, 0)) - cdef Scalar s = Scalar.from_libcudf(move(c_result)) - self._data_type = DataType.from_libcudf(s.get().type()) - self.c_obj.swap(s.c_obj) + return Scalar.from_libcudf(move(c_result)) cdef const scalar* get(self) except *: return self.c_obj.get() From a8d8cb602dfc73bf4771113b1c7a3159616ad2ec Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 09:28:46 -0700 Subject: [PATCH 18/54] Move construction from pyarrow scalar to a factory --- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 19 ++++++++----------- python/cudf/cudf/_lib/scalar.pyx | 4 ++-- 2 files changed, 10 insertions(+), 13 deletions(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 60803c1c71a..74475f803bb 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -31,21 +31,18 @@ cdef class Scalar: # than pylibcudf.Column because it can be a true wrapper around a libcudf # column - def __init__(self, pyarrow.lib.Scalar value=None): + def __cinit__(self, *args, **kwargs): self.mr = get_current_device_resource() - if value is None: - # TODO: This early return is not something we really want to - # support, but it here for now to ease the transition of - # DeviceScalar. - return - - cdef Scalar s = Scalar.from_pyarrow_scalar(value) - self._data_type = DataType.from_libcudf(s.get().type()) - self.c_obj.swap(s.c_obj) + def __init__(self, pyarrow.lib.Scalar value=None): + # TODO: This case is not something we really want to + # support, but it here for now to ease the transition of + # DeviceScalar. + if value is not None: + raise ValueError("Scalar should be constructed with a factory") @staticmethod - def from_pyarrow_scalar(self, pyarrow.lib.Scalar value): + def from_pyarrow_scalar(pyarrow.lib.Scalar value): # Put the scalar into a column so that we can use from_arrow (no scalar # implementation), then extract the zeroth element. arr = pyarrow.lib.array([value.as_py()], type=value.type) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index fc31ac1062d..8ceda0f887d 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -108,13 +108,13 @@ cdef class DeviceScalar: if value is NA: value = None pa_scalar = pa.scalar(value, type=pa.string()) - self.c_value = pylibcudf.Scalar(pa_scalar) + self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) set_dtype = False elif pd.api.types.is_numeric_dtype(self._dtype): if value is NA: value = None pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) - self.c_value = pylibcudf.Scalar(pa_scalar) + self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) set_dtype = False # TODO: For datetime and timedeltas need to find a way to not overflow # for large values. The problem is that numpy supports larger values, From 3fd071e17e048453db657805c60475eb557313b0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 11:57:07 -0700 Subject: [PATCH 19/54] Implement from_arrow at the C level --- cpp/include/cudf/interop.hpp | 12 ++ cpp/src/interop/from_arrow.cu | 119 ++++++++++++++++++++ python/cudf/cudf/_lib/cpp/interop.pxd | 6 +- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 19 ++++ python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 26 +---- 5 files changed, 158 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index e210179b147..da3380e1d44 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -145,5 +145,17 @@ std::unique_ptr from_arrow( arrow::Table const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create `cudf::table` from given arrow Scalar input + * + * @param input arrow:Scalar that needs to be converted to `cudf::scalar` + * @param mr Device memory resource used to allocate `cudf::scalar` + * @return cudf scalar generated from given arrow Scalar + */ + +std::unique_ptr from_arrow( + arrow::Scalar const& input, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group } // namespace cudf diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 30cfee97fd8..5ad30d29868 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -13,6 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. */ +#include +#include #include #include #include @@ -472,4 +474,121 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, return detail::from_arrow(input_table, cudf::get_default_stream(), mr); } +template +constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, + Functor f, + Ts&&... args) +{ + switch (dtype.id()) { + case arrow::Type::INT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::FLOAT: + return f.template operator()(std::forward(args)...); + case arrow::Type::DOUBLE: + return f.template operator()(std::forward(args)...); + case arrow::Type::BOOL: + return f.template operator()(std::forward(args)...); + // case arrow::Type::TIMESTAMP_DAYS: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::TIMESTAMP_SECONDS: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::TIMESTAMP_MILLISECONDS: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::TIMESTAMP_MICROSECONDS: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::TIMESTAMP: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DURATION_DAYS: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DURATION_SECONDS: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DURATION_MILLISECONDS: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DURATION_MICROSECONDS: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DURATION_NANOSECONDS: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DICTIONARY32: + // return f.template operator()( + // std::forward(args)...); + case arrow::Type::STRING: + return f.template operator()(std::forward(args)...); + // case arrow::Type::LIST: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DECIMAL32: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DECIMAL64: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DECIMAL128: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::STRUCT: + // return f.template operator()( + // std::forward(args)...); + default: { + CUDF_FAIL("Invalid type."); + } + } +} + +struct ArrowArrayGenerator { + template + auto operator()(arrow::Scalar const& scalar) + { + // Get a builder for the scalar type + typename arrow::TypeTraits::BuilderType builder; + + auto status = builder.AppendScalar(scalar); + if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } + + auto maybe_array = builder.Finish(); + if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } + return *maybe_array; + } +}; + +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::mr::device_memory_resource* mr) +{ + auto array = arrow_type_dispatcher(*input.type, ArrowArrayGenerator{}, input); + + auto field = arrow::field("", input.type); + + auto table = arrow::Table::Make(arrow::schema({field}), {array}); + + auto cudf_table = from_arrow(*table); + + auto col = cudf_table->get_column(0); + + auto cv = col.view(); + return get_element(cv, 0); +} + } // namespace cudf diff --git a/python/cudf/cudf/_lib/cpp/interop.pxd b/python/cudf/cudf/_lib/cpp/interop.pxd index e81f0d617fb..bbfcd8c4a0a 100644 --- a/python/cudf/cudf/_lib/cpp/interop.pxd +++ b/python/cudf/cudf/_lib/cpp/interop.pxd @@ -1,12 +1,13 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from pyarrow.lib cimport CTable +from pyarrow.lib cimport CScalar, CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -24,6 +25,7 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ ) except + cdef unique_ptr[table] from_arrow(CTable input) except + + cdef unique_ptr[scalar] from_arrow(CScalar input) except + cdef cppclass column_metadata: column_metadata() except + diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index f5161074b15..63f4ed68f7e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -3,14 +3,19 @@ from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from pyarrow.lib cimport ( + CScalar as pa_CScalar, CTable as pa_CTable, + Scalar as pa_Scalar, Table as pa_Table, + pyarrow_unwrap_scalar, pyarrow_unwrap_table, ) from cudf._lib.cpp.interop cimport from_arrow as cpp_from_arrow +from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table +from .scalar cimport Scalar from .table cimport Table @@ -26,3 +31,17 @@ cpdef Table from_arrow( c_result = move(cpp_from_arrow(ctable.get()[0])) return Table.from_libcudf(move(c_result)) + + +cpdef Scalar from_arrow_scalar( + pa_Scalar pyarrow_scalar, +): + cdef shared_ptr[pa_CScalar] cscalar = ( + pyarrow_unwrap_scalar(pyarrow_scalar) + ) + cdef unique_ptr[scalar] c_result + + with nogil: + c_result = move(cpp_from_arrow(cscalar.get()[0])) + + return Scalar.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 74475f803bb..3a365d7ad80 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -2,19 +2,13 @@ cimport pyarrow.lib from cython cimport no_gc_clear -from libcpp.utility cimport move import pyarrow.lib from rmm._lib.memory_resource cimport get_current_device_resource -from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.copying cimport get_element from cudf._lib.cpp.scalar.scalar cimport scalar -from .column cimport Column -from .interop cimport from_arrow -from .table cimport Table from .types cimport DataType @@ -43,23 +37,11 @@ cdef class Scalar: @staticmethod def from_pyarrow_scalar(pyarrow.lib.Scalar value): - # Put the scalar into a column so that we can use from_arrow (no scalar - # implementation), then extract the zeroth element. - arr = pyarrow.lib.array([value.as_py()], type=value.type) - cdef pyarrow.lib.Table pa_tbl = pyarrow.lib.Table.from_arrays( - [arr], names=["scalar"] - ) - cdef Table tbl = from_arrow(pa_tbl) + # Need a local import here to avoid a circular dependency because + # from_arrow_scalar returns a Scalar. + from .interop import from_arrow_scalar - cdef Column col = tbl.columns()[0] - cdef column_view cv = col.view() - - cdef unique_ptr[scalar] c_result - - with nogil: - c_result = move(get_element(cv, 0)) - - return Scalar.from_libcudf(move(c_result)) + return from_arrow_scalar(value) cdef const scalar* get(self) except *: return self.c_obj.get() From 33abe2b41243ca762a74bdd394097af030dd0120 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 12:13:50 -0700 Subject: [PATCH 20/54] Enable datetime --- cpp/src/interop/from_arrow.cu | 7 +++---- python/cudf/cudf/_lib/scalar.pyx | 13 +++++-------- 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 5ad30d29868..55c6cd9e1e6 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -502,9 +502,8 @@ constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, return f.template operator()(std::forward(args)...); case arrow::Type::BOOL: return f.template operator()(std::forward(args)...); - // case arrow::Type::TIMESTAMP_DAYS: - // return f.template operator()( - // std::forward(args)...); + case arrow::Type::TIMESTAMP: + return f.template operator()(std::forward(args)...); // case arrow::Type::TIMESTAMP_SECONDS: // return f.template operator()( // std::forward(args)...); @@ -563,7 +562,7 @@ struct ArrowArrayGenerator { auto operator()(arrow::Scalar const& scalar) { // Get a builder for the scalar type - typename arrow::TypeTraits::BuilderType builder; + typename arrow::TypeTraits::BuilderType builder{scalar.type, arrow::default_memory_pool()}; auto status = builder.AppendScalar(scalar); if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 8ceda0f887d..068c4dd1ef9 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -116,15 +116,12 @@ cdef class DeviceScalar: pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) set_dtype = False - # TODO: For datetime and timedeltas need to find a way to not overflow - # for large values. The problem is that numpy supports larger values, - # but Python's built-in objects do not, and currently in - # pylibcudf.Scalar.__init__ we rely on as_py to toss the scalar into an - # array because pa.array doesn't understand pa.Scalar instances. elif pd.api.types.is_datetime64_dtype(self._dtype): - _set_datetime64_from_np_scalar( - self.c_value.c_obj, value, self._dtype, valid - ) + if value is NA or value is NaT: + value = None + pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) + self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) + set_dtype = False elif pd.api.types.is_timedelta64_dtype(self._dtype): _set_timedelta64_from_np_scalar( self.c_value.c_obj, value, self._dtype, valid From c6b595d5534f2e098941c0e5f19f7360a552066a Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 12:22:25 -0700 Subject: [PATCH 21/54] Enable timedeltas --- cpp/src/interop/from_arrow.cu | 29 ++--------------------------- python/cudf/cudf/_lib/scalar.pyx | 8 +++++--- 2 files changed, 7 insertions(+), 30 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 55c6cd9e1e6..504f99d7d6b 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -504,33 +504,8 @@ constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, return f.template operator()(std::forward(args)...); case arrow::Type::TIMESTAMP: return f.template operator()(std::forward(args)...); - // case arrow::Type::TIMESTAMP_SECONDS: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::TIMESTAMP_MILLISECONDS: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::TIMESTAMP_MICROSECONDS: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::TIMESTAMP: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::DURATION_DAYS: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::DURATION_SECONDS: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::DURATION_MILLISECONDS: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::DURATION_MICROSECONDS: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::DURATION_NANOSECONDS: - // return f.template operator()( - // std::forward(args)...); + case arrow::Type::DURATION: + return f.template operator()(std::forward(args)...); // case arrow::Type::DICTIONARY32: // return f.template operator()( // std::forward(args)...); diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 068c4dd1ef9..42ef880964b 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -123,9 +123,11 @@ cdef class DeviceScalar: self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) set_dtype = False elif pd.api.types.is_timedelta64_dtype(self._dtype): - _set_timedelta64_from_np_scalar( - self.c_value.c_obj, value, self._dtype, valid - ) + if value is NA or value is NaT: + value = None + pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) + self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) + set_dtype = False else: raise ValueError( f"Cannot convert value of type " From ec4b5106fdfd091b8eafb081ae3704927d0c17ae Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 16:20:38 -0700 Subject: [PATCH 22/54] Enable lists --- cpp/src/interop/from_arrow.cu | 51 ++++++++++++++++++++++++++++++-- python/cudf/cudf/_lib/scalar.pyx | 39 +++++++++++------------- 2 files changed, 66 insertions(+), 24 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 504f99d7d6b..2b6f803edf0 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -511,9 +511,8 @@ constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, // std::forward(args)...); case arrow::Type::STRING: return f.template operator()(std::forward(args)...); - // case arrow::Type::LIST: - // return f.template operator()( - // std::forward(args)...); + case arrow::Type::LIST: + return f.template operator()(std::forward(args)...); // case arrow::Type::DECIMAL32: // return f.template operator()( // std::forward(args)...); @@ -548,6 +547,52 @@ struct ArrowArrayGenerator { } }; +struct BuilderGenerator { + template + std::shared_ptr operator()(std::shared_ptr const& type) + { + return std::make_shared::BuilderType>( + type, arrow::default_memory_pool()); + } +}; + +template <> +std::shared_ptr BuilderGenerator::operator()( + std::shared_ptr const& type) +{ + CUDF_FAIL("Not implemented"); +} + +template <> +auto ArrowArrayGenerator::operator()(arrow::Scalar const& scalar) +{ + std::shared_ptr vt = scalar.type; + std::vector> types; + + // TODO: Consider the possibility of list of structs/struct of lists + while (vt->id() == arrow::Type::LIST) { + types.push_back(vt); + // TODO: Error handling on dynamic_cast + vt = dynamic_cast(*vt).value_type(); + } + + // Now walk the list backwards and create the builders. Start with the + // current vt, which is the innermost one and not a list. + auto inner_builder = arrow_type_dispatcher(*vt, BuilderGenerator{}, vt); + std::shared_ptr builder; + for (auto it = types.rbegin(); it != types.rend(); ++it) { + builder = std::make_shared(arrow::default_memory_pool(), inner_builder); + inner_builder = builder; + } + + auto status = builder->AppendScalar(scalar); + if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } + + auto maybe_array = builder->Finish(); + if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } + return *maybe_array; +} + std::unique_ptr from_arrow(arrow::Scalar const& input, rmm::mr::device_memory_resource* mr) { diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 42ef880964b..d8cb21037a7 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -21,7 +21,9 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move import cudf + from cudf._lib cimport pylibcudf + from cudf._lib.types import ( LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, datetime_unit_map, @@ -70,6 +72,14 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns +def nestrepl(lst): + for i, item in enumerate(lst): + if item is NA or item is NaT: + lst[i] = None + elif isinstance(item, list): + nestrepl(item) + + cdef class DeviceScalar: # I think this should be removable, except that currently the way that @@ -100,8 +110,14 @@ cdef class DeviceScalar: _set_decimal_from_scalar( self.c_value.c_obj, value, self._dtype, valid) elif isinstance(self._dtype, cudf.ListDtype): - _set_list_from_pylist( - self.c_value.c_obj, value, self._dtype, valid) + if value is NA: + value = None + else: + nestrepl(value) + + pa_scalar = pa.scalar(value, type=self._dtype.to_arrow()) + self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) + set_dtype = False elif isinstance(self._dtype, cudf.StructDtype): _set_struct_from_pydict(self.c_value.c_obj, value, self._dtype, valid) elif pd.api.types.is_string_dtype(self._dtype): @@ -365,25 +381,6 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype): python_dict = table.to_pydict()["None"][0] return {k: _nested_na_replace([python_dict[k]])[0] for k in python_dict} -cdef _set_list_from_pylist(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - - value = value if valid else [NA] - cdef Column col - if isinstance(dtype.element_type, ListDtype): - pa_type = dtype.element_type.to_arrow() - else: - pa_type = dtype.to_arrow().value_type - col = cudf.core.column.as_column( - pa.array(value, from_pandas=True, type=pa_type) - ) - cdef column_view col_view = col.view() - s.reset( - new list_scalar(col_view, valid) - ) - cdef _get_py_list_from_list(unique_ptr[scalar]& s, dtype): From 636242ba0d33aeefb618a7b948f40c5a1495ebca Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 17:21:15 -0700 Subject: [PATCH 23/54] Enable structs --- cpp/src/interop/from_arrow.cu | 60 +++++++++++++++++++++++++------- python/cudf/cudf/_lib/scalar.pyx | 25 ++++++++++--- 2 files changed, 69 insertions(+), 16 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 2b6f803edf0..5e77d946a48 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -14,6 +14,7 @@ * limitations under the License. */ #include +#include #include #include #include @@ -513,18 +514,17 @@ constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, return f.template operator()(std::forward(args)...); case arrow::Type::LIST: return f.template operator()(std::forward(args)...); - // case arrow::Type::DECIMAL32: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::DECIMAL64: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::DECIMAL128: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::STRUCT: - // return f.template operator()( - // std::forward(args)...); + // case arrow::Type::DECIMAL32: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DECIMAL64: + // return f.template operator()( + // std::forward(args)...); + // case arrow::Type::DECIMAL128: + // return f.template operator()( + // std::forward(args)...); + case arrow::Type::STRUCT: + return f.template operator()(std::forward(args)...); default: { CUDF_FAIL("Invalid type."); } @@ -563,6 +563,13 @@ std::shared_ptr BuilderGenerator::operator() +std::shared_ptr BuilderGenerator::operator()( + std::shared_ptr const& type) +{ + CUDF_FAIL("Not implemented"); +} + template <> auto ArrowArrayGenerator::operator()(arrow::Scalar const& scalar) { @@ -593,6 +600,35 @@ auto ArrowArrayGenerator::operator()(arrow::Scalar const& scala return *maybe_array; } +std::shared_ptr make_struct_builder( + std::shared_ptr const& type) +{ + std::vector> field_builders; + + for (auto i = 0; i < type->num_fields(); ++i) { + auto const vt = type->field(i)->type(); + if (vt->id() == arrow::Type::STRUCT) { + field_builders.push_back(make_struct_builder(vt)); + } else { + field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt)); + } + } + return std::make_shared(type, arrow::default_memory_pool(), field_builders); +} + +template <> +auto ArrowArrayGenerator::operator()(arrow::Scalar const& scalar) +{ + std::shared_ptr builder = make_struct_builder(scalar.type); + + auto status = builder->AppendScalar(scalar); + if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } + + auto maybe_array = builder->Finish(); + if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } + return *maybe_array; +} + std::unique_ptr from_arrow(arrow::Scalar const& input, rmm::mr::device_memory_resource* mr) { diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index d8cb21037a7..9a0a8e6e719 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -72,12 +72,22 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns -def nestrepl(lst): +def nestrepl_list(lst): + # TODO: Account for struct inside list for i, item in enumerate(lst): if item is NA or item is NaT: lst[i] = None elif isinstance(item, list): - nestrepl(item) + nestrepl_list(item) + + +def nestrepl_dict(dct): + # TODO: Account for list inside struct + for k, v in dct.items(): + if v is NA or v is NaT: + dct[k] = None + elif isinstance(v, dict): + nestrepl_dict(v) cdef class DeviceScalar: @@ -113,13 +123,20 @@ cdef class DeviceScalar: if value is NA: value = None else: - nestrepl(value) + nestrepl_list(value) pa_scalar = pa.scalar(value, type=self._dtype.to_arrow()) self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) set_dtype = False elif isinstance(self._dtype, cudf.StructDtype): - _set_struct_from_pydict(self.c_value.c_obj, value, self._dtype, valid) + if value is NA: + value = None + else: + nestrepl_dict(value) + + pa_scalar = pa.scalar(value, type=self._dtype.to_arrow()) + self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) + set_dtype = False elif pd.api.types.is_string_dtype(self._dtype): if value is NA: value = None From 4c08dc34c5bb2519718dff1660a488445e1be11c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 17:43:02 -0700 Subject: [PATCH 24/54] Make list and struct code paths more parallel --- cpp/src/interop/from_arrow.cu | 44 ++++++++++++++++++++--------------- 1 file changed, 25 insertions(+), 19 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 5e77d946a48..31c1ad8b158 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -531,22 +531,6 @@ constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, } } -struct ArrowArrayGenerator { - template - auto operator()(arrow::Scalar const& scalar) - { - // Get a builder for the scalar type - typename arrow::TypeTraits::BuilderType builder{scalar.type, arrow::default_memory_pool()}; - - auto status = builder.AppendScalar(scalar); - if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } - - auto maybe_array = builder.Finish(); - if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } - return *maybe_array; - } -}; - struct BuilderGenerator { template std::shared_ptr operator()(std::shared_ptr const& type) @@ -570,13 +554,28 @@ std::shared_ptr BuilderGenerator::operator() -auto ArrowArrayGenerator::operator()(arrow::Scalar const& scalar) +struct ArrowArrayGenerator { + template + auto operator()(arrow::Scalar const& scalar) + { + // Get a builder for the scalar type + auto builder = arrow_type_dispatcher(*scalar.type, BuilderGenerator{}, scalar.type); + + auto status = builder->AppendScalar(scalar); + if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } + + auto maybe_array = builder->Finish(); + if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } + return *maybe_array; + } +}; + +std::shared_ptr make_list_builder(std::shared_ptr const& type) { - std::shared_ptr vt = scalar.type; std::vector> types; // TODO: Consider the possibility of list of structs/struct of lists + std::shared_ptr vt = type; while (vt->id() == arrow::Type::LIST) { types.push_back(vt); // TODO: Error handling on dynamic_cast @@ -591,6 +590,13 @@ auto ArrowArrayGenerator::operator()(arrow::Scalar const& scala builder = std::make_shared(arrow::default_memory_pool(), inner_builder); inner_builder = builder; } + return builder; +} + +template <> +auto ArrowArrayGenerator::operator()(arrow::Scalar const& scalar) +{ + std::shared_ptr builder = make_list_builder(scalar.type); auto status = builder->AppendScalar(scalar); if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } From 50bbdc00535b4795ef68fd4c30cfc87702498870 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 18:06:43 -0700 Subject: [PATCH 25/54] Unify builders and use recursion for all paths --- cpp/src/interop/from_arrow.cu | 103 ++++++++++------------------------ 1 file changed, 29 insertions(+), 74 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 31c1ad8b158..5d63046c60d 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -554,91 +555,45 @@ std::shared_ptr BuilderGenerator::operator() - auto operator()(arrow::Scalar const& scalar) - { - // Get a builder for the scalar type - auto builder = arrow_type_dispatcher(*scalar.type, BuilderGenerator{}, scalar.type); - - auto status = builder->AppendScalar(scalar); - if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } - - auto maybe_array = builder->Finish(); - if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } - return *maybe_array; - } -}; - -std::shared_ptr make_list_builder(std::shared_ptr const& type) -{ - std::vector> types; - - // TODO: Consider the possibility of list of structs/struct of lists - std::shared_ptr vt = type; - while (vt->id() == arrow::Type::LIST) { - types.push_back(vt); - // TODO: Error handling on dynamic_cast - vt = dynamic_cast(*vt).value_type(); - } - - // Now walk the list backwards and create the builders. Start with the - // current vt, which is the innermost one and not a list. - auto inner_builder = arrow_type_dispatcher(*vt, BuilderGenerator{}, vt); - std::shared_ptr builder; - for (auto it = types.rbegin(); it != types.rend(); ++it) { - builder = std::make_shared(arrow::default_memory_pool(), inner_builder); - inner_builder = builder; - } - return builder; -} - -template <> -auto ArrowArrayGenerator::operator()(arrow::Scalar const& scalar) +std::shared_ptr make_builder(std::shared_ptr const& type) { - std::shared_ptr builder = make_list_builder(scalar.type); - - auto status = builder->AppendScalar(scalar); - if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } - - auto maybe_array = builder->Finish(); - if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } - return *maybe_array; -} - -std::shared_ptr make_struct_builder( - std::shared_ptr const& type) -{ - std::vector> field_builders; - - for (auto i = 0; i < type->num_fields(); ++i) { - auto const vt = type->field(i)->type(); - if (vt->id() == arrow::Type::STRUCT) { - field_builders.push_back(make_struct_builder(vt)); - } else { - field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt)); + switch (type->id()) { + case arrow::Type::STRUCT: { + std::vector> field_builders; + + for (auto i = 0; i < type->num_fields(); ++i) { + auto const vt = type->field(i)->type(); + if (vt->id() == arrow::Type::STRUCT || vt->id() == arrow::Type::LIST) { + field_builders.push_back(make_builder(vt)); + } else { + field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt)); + } + } + return std::make_shared( + type, arrow::default_memory_pool(), field_builders); + } + case arrow::Type::LIST: { + return std::make_shared(arrow::default_memory_pool(), + make_builder(type->field(0)->type())); + } + default: { + return arrow_type_dispatcher(*type, BuilderGenerator{}, type); } } - return std::make_shared(type, arrow::default_memory_pool(), field_builders); } -template <> -auto ArrowArrayGenerator::operator()(arrow::Scalar const& scalar) +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::mr::device_memory_resource* mr) { - std::shared_ptr builder = make_struct_builder(scalar.type); + // Get a builder for the scalar type + auto builder = make_builder(input.type); - auto status = builder->AppendScalar(scalar); + auto status = builder->AppendScalar(input); if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } auto maybe_array = builder->Finish(); if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } - return *maybe_array; -} - -std::unique_ptr from_arrow(arrow::Scalar const& input, - rmm::mr::device_memory_resource* mr) -{ - auto array = arrow_type_dispatcher(*input.type, ArrowArrayGenerator{}, input); + auto array = *maybe_array; auto field = arrow::field("", input.type); From 7b34df464edb6261e2e2d0a97bc790e47a4c7db6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 19:24:12 -0700 Subject: [PATCH 26/54] Enable decimals --- cpp/src/interop/from_arrow.cu | 11 +---- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 55 ++++++++++++++++++++-- python/cudf/cudf/_lib/scalar.pyx | 19 ++++++-- 3 files changed, 68 insertions(+), 17 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 5d63046c60d..21f49a327b3 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -515,15 +515,8 @@ constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, return f.template operator()(std::forward(args)...); case arrow::Type::LIST: return f.template operator()(std::forward(args)...); - // case arrow::Type::DECIMAL32: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::DECIMAL64: - // return f.template operator()( - // std::forward(args)...); - // case arrow::Type::DECIMAL128: - // return f.template operator()( - // std::forward(args)...); + case arrow::Type::DECIMAL128: + return f.template operator()(std::forward(args)...); case arrow::Type::STRUCT: return f.template operator()(std::forward(args)...); default: { diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 3a365d7ad80..0821258ac6c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -2,14 +2,21 @@ cimport pyarrow.lib from cython cimport no_gc_clear +from libcpp.memory cimport unique_ptr import pyarrow.lib from rmm._lib.memory_resource cimport get_current_device_resource -from cudf._lib.cpp.scalar.scalar cimport scalar +from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) -from .types cimport DataType +from .types cimport DataType, type_id # The DeviceMemoryResource attribute could be released prematurely @@ -36,12 +43,52 @@ cdef class Scalar: raise ValueError("Scalar should be constructed with a factory") @staticmethod - def from_pyarrow_scalar(pyarrow.lib.Scalar value): + def from_pyarrow_scalar(pyarrow.lib.Scalar value, DataType data_type=None): + # Allow passing a dtype, but only for the purpose of decimals for now + # Need a local import here to avoid a circular dependency because # from_arrow_scalar returns a Scalar. from .interop import from_arrow_scalar - return from_arrow_scalar(value) + cdef Scalar s = from_arrow_scalar(value) + if s.type().id() != type_id.DECIMAL128: + if data_type is not None: + raise ValueError( + "dtype may not be passed for non-decimal types" + ) + return s + + if data_type is None: + raise ValueError( + "Decimal scalars must be constructed with a dtype" + ) + + cdef type_id tid = data_type.id() + if tid not in (type_id.DECIMAL32, type_id.DECIMAL64, type_id.DECIMAL128): + raise ValueError( + "Decimal scalars may only be cast to decimals" + ) + + if tid == type_id.DECIMAL128: + return s + + if tid == type_id.DECIMAL32: + s.c_obj.reset( + new fixed_point_scalar[decimal32]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + elif tid == type_id.DECIMAL64: + s.c_obj.reset( + new fixed_point_scalar[decimal64]( + ( s.c_obj.get()).value(), + scale_type(-value.type.scale), + s.c_obj.get().is_valid() + ) + ) + return s cdef const scalar* get(self) except *: return self.c_obj.get() diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 9a0a8e6e719..23fd455b623 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -22,7 +22,7 @@ from libcpp.utility cimport move import cudf -from cudf._lib cimport pylibcudf +from cudf._lib.pylibcudf.types cimport type_id from cudf._lib.types import ( LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, @@ -113,12 +113,23 @@ cdef class DeviceScalar: A NumPy dtype. """ self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') - valid = not _is_null_host_scalar(value) set_dtype = True + cdef type_id tid if isinstance(self._dtype, cudf.core.dtypes.DecimalDtype): - _set_decimal_from_scalar( - self.c_value.c_obj, value, self._dtype, valid) + if value is NA: + value = None + pa_scalar = pa.scalar(value, type=self._dtype.to_arrow()) + + tid = type_id.DECIMAL128 + if isinstance(self._dtype, cudf.core.dtypes.Decimal32Dtype): + tid = type_id.DECIMAL32 + elif isinstance(self._dtype, cudf.core.dtypes.Decimal64Dtype): + tid = type_id.DECIMAL64 + dt = pylibcudf.DataType(tid, -self._dtype.scale) + + self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar, dt) + set_dtype = False elif isinstance(self._dtype, cudf.ListDtype): if value is NA: value = None From 392407fb046de6d50287df1aff99e7c8f0cfba1c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 19:27:42 -0700 Subject: [PATCH 27/54] Some cleanup --- python/cudf/cudf/_lib/scalar.pyx | 67 ++------------------------------ python/cudf/cudf/utils/dtypes.py | 18 --------- 2 files changed, 3 insertions(+), 82 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 23fd455b623..44e1255e9c8 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -37,7 +37,7 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id -from cudf._lib.interop import from_arrow, to_arrow +from cudf._lib.interop import to_arrow cimport cudf._lib.cpp.types as libcudf_types from cudf._lib cimport pylibcudf @@ -51,12 +51,7 @@ from cudf._lib.cpp.scalar.scalar cimport ( struct_scalar, timestamp_scalar, ) -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) +from cudf._lib.cpp.wrappers.decimals cimport decimal32, decimal64, decimal128 from cudf._lib.cpp.wrappers.durations cimport ( duration_ms, duration_ns, @@ -69,7 +64,7 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) -from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns +from cudf._lib.utils cimport columns_from_table_view def nestrepl_list(lst): @@ -338,62 +333,6 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, else: raise ValueError(f"dtype not supported: {dtype}") -cdef _set_decimal_from_scalar(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 - if isinstance(dtype, cudf.Decimal64Dtype): - s.reset( - new fixed_point_scalar[decimal64]( - np.int64(value), scale_type(-dtype.scale), valid - ) - ) - elif isinstance(dtype, cudf.Decimal32Dtype): - s.reset( - new fixed_point_scalar[decimal32]( - np.int32(value), scale_type(-dtype.scale), valid - ) - ) - elif isinstance(dtype, cudf.Decimal128Dtype): - s.reset( - new fixed_point_scalar[decimal128]( - value, scale_type(-dtype.scale), valid - ) - ) - else: - raise ValueError(f"dtype not supported: {dtype}") - -cdef _set_struct_from_pydict(unique_ptr[scalar]& s, - object value, - object dtype, - bool valid=True): - arrow_schema = dtype.to_arrow() - columns = [str(i) for i in range(len(arrow_schema))] - if valid: - pyarrow_table = pa.Table.from_arrays( - [ - pa.array([value[f.name]], from_pandas=True, type=f.type) - for f in arrow_schema - ], - names=columns - ) - else: - pyarrow_table = pa.Table.from_arrays( - [ - pa.array([NA], from_pandas=True, type=f.type) - for f in arrow_schema - ], - names=columns - ) - - data = from_arrow(pyarrow_table) - cdef table_view struct_view = table_view_from_columns(data) - - s.reset( - new struct_scalar(struct_view, valid) - ) - cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype): if not s.get()[0].is_valid(): return NA diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 1b94db75340..73ea8e2cfc4 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -463,24 +463,6 @@ def _get_nan_for_dtype(dtype): return np.float64("nan") -def _decimal_to_int64(decimal: Decimal) -> int: - """ - Scale a Decimal such that the result is the integer - that would result from removing the decimal point. - - Examples - -------- - >>> _decimal_to_int64(Decimal('1.42')) - 142 - >>> _decimal_to_int64(Decimal('0.0042')) - 42 - >>> _decimal_to_int64(Decimal('-1.004201')) - -1004201 - - """ - return int(f"{decimal:0f}".replace(".", "")) - - def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): error = TypeError( f"{op} not supported between {dtype_l} and {dtype_r} scalars" From f5def95789ea268f12d0c0653055b9e3f0720343 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 19:32:29 -0700 Subject: [PATCH 28/54] Combine nestrepl --- python/cudf/cudf/_lib/scalar.pyx | 33 +++++++++++++++----------------- 1 file changed, 15 insertions(+), 18 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 44e1255e9c8..3b0ebbf93f2 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -67,22 +67,19 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( from cudf._lib.utils cimport columns_from_table_view -def nestrepl_list(lst): - # TODO: Account for struct inside list - for i, item in enumerate(lst): - if item is NA or item is NaT: - lst[i] = None - elif isinstance(item, list): - nestrepl_list(item) - - -def nestrepl_dict(dct): - # TODO: Account for list inside struct - for k, v in dct.items(): - if v is NA or v is NaT: - dct[k] = None - elif isinstance(v, dict): - nestrepl_dict(v) +def nestrepl(obj): + if isinstance(obj, list): + for i, item in enumerate(obj): + if item is NA or item is NaT: + obj[i] = None + elif isinstance(item, (dict, list)): + nestrepl(item) + elif isinstance(obj, dict): + for k, v in obj.items(): + if v is NA or v is NaT: + obj[k] = None + elif isinstance(v, (dict, list)): + nestrepl(v) cdef class DeviceScalar: @@ -129,7 +126,7 @@ cdef class DeviceScalar: if value is NA: value = None else: - nestrepl_list(value) + nestrepl(value) pa_scalar = pa.scalar(value, type=self._dtype.to_arrow()) self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) @@ -138,7 +135,7 @@ cdef class DeviceScalar: if value is NA: value = None else: - nestrepl_dict(value) + nestrepl(value) pa_scalar = pa.scalar(value, type=self._dtype.to_arrow()) self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) From 16cef1504fc71e7b624e734bafc0c1576616008d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 20:05:15 -0700 Subject: [PATCH 29/54] Unify constructor --- python/cudf/cudf/_lib/scalar.pyx | 99 ++++++++++---------------------- 1 file changed, 31 insertions(+), 68 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 3b0ebbf93f2..7211fe90317 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -67,19 +67,21 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( from cudf._lib.utils cimport columns_from_table_view -def nestrepl(obj): +# TODO: Check if this could replace _nested_na_replace +def _replace_nested_nulls(obj): if isinstance(obj, list): for i, item in enumerate(obj): - if item is NA or item is NaT: + # TODO: Check if this should use _is_null_host_scalar + if cudf.utils.utils.is_na_like(item): obj[i] = None elif isinstance(item, (dict, list)): - nestrepl(item) + _replace_nested_nulls(item) elif isinstance(obj, dict): for k, v in obj.items(): - if v is NA or v is NaT: + if cudf.utils.utils.is_na_like(v): obj[k] = None elif isinstance(v, (dict, list)): - nestrepl(v) + _replace_nested_nulls(v) cdef class DeviceScalar: @@ -104,75 +106,36 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') - set_dtype = True + dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') - cdef type_id tid - if isinstance(self._dtype, cudf.core.dtypes.DecimalDtype): - if value is NA: - value = None - pa_scalar = pa.scalar(value, type=self._dtype.to_arrow()) + if cudf.utils.utils.is_na_like(value): + value = None + else: + _replace_nested_nulls(value) + + if isinstance(dtype, cudf.core.dtypes._BaseDtype): + pa_type = dtype.to_arrow() + elif pd.api.types.is_string_dtype(dtype): + # Have to manually convert object types, which we use internally + # for strings but pyarrow only supports as unicode 'U' + pa_type = pa.string() + else: + pa_type = pa.from_numpy_dtype(dtype) + pa_scalar = pa.scalar(value, type=pa_type) + + cdef type_id tid + data_type = None + if isinstance(dtype, cudf.core.dtypes.DecimalDtype): tid = type_id.DECIMAL128 - if isinstance(self._dtype, cudf.core.dtypes.Decimal32Dtype): + if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): tid = type_id.DECIMAL32 - elif isinstance(self._dtype, cudf.core.dtypes.Decimal64Dtype): + elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): tid = type_id.DECIMAL64 - dt = pylibcudf.DataType(tid, -self._dtype.scale) - - self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar, dt) - set_dtype = False - elif isinstance(self._dtype, cudf.ListDtype): - if value is NA: - value = None - else: - nestrepl(value) - - pa_scalar = pa.scalar(value, type=self._dtype.to_arrow()) - self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) - set_dtype = False - elif isinstance(self._dtype, cudf.StructDtype): - if value is NA: - value = None - else: - nestrepl(value) - - pa_scalar = pa.scalar(value, type=self._dtype.to_arrow()) - self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) - set_dtype = False - elif pd.api.types.is_string_dtype(self._dtype): - if value is NA: - value = None - pa_scalar = pa.scalar(value, type=pa.string()) - self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) - set_dtype = False - elif pd.api.types.is_numeric_dtype(self._dtype): - if value is NA: - value = None - pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) - self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) - set_dtype = False - elif pd.api.types.is_datetime64_dtype(self._dtype): - if value is NA or value is NaT: - value = None - pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) - self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) - set_dtype = False - elif pd.api.types.is_timedelta64_dtype(self._dtype): - if value is NA or value is NaT: - value = None - pa_scalar = pa.scalar(value, type=pa.from_numpy_dtype(self._dtype)) - self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar) - set_dtype = False - else: - raise ValueError( - f"Cannot convert value of type " - f"{type(value).__name__} to cudf scalar" - ) + data_type = pylibcudf.DataType(tid, -dtype.scale) - cdef libcudf_types.data_type cdtype = self.get_raw_ptr()[0].type() - if set_dtype: - self.c_value._data_type = pylibcudf.DataType.from_libcudf(cdtype) + self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar, data_type) + self._dtype = dtype def _to_host_scalar(self): if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): From edcca20f9a3f419aa89bf94abcb7eede04e8c2c4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 8 Sep 2023 20:17:24 -0700 Subject: [PATCH 30/54] Copy to avoid overwriting inputs --- python/cudf/cudf/_lib/scalar.pyx | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 7211fe90317..b95a9e1fe6b 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. +import copy import decimal import numpy as np @@ -111,6 +112,11 @@ cdef class DeviceScalar: if cudf.utils.utils.is_na_like(value): value = None else: + # TODO: For now we always deepcopy the input value to avoid + # overwriting the input values when replacing nulls. Since it's + # just host values it's not that expensive, but we could consider + # alternatives. + value = copy.deepcopy(value) _replace_nested_nulls(value) if isinstance(dtype, cudf.core.dtypes._BaseDtype): From c81e158748dd2837ba098cbbaaaf4c1f883347c0 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 10 Sep 2023 19:49:06 -0700 Subject: [PATCH 31/54] Add error check for nested null in scalar and fix test --- cpp/src/interop/from_arrow.cu | 10 +++++++- python/cudf/cudf/tests/test_struct.py | 35 +++++++++++++++++++++++++-- 2 files changed, 42 insertions(+), 3 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 21f49a327b3..d309b6dfc57 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -39,6 +39,7 @@ #include #include +#include #include namespace cudf { @@ -582,7 +583,14 @@ std::unique_ptr from_arrow(arrow::Scalar const& input, auto builder = make_builder(input.type); auto status = builder->AppendScalar(input); - if (status != arrow::Status::OK()) { CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); } + if (status != arrow::Status::OK()) { + if (status == arrow::Status::NotImplemented()) { + // The only known failure case here is for nulls + CUDF_FAIL("Cannot create untyped null scalars or nested types with untyped null leaf nodes", + std::invalid_argument); + } + CUDF_FAIL("Arrow ArrayBuilder::AppendScalar failed"); + } auto maybe_array = builder->Finish(); if (!maybe_array.ok()) { CUDF_FAIL("Arrow ArrayBuilder::Finish failed"); } diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index a3593e55b97..ce6dc587320 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -150,9 +150,7 @@ def test_struct_setitem(data, item): "data", [ {"a": 1, "b": "rapids", "c": [1, 2, 3, 4]}, - {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, {"a": "Hello"}, - {"b": [], "c": [1, 2, 3]}, ], ) def test_struct_scalar_host_construction(data): @@ -161,6 +159,39 @@ def test_struct_scalar_host_construction(data): assert list(slr.device_value.value.values()) == list(data.values()) +@pytest.mark.parametrize( + ("data", "dtype"), + [ + ( + {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, + cudf.StructDtype( + { + "a": np.dtype(np.int64), + "b": np.dtype(np.str_), + "c": cudf.ListDtype(np.dtype(np.int64)), + "d": np.dtype(np.int64), + } + ), + ), + ( + {"b": [], "c": [1, 2, 3]}, + cudf.StructDtype( + { + "b": cudf.ListDtype(np.dtype(np.int64)), + "c": cudf.ListDtype(np.dtype(np.int64)), + } + ), + ), + ], +) +def test_struct_scalar_host_construction_no_dtype_inference(data, dtype): + # cudf cannot infer the dtype of the scalar when it contains only nulls or + # is empty. + slr = cudf.Scalar(data, dtype=dtype) + assert slr.value == data + assert list(slr.device_value.value.values()) == list(data.values()) + + def test_struct_scalar_null(): slr = cudf.Scalar(cudf.NA, dtype=StructDtype) assert slr.device_value.value is cudf.NA From 14b2f86813c01cf862ff082e5662f0a42bf75ff5 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 10 Sep 2023 20:40:05 -0700 Subject: [PATCH 32/54] Initial implementation of to_arrow --- cpp/include/cudf/interop.hpp | 11 +++++++++++ cpp/src/interop/from_arrow.cu | 2 +- cpp/src/interop/to_arrow.cu | 19 +++++++++++++++++++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index da3380e1d44..77ee1a1ff1d 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -133,6 +133,17 @@ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); +/** + * @brief Create `arrow::Scalar` from cudf scalar `input` + * + * Converts the `cudf::scalar` to `arrow::Scalar`. + * + * @param input scalar that needs to be converted to arrow Scalar + * @param ar_mr arrow memory pool to allocate memory for arrow Scalar + * @return arrow Scalar generated from `input` + */ +std::shared_ptr to_arrow(cudf::scalar const& input, + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `cudf::table` from given arrow Table input * diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index d309b6dfc57..8ff361d7e04 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -584,7 +584,7 @@ std::unique_ptr from_arrow(arrow::Scalar const& input, auto status = builder->AppendScalar(input); if (status != arrow::Status::OK()) { - if (status == arrow::Status::NotImplemented()) { + if (status.IsNotImplemented()) { // The only known failure case here is for nulls CUDF_FAIL("Cannot create untyped null scalars or nested types with untyped null leaf nodes", std::invalid_argument); diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 958a2fcb95f..6ab6c7cd3c7 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -16,9 +16,12 @@ #include #include +#include #include #include +#include #include +#include #include #include #include @@ -37,6 +40,8 @@ #include #include +#include "cudf/filling.hpp" +#include "cudf/utilities/error.hpp" #include "detail/arrow_allocator.hpp" namespace cudf { @@ -413,4 +418,18 @@ std::shared_ptr to_arrow(table_view input, return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); } +std::shared_ptr to_arrow(cudf::scalar const& input, arrow::MemoryPool* ar_mr) +{ + auto stream = cudf::get_default_stream(); + auto column = cudf::empty_like(input); + + auto view = column->mutable_view(); + cudf::fill_in_place(view, 0, 1, input); + cudf::table_view tv{{column->view()}}; + auto arrow_table = cudf::to_arrow(tv); + auto ac = arrow_table->column(0); + auto maybe_scalar = ac->GetScalar(0); + if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } + return maybe_scalar.ValueOrDie(); +} } // namespace cudf From cd80fe0d88f1332caacbbabab26c35667231b1e7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 10 Sep 2023 20:46:00 -0700 Subject: [PATCH 33/54] Add from_arrow_scalar to Cython API --- python/cudf/cudf/_lib/pylibcudf/interop.pxd | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd index dd518aeea52..ab464d7e465 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -1,10 +1,15 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from pyarrow.lib cimport Table as pa_Table +from pyarrow.lib cimport Scalar as pa_Scalar, Table as pa_Table +from .scalar cimport Scalar from .table cimport Table cpdef Table from_arrow( pa_Table pyarrow_table, ) + +cpdef Scalar from_arrow_scalar( + pa_Scalar pyarrow_scalar, +) From 5fffc4f9c440c2b53467361aa764c2a3bc0034c6 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Sep 2023 10:59:07 -0700 Subject: [PATCH 34/54] Add Python wrappers for to_arrow --- python/cudf/cudf/_lib/cpp/interop.pxd | 11 +++++++++ python/cudf/cudf/_lib/pylibcudf/interop.pxd | 4 ++++ python/cudf/cudf/_lib/pylibcudf/interop.pyx | 26 ++++++++++++++++++++- 3 files changed, 40 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/interop.pxd b/python/cudf/cudf/_lib/cpp/interop.pxd index bbfcd8c4a0a..d6891f31646 100644 --- a/python/cudf/cudf/_lib/cpp/interop.pxd +++ b/python/cudf/cudf/_lib/cpp/interop.pxd @@ -33,7 +33,18 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ string name vector[column_metadata] children_meta + # TODO: Adding this for pylibcudf because in pylibcudf we don't have column + # names. However we need to figure out how this will propagate up when cudf + # starts using pylibcudf for interop functionality. + cdef shared_ptr[CTable] to_arrow( + table_view input, + ) except + + cdef shared_ptr[CTable] to_arrow( table_view input, vector[column_metadata] metadata, ) except + + + cdef shared_ptr[CScalar] to_arrow( + const scalar& input, + ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd index ab464d7e465..a547813a9d2 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -13,3 +13,7 @@ cpdef Table from_arrow( cpdef Scalar from_arrow_scalar( pa_Scalar pyarrow_scalar, ) + +cpdef pa_Table to_arrow(Table tbl) + +cpdef pa_Scalar to_arrow_scalar(Scalar slr) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 63f4ed68f7e..426534a1fed 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -1,5 +1,6 @@ # Copyright (c) 2023, NVIDIA CORPORATION. +from cython.operator cimport dereference from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move from pyarrow.lib cimport ( @@ -9,9 +10,14 @@ from pyarrow.lib cimport ( Table as pa_Table, pyarrow_unwrap_scalar, pyarrow_unwrap_table, + pyarrow_wrap_scalar, + pyarrow_wrap_table, ) -from cudf._lib.cpp.interop cimport from_arrow as cpp_from_arrow +from cudf._lib.cpp.interop cimport ( + from_arrow as cpp_from_arrow, + to_arrow as cpp_to_arrow, +) from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table @@ -45,3 +51,21 @@ cpdef Scalar from_arrow_scalar( c_result = move(cpp_from_arrow(cscalar.get()[0])) return Scalar.from_libcudf(move(c_result)) + + +cpdef pa_Table to_arrow(Table tbl): + cdef shared_ptr[pa_CTable] c_result + + with nogil: + c_result = move(cpp_to_arrow(tbl.view())) + + return pyarrow_wrap_table(c_result) + + +cpdef pa_Scalar to_arrow_scalar(Scalar slr): + cdef shared_ptr[pa_CScalar] c_result + + with nogil: + c_result = move(cpp_to_arrow(dereference(slr.c_obj.get()))) + + return pyarrow_wrap_scalar(c_result) From cb52d2b3d5b0229392115de34400b6f659c9fa71 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Sep 2023 11:04:53 -0700 Subject: [PATCH 35/54] Add pylibcudf converter --- python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 2 ++ python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 4 ++++ 2 files changed, 6 insertions(+) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd index 85d57305622..87932a78f6e 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -26,3 +26,5 @@ cdef class Scalar: @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) + + cpdef to_pyarrow_scalar(self) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 0821258ac6c..05306126d2c 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -90,6 +90,10 @@ cdef class Scalar: ) return s + cpdef to_pyarrow_scalar(self): + from interop import to_arrow_scalar + return to_arrow_scalar(self) + cdef const scalar* get(self) except *: return self.c_obj.get() From 643d992004793d7e06b03e92a0890f58e436c213 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Sep 2023 12:24:31 -0700 Subject: [PATCH 36/54] Fix bugs in impl --- cpp/src/interop/to_arrow.cu | 7 ++++--- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 2 +- python/cudf/cudf/_lib/scalar.pyx | 4 ++++ 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 6ab6c7cd3c7..ffe402504e7 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -420,13 +420,14 @@ std::shared_ptr to_arrow(table_view input, std::shared_ptr to_arrow(cudf::scalar const& input, arrow::MemoryPool* ar_mr) { - auto stream = cudf::get_default_stream(); - auto column = cudf::empty_like(input); + auto stream = cudf::get_default_stream(); + auto tmp_column = cudf::empty_like(input); + auto column = cudf::allocate_like(tmp_column->view(), 1); auto view = column->mutable_view(); cudf::fill_in_place(view, 0, 1, input); cudf::table_view tv{{column->view()}}; - auto arrow_table = cudf::to_arrow(tv); + auto arrow_table = cudf::to_arrow(tv, {column_metadata{""}}); auto ac = arrow_table->column(0); auto maybe_scalar = ac->GetScalar(0); if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 05306126d2c..0b3e429c9c8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -91,7 +91,7 @@ cdef class Scalar: return s cpdef to_pyarrow_scalar(self): - from interop import to_arrow_scalar + from .interop import to_arrow_scalar return to_arrow_scalar(self) cdef const scalar* get(self) except *: diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index b95a9e1fe6b..140f5c68171 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -164,6 +164,10 @@ cdef class DeviceScalar: ) return result + # TODO: This is just here for testing and should be removed. + def get(self): + return self.c_value + @property def dtype(self): """ From 309cdfa4809676c41043d85d3a8ab7421fe901f8 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Sep 2023 15:56:44 -0700 Subject: [PATCH 37/54] Make to_arrow work for all column types --- cpp/src/interop/to_arrow.cu | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index ffe402504e7..b850e767b4d 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -15,6 +15,7 @@ */ #include +#include #include #include #include @@ -40,8 +41,6 @@ #include #include -#include "cudf/filling.hpp" -#include "cudf/utilities/error.hpp" #include "detail/arrow_allocator.hpp" namespace cudf { @@ -420,12 +419,8 @@ std::shared_ptr to_arrow(table_view input, std::shared_ptr to_arrow(cudf::scalar const& input, arrow::MemoryPool* ar_mr) { - auto stream = cudf::get_default_stream(); - auto tmp_column = cudf::empty_like(input); - auto column = cudf::allocate_like(tmp_column->view(), 1); - - auto view = column->mutable_view(); - cudf::fill_in_place(view, 0, 1, input); + auto stream = cudf::get_default_stream(); + auto column = cudf::make_column_from_scalar(input, 1); cudf::table_view tv{{column->view()}}; auto arrow_table = cudf::to_arrow(tv, {column_metadata{""}}); auto ac = arrow_table->column(0); From 4e4bc6441cc3fe2130170ecdfd7213059b52117b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 11 Sep 2023 18:15:35 -0700 Subject: [PATCH 38/54] Add to_arrow overload for 32-bit decimal --- cpp/src/interop/to_arrow.cu | 40 +++++++++++++++++++++++++++++++++++++ 1 file changed, 40 insertions(+) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index b850e767b4d..2d7f9ca4d55 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -143,6 +143,46 @@ struct dispatch_to_arrow { } }; +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id, + column_metadata const&, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + using DeviceType = int32_t; + size_type const BIT_WIDTH_RATIO = 4; // Array::Type:type::DECIMAL (128) / int32_t + + rmm::device_uvector<__int128_t> buf(input.size() * BIT_WIDTH_RATIO, stream); + + auto count = thrust::make_counting_iterator(0); + + thrust::for_each(rmm::exec_policy(cudf::get_default_stream()), + count, + count + input.size(), + [in = input.begin(), out = buf.data()] __device__(auto in_idx) { + auto const out_idx = in_idx; + auto unsigned_value = in[in_idx] < 0 ? -in[in_idx] : in[in_idx]; + auto unsigned_128bit = static_cast<__int128_t>(unsigned_value); + auto signed_128bit = in[in_idx] < 0 ? -unsigned_128bit : unsigned_128bit; + out[out_idx] = signed_128bit; + }); + + auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); + auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); + + CUDF_CUDA_TRY(cudaMemcpyAsync( + data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); + + auto type = arrow::decimal(9, -input.type().scale()); + auto mask = fetch_mask_buffer(input, ar_mr, stream); + auto buffers = std::vector>{mask, std::move(data_buffer)}; + auto data = std::make_shared(type, input.size(), buffers); + + return std::make_shared(data); +} + template <> std::shared_ptr dispatch_to_arrow::operator()( column_view input, From c62c71bf81318fc3c8d7467c8a28298edf6ba911 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 13 Sep 2023 18:02:06 -0700 Subject: [PATCH 39/54] Add support for list/struct up to output field naming --- python/cudf/cudf/_lib/scalar.pyx | 63 ++++++++++++++++++++++++++++---- 1 file changed, 56 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 140f5c68171..9bf373b544c 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -85,6 +85,21 @@ def _replace_nested_nulls(obj): _replace_nested_nulls(v) +def _replace_nested_none(obj): + if isinstance(obj, list): + for i, item in enumerate(obj): + if item is None: + obj[i] = NA + elif isinstance(item, (dict, list)): + _replace_nested_none(item) + elif isinstance(obj, dict): + for k, v in obj.items(): + if v is None: + obj[k] = NA + elif isinstance(v, (dict, list)): + _replace_nested_none(v) + + cdef class DeviceScalar: # I think this should be removable, except that currently the way that @@ -145,19 +160,53 @@ cdef class DeviceScalar: def _to_host_scalar(self): if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): - result = _get_py_decimal_from_fixed_point(self.c_value.c_obj) + ps = self.c_value.to_pyarrow_scalar() + if not ps.is_valid: + return NA + return ps.as_py() + # For nested types we should eventually account for the special cases + # that we handle for other types, e.g. numerics being casted to numpy + # types or datetime/timedelta needing to be cast to int64 to handle + # overflow. However, the old implementation didn't handle these cases + # either, so we can leave that for a follow-up PR. elif cudf.api.types.is_struct_dtype(self.dtype): - result = _get_py_dict_from_struct(self.c_value.c_obj, self.dtype) + ps = self.c_value.to_pyarrow_scalar() + if not ps.is_valid: + return NA + ret = ps.as_py() + _replace_nested_none(ret) + return ret elif cudf.api.types.is_list_dtype(self.dtype): - result = _get_py_list_from_list(self.c_value.c_obj, self.dtype) + ps = self.c_value.to_pyarrow_scalar() + if not ps.is_valid: + return NA + ret = ps.as_py() + _replace_nested_none(ret) + return ret elif pd.api.types.is_string_dtype(self.dtype): - result = _get_py_string_from_string(self.c_value.c_obj) + ps = self.c_value.to_pyarrow_scalar() + if not ps.is_valid: + return NA + return ps.as_py() elif pd.api.types.is_numeric_dtype(self.dtype): - result = _get_np_scalar_from_numeric(self.c_value.c_obj) + ps = self.c_value.to_pyarrow_scalar() + if not ps.is_valid: + return NA + return ps.type.to_pandas_dtype()(ps.as_py()) elif pd.api.types.is_datetime64_dtype(self.dtype): - result = _get_np_scalar_from_timestamp64(self.c_value.c_obj) + ps = self.c_value.to_pyarrow_scalar() + if not ps.is_valid: + return NaT + time_unit, _ = np.datetime_data(self.dtype) + # Cast to int64 to avoid overflow + return np.datetime64(ps.cast('int64').as_py(), time_unit) elif pd.api.types.is_timedelta64_dtype(self.dtype): - result = _get_np_scalar_from_timedelta64(self.c_value.c_obj) + ps = self.c_value.to_pyarrow_scalar() + if not ps.is_valid: + return NaT + time_unit, _ = np.datetime_data(self.dtype) + # Cast to int64 to avoid overflow + return np.timedelta64(ps.cast('int64').as_py(), time_unit) else: raise ValueError( "Could not convert cudf::scalar to a Python value" From d6eee4b2ec7942f6656e7e7bd6452b4cdc2d7c83 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Sep 2023 10:01:10 -0700 Subject: [PATCH 40/54] Fully implement metadata --- cpp/include/cudf/interop.hpp | 2 ++ cpp/src/interop/to_arrow.cu | 7 ++-- python/cudf/cudf/_lib/cpp/interop.pxd | 8 +---- python/cudf/cudf/_lib/pylibcudf/interop.pxd | 11 +++++-- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 35 +++++++++++++++++--- python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 4 ++- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 4 +-- python/cudf/cudf/_lib/scalar.pyx | 36 +++++++++++++++++---- python/cudf/cudf/tests/test_list.py | 4 +-- 9 files changed, 84 insertions(+), 27 deletions(-) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 77ee1a1ff1d..3ad99cbcc99 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -139,10 +139,12 @@ std::shared_ptr to_arrow(table_view input, * Converts the `cudf::scalar` to `arrow::Scalar`. * * @param input scalar that needs to be converted to arrow Scalar + * @param metadata Contains hierarchy of names of columns and children * @param ar_mr arrow memory pool to allocate memory for arrow Scalar * @return arrow Scalar generated from `input` */ std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata = {}, arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `cudf::table` from given arrow Table input diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 2d7f9ca4d55..b501c1e1d7e 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -457,12 +457,15 @@ std::shared_ptr to_arrow(table_view input, return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); } -std::shared_ptr to_arrow(cudf::scalar const& input, arrow::MemoryPool* ar_mr) +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + + arrow::MemoryPool* ar_mr) { auto stream = cudf::get_default_stream(); auto column = cudf::make_column_from_scalar(input, 1); cudf::table_view tv{{column->view()}}; - auto arrow_table = cudf::to_arrow(tv, {column_metadata{""}}); + auto arrow_table = cudf::to_arrow(tv, {metadata}); auto ac = arrow_table->column(0); auto maybe_scalar = ac->GetScalar(0); if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } diff --git a/python/cudf/cudf/_lib/cpp/interop.pxd b/python/cudf/cudf/_lib/cpp/interop.pxd index d6891f31646..88e9d83ee98 100644 --- a/python/cudf/cudf/_lib/cpp/interop.pxd +++ b/python/cudf/cudf/_lib/cpp/interop.pxd @@ -33,13 +33,6 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ string name vector[column_metadata] children_meta - # TODO: Adding this for pylibcudf because in pylibcudf we don't have column - # names. However we need to figure out how this will propagate up when cudf - # starts using pylibcudf for interop functionality. - cdef shared_ptr[CTable] to_arrow( - table_view input, - ) except + - cdef shared_ptr[CTable] to_arrow( table_view input, vector[column_metadata] metadata, @@ -47,4 +40,5 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ cdef shared_ptr[CScalar] to_arrow( const scalar& input, + column_metadata metadata, ) except + diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd index a547813a9d2..c1268b8ca1a 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pxd @@ -2,10 +2,17 @@ from pyarrow.lib cimport Scalar as pa_Scalar, Table as pa_Table +from cudf._lib.cpp.interop cimport column_metadata + from .scalar cimport Scalar from .table cimport Table +cdef class ColumnMetadata: + cdef public object name + cdef public object children_meta + cdef column_metadata to_c_metadata(self) + cpdef Table from_arrow( pa_Table pyarrow_table, ) @@ -14,6 +21,6 @@ cpdef Scalar from_arrow_scalar( pa_Scalar pyarrow_scalar, ) -cpdef pa_Table to_arrow(Table tbl) +cpdef pa_Table to_arrow(Table tbl, list metadata) -cpdef pa_Scalar to_arrow_scalar(Scalar slr) +cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx index 426534a1fed..9e6b44b117d 100644 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/interop.pyx @@ -3,6 +3,7 @@ from cython.operator cimport dereference from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.utility cimport move +from libcpp.vector cimport vector from pyarrow.lib cimport ( CScalar as pa_CScalar, CTable as pa_CTable, @@ -15,6 +16,7 @@ from pyarrow.lib cimport ( ) from cudf._lib.cpp.interop cimport ( + column_metadata, from_arrow as cpp_from_arrow, to_arrow as cpp_to_arrow, ) @@ -25,6 +27,26 @@ from .scalar cimport Scalar from .table cimport Table +cdef class ColumnMetadata: + def __init__(self, name): + self.name = name + self.children_meta = [] + + cdef column_metadata to_c_metadata(self): + """Convert to C++ column_metadata. + + Since this class is mutable and cheap, it is easier to create the C++ + object on the fly rather than have it directly backing the storage for + the Cython class. + """ + cdef column_metadata c_metadata + cdef ColumnMetadata child_meta + c_metadata.name = self.name.encode() + for child_meta in self.children_meta: + c_metadata.children_meta.push_back(child_meta.to_c_metadata()) + return c_metadata + + cpdef Table from_arrow( pa_Table pyarrow_table, ): @@ -53,19 +75,24 @@ cpdef Scalar from_arrow_scalar( return Scalar.from_libcudf(move(c_result)) -cpdef pa_Table to_arrow(Table tbl): +cpdef pa_Table to_arrow(Table tbl, list metadata): cdef shared_ptr[pa_CTable] c_result + cdef vector[column_metadata] c_metadata + cdef ColumnMetadata meta + for meta in metadata: + c_metadata.push_back(meta.to_c_metadata()) with nogil: - c_result = move(cpp_to_arrow(tbl.view())) + c_result = move(cpp_to_arrow(tbl.view(), c_metadata)) return pyarrow_wrap_table(c_result) -cpdef pa_Scalar to_arrow_scalar(Scalar slr): +cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata): cdef shared_ptr[pa_CScalar] c_result + cdef column_metadata c_metadata = metadata.to_c_metadata() with nogil: - c_result = move(cpp_to_arrow(dereference(slr.c_obj.get()))) + c_result = move(cpp_to_arrow(dereference(slr.c_obj.get()), c_metadata)) return pyarrow_wrap_scalar(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd index 87932a78f6e..d20c65f0be0 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd @@ -27,4 +27,6 @@ cdef class Scalar: @staticmethod cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) - cpdef to_pyarrow_scalar(self) + # TODO: Make sure I'm correct to avoid typing the metadata as + # ColumnMetadata, I assume that will cause circular cimport problems + cpdef to_pyarrow_scalar(self, metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx index 0b3e429c9c8..d06bb9adeb9 100644 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx @@ -90,9 +90,9 @@ cdef class Scalar: ) return s - cpdef to_pyarrow_scalar(self): + cpdef to_pyarrow_scalar(self, metadata): from .interop import to_arrow_scalar - return to_arrow_scalar(self) + return to_arrow_scalar(self, metadata) cdef const scalar* get(self) except *: return self.c_obj.get() diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 9bf373b544c..30d46bd3fe2 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -22,6 +22,7 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move import cudf +from cudf._lib.pylibcudf.interop import ColumnMetadata from cudf._lib.pylibcudf.types cimport type_id @@ -30,6 +31,7 @@ from cudf._lib.types import ( datetime_unit_map, duration_unit_map, ) +from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT @@ -100,6 +102,25 @@ def _replace_nested_none(obj): _replace_nested_none(v) +def gather_metadata(dtypes): + # dtypes is a dict mapping names to column dtypes + # This interface is a bit clunky, but it matches libcudf. May want to + # consider better approaches to building up the metadata eventually. + out = [] + for name, dtype in dtypes.items(): + v = ColumnMetadata(name) + if is_struct_dtype(dtype): + v.children_meta = gather_metadata(dtype.fields) + elif is_list_dtype(dtype): + # Offsets column is unnamed and has no children + v.children_meta.append(ColumnMetadata("")) + v.children_meta.extend( + gather_metadata({"": dtype.element_type}) + ) + out.append(v) + return out + + cdef class DeviceScalar: # I think this should be removable, except that currently the way that @@ -159,8 +180,9 @@ cdef class DeviceScalar: self._dtype = dtype def _to_host_scalar(self): + metadata = gather_metadata({"": self.dtype})[0] if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): - ps = self.c_value.to_pyarrow_scalar() + ps = self.c_value.to_pyarrow_scalar(metadata) if not ps.is_valid: return NA return ps.as_py() @@ -170,38 +192,38 @@ cdef class DeviceScalar: # overflow. However, the old implementation didn't handle these cases # either, so we can leave that for a follow-up PR. elif cudf.api.types.is_struct_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar() + ps = self.c_value.to_pyarrow_scalar(metadata) if not ps.is_valid: return NA ret = ps.as_py() _replace_nested_none(ret) return ret elif cudf.api.types.is_list_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar() + ps = self.c_value.to_pyarrow_scalar(metadata) if not ps.is_valid: return NA ret = ps.as_py() _replace_nested_none(ret) return ret elif pd.api.types.is_string_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar() + ps = self.c_value.to_pyarrow_scalar(metadata) if not ps.is_valid: return NA return ps.as_py() elif pd.api.types.is_numeric_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar() + ps = self.c_value.to_pyarrow_scalar(metadata) if not ps.is_valid: return NA return ps.type.to_pandas_dtype()(ps.as_py()) elif pd.api.types.is_datetime64_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar() + ps = self.c_value.to_pyarrow_scalar(metadata) if not ps.is_valid: return NaT time_unit, _ = np.datetime_data(self.dtype) # Cast to int64 to avoid overflow return np.datetime64(ps.cast('int64').as_py(), time_unit) elif pd.api.types.is_timedelta64_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar() + ps = self.c_value.to_pyarrow_scalar(metadata) if not ps.is_valid: return NaT time_unit, _ = np.datetime_data(self.dtype) diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5dd58d8a875..ac10dd97c56 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -895,14 +895,14 @@ def test_memory_usage(): "data, idx", [ ( - [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": None}]], + [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": NA}]], 0, ), ( [ [ {"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"}, - {"f1": "sf12", "f2": None}, + {"f1": "sf12", "f2": NA}, ] ], 0, From 1591bdba3ecdaab33399f7d969e5c30eaf337c12 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Sep 2023 10:56:55 -0700 Subject: [PATCH 41/54] Split up detail and public APIs --- cpp/include/cudf/detail/interop.hpp | 79 ++++++++++++- cpp/include/cudf/interop.hpp | 2 +- cpp/src/interop/from_arrow.cu | 177 +++++++++++----------------- cpp/src/interop/to_arrow.cu | 25 ++-- 4 files changed, 161 insertions(+), 122 deletions(-) diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 3d4832c8d17..6ddb4fbd1ed 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -104,11 +104,67 @@ std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... args) } } +/** + * @brief Invokes an `operator()` template with the type instantiation based on + * the specified `arrow::DataType`'s `id()`. + * + * This function is analogous to libcudf's type_dispatcher, but instead applies + * to Arrow functions. It's primary use case is to leverage Arrow's + * metaprogramming facilities like arrow::TypeTraits that require translating + * the runtime dtype information into compile-time types. + */ +template +constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, + Functor f, + Ts&&... args) +{ + switch (dtype.id()) { + case arrow::Type::INT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::INT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT8: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT16: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT32: + return f.template operator()(std::forward(args)...); + case arrow::Type::UINT64: + return f.template operator()(std::forward(args)...); + case arrow::Type::FLOAT: + return f.template operator()(std::forward(args)...); + case arrow::Type::DOUBLE: + return f.template operator()(std::forward(args)...); + case arrow::Type::BOOL: + return f.template operator()(std::forward(args)...); + case arrow::Type::TIMESTAMP: + return f.template operator()(std::forward(args)...); + case arrow::Type::DURATION: + return f.template operator()(std::forward(args)...); + case arrow::Type::STRING: + return f.template operator()(std::forward(args)...); + case arrow::Type::LIST: + return f.template operator()(std::forward(args)...); + case arrow::Type::DECIMAL128: + return f.template operator()(std::forward(args)...); + case arrow::Type::STRUCT: + return f.template operator()(std::forward(args)...); + default: { + CUDF_FAIL("Invalid type."); + } + } +} + // Converting arrow type to cudf type data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); /** - * @copydoc cudf::to_arrow + * @copydoc cudf::to_arrow(table_view input, std::vector const& metadata, + * arrow::MemoryPool* ar_mr) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -118,7 +174,18 @@ std::shared_ptr to_arrow(table_view input, arrow::MemoryPool* ar_mr); /** - * @copydoc cudf::arrow_to_cudf + * @copydoc cudf::to_arrow(cudf::scalar const& input, column_metadata const& metadata, + * arrow::MemoryPool* ar_mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr); +/** + * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::mr::device_memory_resource* mr) * * @param stream CUDA stream used for device memory operations and kernel launches. */ @@ -126,5 +193,13 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); +/** + * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::mr::device_memory_resource* mr) + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 3ad99cbcc99..34232a90cbc 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -159,7 +159,7 @@ std::unique_ptr
from_arrow( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Create `cudf::table` from given arrow Scalar input + * @brief Create `cudf::scalar` from given arrow Scalar input * * @param input arrow:Scalar that needs to be converted to `cudf::scalar` * @param mr Device memory resource used to allocate `cudf::scalar` diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 8ff361d7e04..67b32f0e831 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -424,6 +424,52 @@ std::unique_ptr get_column(arrow::Array const& array, : get_empty_type_column(array.length()); } +struct BuilderGenerator { + template && + !std::is_same_v)> + std::shared_ptr operator()(std::shared_ptr const& type) + { + return std::make_shared::BuilderType>( + type, arrow::default_memory_pool()); + } + + template || + std::is_same_v)> + std::shared_ptr operator()(std::shared_ptr const& type) + { + CUDF_FAIL("Type not support for BuilderGenerator"); + } +}; + +std::shared_ptr make_builder(std::shared_ptr const& type) +{ + switch (type->id()) { + case arrow::Type::STRUCT: { + std::vector> field_builders; + + for (auto i = 0; i < type->num_fields(); ++i) { + auto const vt = type->field(i)->type(); + if (vt->id() == arrow::Type::STRUCT || vt->id() == arrow::Type::LIST) { + field_builders.push_back(make_builder(vt)); + } else { + field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt)); + } + } + return std::make_shared( + type, arrow::default_memory_pool(), field_builders); + } + case arrow::Type::LIST: { + return std::make_shared(arrow::default_memory_pool(), + make_builder(type->field(0)->type())); + } + default: { + return arrow_type_dispatcher(*type, BuilderGenerator{}, type); + } + } +} + } // namespace std::unique_ptr
from_arrow(arrow::Table const& input_table, @@ -467,120 +513,12 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, return std::make_unique
(std::move(columns)); } -} // namespace detail - -std::unique_ptr
from_arrow(arrow::Table const& input_table, - rmm::mr::device_memory_resource* mr) -{ - CUDF_FUNC_RANGE(); - - return detail::from_arrow(input_table, cudf::get_default_stream(), mr); -} - -template -constexpr decltype(auto) arrow_type_dispatcher(arrow::DataType const& dtype, - Functor f, - Ts&&... args) -{ - switch (dtype.id()) { - case arrow::Type::INT8: - return f.template operator()(std::forward(args)...); - case arrow::Type::INT16: - return f.template operator()(std::forward(args)...); - case arrow::Type::INT32: - return f.template operator()(std::forward(args)...); - case arrow::Type::INT64: - return f.template operator()(std::forward(args)...); - case arrow::Type::UINT8: - return f.template operator()(std::forward(args)...); - case arrow::Type::UINT16: - return f.template operator()(std::forward(args)...); - case arrow::Type::UINT32: - return f.template operator()(std::forward(args)...); - case arrow::Type::UINT64: - return f.template operator()(std::forward(args)...); - case arrow::Type::FLOAT: - return f.template operator()(std::forward(args)...); - case arrow::Type::DOUBLE: - return f.template operator()(std::forward(args)...); - case arrow::Type::BOOL: - return f.template operator()(std::forward(args)...); - case arrow::Type::TIMESTAMP: - return f.template operator()(std::forward(args)...); - case arrow::Type::DURATION: - return f.template operator()(std::forward(args)...); - // case arrow::Type::DICTIONARY32: - // return f.template operator()( - // std::forward(args)...); - case arrow::Type::STRING: - return f.template operator()(std::forward(args)...); - case arrow::Type::LIST: - return f.template operator()(std::forward(args)...); - case arrow::Type::DECIMAL128: - return f.template operator()(std::forward(args)...); - case arrow::Type::STRUCT: - return f.template operator()(std::forward(args)...); - default: { - CUDF_FAIL("Invalid type."); - } - } -} - -struct BuilderGenerator { - template - std::shared_ptr operator()(std::shared_ptr const& type) - { - return std::make_shared::BuilderType>( - type, arrow::default_memory_pool()); - } -}; - -template <> -std::shared_ptr BuilderGenerator::operator()( - std::shared_ptr const& type) -{ - CUDF_FAIL("Not implemented"); -} - -template <> -std::shared_ptr BuilderGenerator::operator()( - std::shared_ptr const& type) -{ - CUDF_FAIL("Not implemented"); -} - -std::shared_ptr make_builder(std::shared_ptr const& type) -{ - switch (type->id()) { - case arrow::Type::STRUCT: { - std::vector> field_builders; - - for (auto i = 0; i < type->num_fields(); ++i) { - auto const vt = type->field(i)->type(); - if (vt->id() == arrow::Type::STRUCT || vt->id() == arrow::Type::LIST) { - field_builders.push_back(make_builder(vt)); - } else { - field_builders.push_back(arrow_type_dispatcher(*vt, BuilderGenerator{}, vt)); - } - } - return std::make_shared( - type, arrow::default_memory_pool(), field_builders); - } - case arrow::Type::LIST: { - return std::make_shared(arrow::default_memory_pool(), - make_builder(type->field(0)->type())); - } - default: { - return arrow_type_dispatcher(*type, BuilderGenerator{}, type); - } - } -} - std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { // Get a builder for the scalar type - auto builder = make_builder(input.type); + auto builder = detail::make_builder(input.type); auto status = builder->AppendScalar(input); if (status != arrow::Status::OK()) { @@ -600,7 +538,7 @@ std::unique_ptr from_arrow(arrow::Scalar const& input, auto table = arrow::Table::Make(arrow::schema({field}), {array}); - auto cudf_table = from_arrow(*table); + auto cudf_table = detail::from_arrow(*table, stream, mr); auto col = cudf_table->get_column(0); @@ -608,4 +546,21 @@ std::unique_ptr from_arrow(arrow::Scalar const& input, return get_element(cv, 0); } +} // namespace detail + +std::unique_ptr
from_arrow(arrow::Table const& input_table, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + return detail::from_arrow(input_table, cudf::get_default_stream(), mr); +} + +std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + return detail::from_arrow(input, cudf::get_default_stream(), mr); +} } // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index b501c1e1d7e..5874867fc6f 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -447,6 +447,21 @@ std::shared_ptr to_arrow(table_view input, return result; } + +std::shared_ptr to_arrow(cudf::scalar const& input, + column_metadata const& metadata, + + rmm::cuda_stream_view stream, + arrow::MemoryPool* ar_mr) +{ + auto column = cudf::make_column_from_scalar(input, 1); + cudf::table_view tv{{column->view()}}; + auto arrow_table = cudf::to_arrow(tv, {metadata}); + auto ac = arrow_table->column(0); + auto maybe_scalar = ac->GetScalar(0); + if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } + return maybe_scalar.ValueOrDie(); +} } // namespace detail std::shared_ptr to_arrow(table_view input, @@ -462,13 +477,7 @@ std::shared_ptr to_arrow(cudf::scalar const& input, arrow::MemoryPool* ar_mr) { - auto stream = cudf::get_default_stream(); - auto column = cudf::make_column_from_scalar(input, 1); - cudf::table_view tv{{column->view()}}; - auto arrow_table = cudf::to_arrow(tv, {metadata}); - auto ac = arrow_table->column(0); - auto maybe_scalar = ac->GetScalar(0); - if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } - return maybe_scalar.ValueOrDie(); + CUDF_FUNC_RANGE(); + return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); } } // namespace cudf From 7fff56db10dc713ddf60acf82703f63742be4357 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Sep 2023 11:00:40 -0700 Subject: [PATCH 42/54] Add streams to public APIs --- cpp/include/cudf/detail/interop.hpp | 18 ++++++------------ cpp/include/cudf/interop.hpp | 10 +++++++++- cpp/src/interop/from_arrow.cu | 6 ++++-- cpp/src/interop/to_arrow.cu | 7 ++++--- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 6ddb4fbd1ed..6a294b1d360 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -164,9 +164,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); /** * @copydoc cudf::to_arrow(table_view input, std::vector const& metadata, - * arrow::MemoryPool* ar_mr) - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata, @@ -175,9 +173,7 @@ std::shared_ptr to_arrow(table_view input, /** * @copydoc cudf::to_arrow(cudf::scalar const& input, column_metadata const& metadata, - * arrow::MemoryPool* ar_mr) - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) */ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata, @@ -185,18 +181,16 @@ std::shared_ptr to_arrow(cudf::scalar const& input, rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr); /** - * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::mr::device_memory_resource* mr) - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) */ std::unique_ptr
from_arrow(arrow::Table const& input_table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); /** - * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::mr::device_memory_resource* mr) - * - * @param stream CUDA stream used for device memory operations and kernel launches. + * @copydoc cudf::from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, + * rmm::mr::device_memory_resource* mr) */ std::unique_ptr from_arrow(arrow::Scalar const& input, rmm::cuda_stream_view stream, diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 34232a90cbc..b1514699e21 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -126,12 +126,14 @@ struct column_metadata { * * @param input table_view that needs to be converted to arrow Table * @param metadata Contains hierarchy of names of columns and children + * @param stream CUDA stream used for device memory operations and kernel launches. * @param ar_mr arrow memory pool to allocate memory for arrow Table * @return arrow Table generated from `input` */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, - arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); + rmm::cuda_stream_view stream = cudf::get_default_stream(), + arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `arrow::Scalar` from cudf scalar `input` @@ -140,34 +142,40 @@ std::shared_ptr to_arrow(table_view input, * * @param input scalar that needs to be converted to arrow Scalar * @param metadata Contains hierarchy of names of columns and children + * @param stream CUDA stream used for device memory operations and kernel launches. * @param ar_mr arrow memory pool to allocate memory for arrow Scalar * @return arrow Scalar generated from `input` */ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata = {}, + rmm::cuda_stream_view stream = cudf::get_default_stream(), arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** * @brief Create `cudf::table` from given arrow Table input * * @param input arrow:Table that needs to be converted to `cudf::table` + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table */ std::unique_ptr
from_arrow( arrow::Table const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** * @brief Create `cudf::scalar` from given arrow Scalar input * * @param input arrow:Scalar that needs to be converted to `cudf::scalar` + * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate `cudf::scalar` * @return cudf scalar generated from given arrow Scalar */ std::unique_ptr from_arrow( arrow::Scalar const& input, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 67b32f0e831..d3e781db986 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -549,18 +549,20 @@ std::unique_ptr from_arrow(arrow::Scalar const& input, } // namespace detail std::unique_ptr
from_arrow(arrow::Table const& input_table, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_arrow(input_table, cudf::get_default_stream(), mr); + return detail::from_arrow(input_table, stream, mr); } std::unique_ptr from_arrow(arrow::Scalar const& input, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_arrow(input, cudf::get_default_stream(), mr); + return detail::from_arrow(input, stream, mr); } } // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 5874867fc6f..6a6dc3fa0e2 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -466,18 +466,19 @@ std::shared_ptr to_arrow(cudf::scalar const& input, std::shared_ptr to_arrow(table_view input, std::vector const& metadata, + rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) { CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); + return detail::to_arrow(input, metadata, stream, ar_mr); } std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata, - + rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) { CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); + return detail::to_arrow(input, metadata, stream, ar_mr); } } // namespace cudf From 00050f6fc74dc86c0f4f2af55a8c35061d2331af Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Sep 2023 13:35:38 -0700 Subject: [PATCH 43/54] Add tests of from_arrow --- cpp/tests/interop/from_arrow_test.cpp | 92 +++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 9a5cc3733af..6716bac0b96 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -456,3 +456,95 @@ INSTANTIATE_TEST_CASE_P(FromArrowTest, std::make_tuple(0, 0), std::make_tuple(0, 3000), std::make_tuple(10000, 10000))); + +template +struct FromArrowNumericScalarTest : public cudf::test::BaseFixture {}; + +TYPED_TEST_SUITE(FromArrowNumericScalarTest, cudf::test::NumericTypes); + +TYPED_TEST(FromArrowNumericScalarTest, Basic) +{ + auto const value = TypeParam(42); + auto arrow_scalar = arrow::MakeScalar(value); + auto cudf_scalar = cudf::from_arrow(*arrow_scalar.get()); + auto cudf_numeric_scalar = dynamic_cast*>(cudf_scalar.get()); + if (cudf_numeric_scalar == nullptr) { CUDF_FAIL("Attempted to test with a non-numeric type."); } + EXPECT_EQ(cudf_numeric_scalar->type(), cudf::data_type(cudf::type_to_id())); + EXPECT_EQ(cudf_numeric_scalar->value(), value); +} + +struct FromArrowDecimalScalarTest : public cudf::test::BaseFixture {}; + +// Only testing Decimal128 because that's the only size cudf and arrow have in common. +TEST_F(FromArrowDecimalScalarTest, Basic) +{ + auto const value = 42; + auto const precision = 8; + auto const scale = 4; + auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale)); + auto cudf_scalar = cudf::from_arrow(arrow_scalar); + + // Arrow offers a minimum of 128 bits for the Decimal type. + auto cudf_decimal_scalar = + dynamic_cast*>(cudf_scalar.get()); + EXPECT_EQ(cudf_decimal_scalar->type(), + cudf::data_type(cudf::type_to_id(), scale)); + EXPECT_EQ(cudf_decimal_scalar->value(), value); +} + +struct FromArrowStringScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowStringScalarTest, Basic) +{ + auto value = std::string("hello world"); + auto arrow_scalar = arrow::StringScalar(value); + auto cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto cudf_string_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_string_scalar->type(), cudf::data_type(cudf::type_id::STRING)); + EXPECT_EQ(cudf_string_scalar->to_string(), value); +} + +struct FromArrowListScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowListScalarTest, Basic) +{ + std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector host_validity = {true, true, true, false, true, true, true}; + + arrow::Int64Builder builder; + auto status = builder.AppendValues(host_values, host_validity); + auto maybe_array = builder.Finish(); + auto array = *maybe_array; + + auto arrow_scalar = arrow::ListScalar(array); + auto cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto cudf_list_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_list_scalar->type(), cudf::data_type(cudf::type_id::LIST)); + + cudf::test::fixed_width_column_wrapper lhs( + host_values.begin(), host_values.end(), host_validity.begin()); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(lhs, cudf_list_scalar->view()); +} + +struct FromArrowStructScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(FromArrowStructScalarTest, Basic) +{ + int64_t const value = 42; + auto underlying_arrow_scalar = arrow::MakeScalar(value); + + auto field = arrow::field("", underlying_arrow_scalar->type); + auto arrow_type = arrow::struct_({field}); + auto arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + auto cudf_scalar = cudf::from_arrow(arrow_scalar); + + auto cudf_struct_scalar = dynamic_cast(cudf_scalar.get()); + EXPECT_EQ(cudf_struct_scalar->type(), cudf::data_type(cudf::type_id::STRUCT)); + + cudf::test::fixed_width_column_wrapper col({value}); + cudf::table_view lhs({col}); + + CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view()); +} From 1c2570b76a1b59fd45b77da00750d1ed3300b970 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Sep 2023 15:53:36 -0700 Subject: [PATCH 44/54] Add tests of to_arrow --- cpp/tests/interop/from_arrow_test.cpp | 20 ++--- cpp/tests/interop/to_arrow_test.cpp | 102 ++++++++++++++++++++++++++ 2 files changed, 113 insertions(+), 9 deletions(-) diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 6716bac0b96..5305b0103d8 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -460,13 +460,15 @@ INSTANTIATE_TEST_CASE_P(FromArrowTest, template struct FromArrowNumericScalarTest : public cudf::test::BaseFixture {}; -TYPED_TEST_SUITE(FromArrowNumericScalarTest, cudf::test::NumericTypes); +using NumericTypesNotBool = + cudf::test::Concat; +TYPED_TEST_SUITE(FromArrowNumericScalarTest, NumericTypesNotBool); TYPED_TEST(FromArrowNumericScalarTest, Basic) { - auto const value = TypeParam(42); + TypeParam const value{42}; auto arrow_scalar = arrow::MakeScalar(value); - auto cudf_scalar = cudf::from_arrow(*arrow_scalar.get()); + auto cudf_scalar = cudf::from_arrow(*arrow_scalar); auto cudf_numeric_scalar = dynamic_cast*>(cudf_scalar.get()); if (cudf_numeric_scalar == nullptr) { CUDF_FAIL("Attempted to test with a non-numeric type."); } EXPECT_EQ(cudf_numeric_scalar->type(), cudf::data_type(cudf::type_to_id())); @@ -478,11 +480,11 @@ struct FromArrowDecimalScalarTest : public cudf::test::BaseFixture {}; // Only testing Decimal128 because that's the only size cudf and arrow have in common. TEST_F(FromArrowDecimalScalarTest, Basic) { - auto const value = 42; - auto const precision = 8; - auto const scale = 4; - auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale)); - auto cudf_scalar = cudf::from_arrow(arrow_scalar); + auto const value{42}; + auto const precision{8}; + auto const scale{4}; + auto arrow_scalar = arrow::Decimal128Scalar(value, arrow::decimal128(precision, -scale)); + auto cudf_scalar = cudf::from_arrow(arrow_scalar); // Arrow offers a minimum of 128 bits for the Decimal type. auto cudf_decimal_scalar = @@ -532,7 +534,7 @@ struct FromArrowStructScalarTest : public cudf::test::BaseFixture {}; TEST_F(FromArrowStructScalarTest, Basic) { - int64_t const value = 42; + int64_t const value{42}; auto underlying_arrow_scalar = arrow::MakeScalar(value); auto field = arrow::field("", underlying_arrow_scalar->type); diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 97d80984272..2982cdb2760 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -578,4 +579,105 @@ INSTANTIATE_TEST_CASE_P(ToArrowTest, std::make_tuple(0, 0), std::make_tuple(0, 3000))); +template +struct ToArrowNumericScalarTest : public cudf::test::BaseFixture {}; + +using NumericTypesNotBool = + cudf::test::Concat; +TYPED_TEST_SUITE(ToArrowNumericScalarTest, NumericTypesNotBool); + +TYPED_TEST(ToArrowNumericScalarTest, Basic) +{ + TypeParam const value{42}; + auto cudf_scalar = cudf::make_fixed_width_scalar(value); + + cudf::column_metadata metadata{""}; + auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto ref_arrow_scalar = arrow::MakeScalar(value); + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowDecimalScalarTest : public cudf::test::BaseFixture {}; + +// Only testing Decimal128 because that's the only size cudf and arrow have in common. +TEST_F(ToArrowDecimalScalarTest, Basic) +{ + auto const value{42}; + auto const precision{18}; // cudf will convert to the widest-precision Arrow scalar of the type + int32_t const scale{4}; + + auto cudf_scalar = + cudf::make_fixed_point_scalar(value, numeric::scale_type{scale}); + + cudf::column_metadata metadata{""}; + auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto maybe_ref_arrow_scalar = arrow::MakeScalar(arrow::decimal128(precision, -scale), value); + if (!maybe_ref_arrow_scalar.ok()) { CUDF_FAIL("Failed to construct reference scalar"); } + auto ref_arrow_scalar = *maybe_ref_arrow_scalar; + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowStringScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowStringScalarTest, Basic) +{ + std::string const value{"hello world"}; + auto cudf_scalar = cudf::make_string_scalar(value); + cudf::column_metadata metadata{""}; + auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto ref_arrow_scalar = arrow::MakeScalar(value); + EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); +} + +struct ToArrowListScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowListScalarTest, Basic) +{ + std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector host_validity = {true, true, true, false, true, true, true}; + + cudf::test::fixed_width_column_wrapper col( + host_values.begin(), host_values.end(), host_validity.begin()); + + auto cudf_scalar = cudf::make_list_scalar(col); + + cudf::column_metadata metadata{""}; + auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + arrow::Int64Builder builder; + auto status = builder.AppendValues(host_values, host_validity); + auto maybe_array = builder.Finish(); + auto array = *maybe_array; + + auto ref_arrow_scalar = arrow::ListScalar(array); + + EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); +} + +struct ToArrowStructScalarTest : public cudf::test::BaseFixture {}; + +TEST_F(ToArrowStructScalarTest, Basic) +{ + int64_t const value{42}; + auto const field_name{"a"}; + + cudf::test::fixed_width_column_wrapper col{value}; + cudf::table_view tbl({col}); + auto cudf_scalar = cudf::make_struct_scalar(tbl); + + cudf::column_metadata metadata{""}; + metadata.children_meta.emplace_back(field_name); + auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + + auto underlying_arrow_scalar = arrow::MakeScalar(value); + auto field = arrow::field(field_name, underlying_arrow_scalar->type, false); + auto arrow_type = arrow::struct_({field}); + auto ref_arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + + EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); +} + CUDF_TEST_PROGRAM_MAIN() From 663fde08e081003545ad38166afbe1fcd1467904 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Sep 2023 16:56:23 -0700 Subject: [PATCH 45/54] Add stream tests and fix missing stream usage --- cpp/src/interop/from_arrow.cu | 4 +-- cpp/src/interop/to_arrow.cu | 5 ++-- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/interop_test.cpp | 41 ++++++++++++++++++++++++++++++ 4 files changed, 46 insertions(+), 5 deletions(-) create mode 100644 cpp/tests/streams/interop_test.cpp diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index d3e781db986..6e0ef462ac9 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -540,10 +540,10 @@ std::unique_ptr from_arrow(arrow::Scalar const& input, auto cudf_table = detail::from_arrow(*table, stream, mr); - auto col = cudf_table->get_column(0); + auto& col = cudf_table->get_column(0); auto cv = col.view(); - return get_element(cv, 0); + return get_element(cv, 0, stream); } } // namespace detail diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 6a6dc3fa0e2..186f3798154 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -450,13 +450,12 @@ std::shared_ptr to_arrow(table_view input, std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata, - rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) { - auto column = cudf::make_column_from_scalar(input, 1); + auto column = cudf::make_column_from_scalar(input, 1, stream); cudf::table_view tv{{column->view()}}; - auto arrow_table = cudf::to_arrow(tv, {metadata}); + auto arrow_table = cudf::to_arrow(tv, {metadata}, stream); auto ac = arrow_table->column(0); auto maybe_scalar = ac->GetScalar(0); if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index a69dc9bf2f8..af241996e27 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -627,6 +627,7 @@ ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE t ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_REPLACE_TEST streams/replace_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_SEARCH_TEST streams/search_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_INTEROP_TEST streams/interop_test.cpp STREAM_MODE testing) # ################################################################################################## # Install tests #################################################################################### diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp new file mode 100644 index 00000000000..e16b3c9cf0d --- /dev/null +++ b/cpp/tests/streams/interop_test.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include + +struct ArrowTest : public cudf::test::BaseFixture {}; + +TEST_F(ArrowTest, ToArrow) +{ + int32_t const value{42}; + auto cudf_scalar = + cudf::make_fixed_width_scalar(value, cudf::test::get_default_stream()); + + cudf::column_metadata metadata{""}; + auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, FromArrow) +{ + int32_t const value{42}; + auto arrow_scalar = arrow::MakeScalar(value); + cudf::from_arrow(*arrow_scalar, cudf::test::get_default_stream()); +} From cf30fef29071d3674135d114968dd7db6e2266e4 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Sep 2023 17:07:12 -0700 Subject: [PATCH 46/54] Also add stream tests for the table APIs --- cpp/tests/streams/interop_test.cpp | 29 ++++++++++++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp index e16b3c9cf0d..5235714c342 100644 --- a/cpp/tests/streams/interop_test.cpp +++ b/cpp/tests/streams/interop_test.cpp @@ -19,11 +19,38 @@ #include #include +#include #include struct ArrowTest : public cudf::test::BaseFixture {}; TEST_F(ArrowTest, ToArrow) +{ + int32_t const value{42}; + auto col = cudf::test::fixed_width_column_wrapper{{value}}; + cudf::table_view tbl{{col}}; + + std::vector metadata{{""}}; + auto arrow_scalar = cudf::to_arrow(tbl, metadata, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, FromArrow) +{ + std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector host_validity = {true, true, true, false, true, true, true}; + + arrow::Int64Builder builder; + auto status = builder.AppendValues(host_values, host_validity); + auto maybe_array = builder.Finish(); + auto array = *maybe_array; + + auto field = arrow::field("", arrow::int32()); + auto schema = arrow::schema({field}); + auto table = arrow::Table::Make(schema, {array}); + cudf::from_arrow(*table, cudf::test::get_default_stream()); +} + +TEST_F(ArrowTest, ToArrowScalar) { int32_t const value{42}; auto cudf_scalar = @@ -33,7 +60,7 @@ TEST_F(ArrowTest, ToArrow) auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream()); } -TEST_F(ArrowTest, FromArrow) +TEST_F(ArrowTest, FromArrowScalar) { int32_t const value{42}; auto arrow_scalar = arrow::MakeScalar(value); From 49b26935d97b39bcfdcda3ada648fb5486843a21 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Sep 2023 17:07:33 -0700 Subject: [PATCH 47/54] Revert Python changes --- python/cudf/cudf/_lib/cpp/interop.pxd | 11 +- .../cudf/cudf/_lib/cpp/libcpp/functional.pxd | 4 +- python/cudf/cudf/_lib/cpp/reduce.pxd | 5 +- python/cudf/cudf/_lib/datetime.pyx | 6 +- .../cudf/cudf/_lib/pylibcudf/CMakeLists.txt | 27 +- python/cudf/cudf/_lib/pylibcudf/__init__.pxd | 8 +- python/cudf/cudf/_lib/pylibcudf/__init__.py | 5 +- python/cudf/cudf/_lib/pylibcudf/copying.pxd | 6 - python/cudf/cudf/_lib/pylibcudf/copying.pyx | 53 --- .../cudf/_lib/pylibcudf/gpumemoryview.pxd | 7 +- .../cudf/_lib/pylibcudf/gpumemoryview.pyx | 14 +- python/cudf/cudf/_lib/pylibcudf/interop.pxd | 26 -- python/cudf/cudf/_lib/pylibcudf/interop.pyx | 98 ------ python/cudf/cudf/_lib/pylibcudf/scalar.pxd | 32 -- python/cudf/cudf/_lib/pylibcudf/scalar.pyx | 119 ------- python/cudf/cudf/_lib/scalar.pxd | 10 +- python/cudf/cudf/_lib/scalar.pyx | 321 ++++++++++-------- python/cudf/cudf/tests/test_list.py | 4 +- python/cudf/cudf/tests/test_struct.py | 35 +- python/cudf/cudf/utils/dtypes.py | 18 + 20 files changed, 215 insertions(+), 594 deletions(-) delete mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pxd delete mode 100644 python/cudf/cudf/_lib/pylibcudf/interop.pyx delete mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pxd delete mode 100644 python/cudf/cudf/_lib/pylibcudf/scalar.pyx diff --git a/python/cudf/cudf/_lib/cpp/interop.pxd b/python/cudf/cudf/_lib/cpp/interop.pxd index 88e9d83ee98..e81f0d617fb 100644 --- a/python/cudf/cudf/_lib/cpp/interop.pxd +++ b/python/cudf/cudf/_lib/cpp/interop.pxd @@ -1,13 +1,12 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. from libcpp.memory cimport shared_ptr, unique_ptr from libcpp.string cimport string from libcpp.vector cimport vector -from pyarrow.lib cimport CScalar, CTable +from pyarrow.lib cimport CTable from cudf._lib.types import cudf_to_np_types, np_to_cudf_types -from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view @@ -25,7 +24,6 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ ) except + cdef unique_ptr[table] from_arrow(CTable input) except + - cdef unique_ptr[scalar] from_arrow(CScalar input) except + cdef cppclass column_metadata: column_metadata() except + @@ -37,8 +35,3 @@ cdef extern from "cudf/interop.hpp" namespace "cudf" \ table_view input, vector[column_metadata] metadata, ) except + - - cdef shared_ptr[CScalar] to_arrow( - const scalar& input, - column_metadata metadata, - ) except + diff --git a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd index c38db036119..f3e2d6d0878 100644 --- a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd +++ b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd @@ -1,8 +1,6 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020, NVIDIA CORPORATION. -# TODO: Can be replaced once https://github.com/cython/cython/pull/5671 is -# merged and released cdef extern from "" namespace "std" nogil: cdef cppclass reference_wrapper[T]: reference_wrapper() diff --git a/python/cudf/cudf/_lib/cpp/reduce.pxd b/python/cudf/cudf/_lib/cpp/reduce.pxd index 997782dec6c..7952c717916 100644 --- a/python/cudf/cudf/_lib/cpp/reduce.pxd +++ b/python/cudf/cudf/_lib/cpp/reduce.pxd @@ -1,13 +1,14 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr from libcpp.utility cimport pair -from cudf._lib.cpp.aggregation cimport reduce_aggregation, scan_aggregation +from cudf._lib.aggregation cimport reduce_aggregation, scan_aggregation from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport data_type +from cudf._lib.scalar cimport DeviceScalar cdef extern from "cudf/reduction.hpp" namespace "cudf" nogil: diff --git a/python/cudf/cudf/_lib/datetime.pyx b/python/cudf/cudf/_lib/datetime.pyx index 3d96f59c4d6..81949dbaa20 100644 --- a/python/cudf/cudf/_lib/datetime.pyx +++ b/python/cudf/cudf/_lib/datetime.pyx @@ -1,4 +1,4 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from cudf.core.buffer import acquire_spill_lock @@ -10,7 +10,6 @@ from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.filling cimport calendrical_month_sequence -from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.types cimport size_type from cudf._lib.scalar cimport DeviceScalar @@ -167,11 +166,10 @@ def date_range(DeviceScalar start, size_type n, offset): + offset.kwds.get("months", 0) ) - cdef const scalar* c_start = start.c_value.get() with nogil: c_result = move(calendrical_month_sequence( n, - c_start[0], + start.c_value.get()[0], months )) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt index 64adb38ace3..0ce42dc43ff 100644 --- a/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt +++ b/python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt @@ -12,35 +12,10 @@ # the License. # ============================================================================= -set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx - types.pyx utils.pyx -) +set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx) set(linked_libraries cudf::cudf) rapids_cython_create_modules( CXX SOURCE_FILES "${cython_sources}" LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf ) - -find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) - -execute_process( - COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" - OUTPUT_VARIABLE PYARROW_INCLUDE_DIR - OUTPUT_STRIP_TRAILING_WHITESPACE -) - -set(targets_using_arrow_headers pylibcudf_interop pylibcudf_scalar) -foreach(target IN LISTS targets_using_arrow_headers) - target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") -endforeach() - -# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for -# more info -find_package(NumPy REQUIRED) -set(targets_using_numpy pylibcudf_interop pylibcudf_scalar) -foreach(target IN LISTS targets_using_numpy) - target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") - # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. - # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") -endforeach() diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd index cf0007b9303..ba7822b0a54 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.pxd @@ -1,13 +1,9 @@ # Copyright (c) 2023, NVIDIA CORPORATION. # TODO: Verify consistent usage of relative/absolute imports in pylibcudf. -# TODO: Cannot import interop because it introduces a build-time pyarrow header -# dependency for everything that cimports pylibcudf. See if there's a way to -# avoid that before polluting the whole package. -from . cimport copying # , interop +from . cimport copying from .column cimport Column from .gpumemoryview cimport gpumemoryview -from .scalar cimport Scalar from .table cimport Table # TODO: cimport type_id once # https://github.com/cython/cython/issues/5609 is resolved @@ -16,9 +12,7 @@ from .types cimport DataType __all__ = [ "Column", "DataType", - "Scalar", "Table", "copying", "gpumemoryview", - # "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/__init__.py b/python/cudf/cudf/_lib/pylibcudf/__init__.py index 72b74a57b87..3edff9a53e8 100644 --- a/python/cudf/cudf/_lib/pylibcudf/__init__.py +++ b/python/cudf/cudf/_lib/pylibcudf/__init__.py @@ -1,19 +1,16 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from . import copying, interop +from . import copying from .column import Column from .gpumemoryview import gpumemoryview -from .scalar import Scalar from .table import Table from .types import DataType, TypeId __all__ = [ "Column", "DataType", - "Scalar", "Table", "TypeId", "copying", "gpumemoryview", - "interop", ] diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pxd b/python/cudf/cudf/_lib/pylibcudf/copying.pxd index a2232fc5d81..d57be650710 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pxd @@ -13,9 +13,3 @@ cpdef Table gather( Column gather_map, out_of_bounds_policy bounds_policy ) - -cpdef Table scatter ( - list source_scalars, - Column indices, - Table target, -) diff --git a/python/cudf/cudf/_lib/pylibcudf/copying.pyx b/python/cudf/cudf/_lib/pylibcudf/copying.pyx index dff3e95328d..a27b44b3107 100644 --- a/python/cudf/cudf/_lib/pylibcudf/copying.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/copying.pyx @@ -2,7 +2,6 @@ from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -from libcpp.vector cimport vector # TODO: We want to make cpp a more full-featured package so that we can access # directly from that. It will make namespacing much cleaner in pylibcudf. What @@ -10,16 +9,13 @@ from libcpp.vector cimport vector # cimport libcudf... libcudf.copying.algo(...) from cudf._lib.cpp cimport copying as cpp_copying from cudf._lib.cpp.copying cimport out_of_bounds_policy -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.copying import \ out_of_bounds_policy as OutOfBoundsPolicy # no-cython-lint -from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from .column cimport Column -from .scalar cimport Scalar from .table cimport Table @@ -59,52 +55,3 @@ cpdef Table gather( ) ) return Table.from_libcudf(move(c_result)) - - -ctypedef const scalar constscalar - -cpdef Table scatter ( - list source_scalars, - Column indices, - Table target, -): - """Scatter source_scalars into target according to the indices. - - For details on the implementation, see cudf::scatter in libcudf. - - Parameters - ---------- - source_scalars : List[Scalar] - A list containing one scalar for each column in target. - indices : Column - The rows of the target into which the source_scalars should be written. - target : Table - The table into which data should be written. - - Returns - ------- - pylibcudf.Table - The result of the scatter - """ - cdef unique_ptr[table] c_result - # TODO: This doesn't require the constscalar ctypedef - cdef vector[reference_wrapper[const scalar]] c_scalars - c_scalars.reserve(len(source_scalars)) - cdef Scalar d_slr - for d_slr in source_scalars: - c_scalars.push_back( - # TODO: This requires the constscalar ctypedef - # Possibly the same as https://github.com/cython/cython/issues/4180 - reference_wrapper[constscalar](d_slr.get()[0]) - ) - - with nogil: - c_result = move( - cpp_copying.scatter( - c_scalars, - indices.view(), - target.view(), - ) - ) - - return Table.from_libcudf(move(c_result)) diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd index 93449fd02d1..713697bd139 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pxd @@ -1,14 +1,9 @@ # Copyright (c) 2023, NVIDIA CORPORATION. -from libcpp cimport bool - cdef class gpumemoryview: # TODO: Eventually probably want to make this opaque, but for now it's fine # to treat this object as something like a POD struct cdef readonly: Py_ssize_t ptr - object _obj - bool _released - - cpdef release(self) + object obj diff --git a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx index 9121092ecdd..fc98f087a1b 100644 --- a/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx +++ b/python/cudf/cudf/_lib/pylibcudf/gpumemoryview.pyx @@ -19,21 +19,9 @@ cdef class gpumemoryview: "gpumemoryview must be constructed from an object supporting " "the CUDA array interface" ) - self._obj = obj - self._released = False + self.obj = obj # TODO: Need to respect readonly self.ptr = cai["data"][0] def __cuda_array_interface__(self): return self.obj.__cuda_array_interface__ - - @property - def obj(self): - if not self._released: - return self._obj - else: - raise ValueError("operation forbidden on released gpumemoryview object") - - cpdef release(self): - self._obj = None - self._released = True diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pxd b/python/cudf/cudf/_lib/pylibcudf/interop.pxd deleted file mode 100644 index c1268b8ca1a..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pxd +++ /dev/null @@ -1,26 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -from pyarrow.lib cimport Scalar as pa_Scalar, Table as pa_Table - -from cudf._lib.cpp.interop cimport column_metadata - -from .scalar cimport Scalar -from .table cimport Table - - -cdef class ColumnMetadata: - cdef public object name - cdef public object children_meta - cdef column_metadata to_c_metadata(self) - -cpdef Table from_arrow( - pa_Table pyarrow_table, -) - -cpdef Scalar from_arrow_scalar( - pa_Scalar pyarrow_scalar, -) - -cpdef pa_Table to_arrow(Table tbl, list metadata) - -cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/interop.pyx b/python/cudf/cudf/_lib/pylibcudf/interop.pyx deleted file mode 100644 index 9e6b44b117d..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/interop.pyx +++ /dev/null @@ -1,98 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -from cython.operator cimport dereference -from libcpp.memory cimport shared_ptr, unique_ptr -from libcpp.utility cimport move -from libcpp.vector cimport vector -from pyarrow.lib cimport ( - CScalar as pa_CScalar, - CTable as pa_CTable, - Scalar as pa_Scalar, - Table as pa_Table, - pyarrow_unwrap_scalar, - pyarrow_unwrap_table, - pyarrow_wrap_scalar, - pyarrow_wrap_table, -) - -from cudf._lib.cpp.interop cimport ( - column_metadata, - from_arrow as cpp_from_arrow, - to_arrow as cpp_to_arrow, -) -from cudf._lib.cpp.scalar.scalar cimport scalar -from cudf._lib.cpp.table.table cimport table - -from .scalar cimport Scalar -from .table cimport Table - - -cdef class ColumnMetadata: - def __init__(self, name): - self.name = name - self.children_meta = [] - - cdef column_metadata to_c_metadata(self): - """Convert to C++ column_metadata. - - Since this class is mutable and cheap, it is easier to create the C++ - object on the fly rather than have it directly backing the storage for - the Cython class. - """ - cdef column_metadata c_metadata - cdef ColumnMetadata child_meta - c_metadata.name = self.name.encode() - for child_meta in self.children_meta: - c_metadata.children_meta.push_back(child_meta.to_c_metadata()) - return c_metadata - - -cpdef Table from_arrow( - pa_Table pyarrow_table, -): - cdef shared_ptr[pa_CTable] ctable = ( - pyarrow_unwrap_table(pyarrow_table) - ) - cdef unique_ptr[table] c_result - - with nogil: - c_result = move(cpp_from_arrow(ctable.get()[0])) - - return Table.from_libcudf(move(c_result)) - - -cpdef Scalar from_arrow_scalar( - pa_Scalar pyarrow_scalar, -): - cdef shared_ptr[pa_CScalar] cscalar = ( - pyarrow_unwrap_scalar(pyarrow_scalar) - ) - cdef unique_ptr[scalar] c_result - - with nogil: - c_result = move(cpp_from_arrow(cscalar.get()[0])) - - return Scalar.from_libcudf(move(c_result)) - - -cpdef pa_Table to_arrow(Table tbl, list metadata): - cdef shared_ptr[pa_CTable] c_result - cdef vector[column_metadata] c_metadata - cdef ColumnMetadata meta - for meta in metadata: - c_metadata.push_back(meta.to_c_metadata()) - - with nogil: - c_result = move(cpp_to_arrow(tbl.view(), c_metadata)) - - return pyarrow_wrap_table(c_result) - - -cpdef pa_Scalar to_arrow_scalar(Scalar slr, ColumnMetadata metadata): - cdef shared_ptr[pa_CScalar] c_result - cdef column_metadata c_metadata = metadata.to_c_metadata() - - with nogil: - c_result = move(cpp_to_arrow(dereference(slr.c_obj.get()), c_metadata)) - - return pyarrow_wrap_scalar(c_result) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd b/python/cudf/cudf/_lib/pylibcudf/scalar.pxd deleted file mode 100644 index d20c65f0be0..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pxd +++ /dev/null @@ -1,32 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -from libcpp cimport bool -from libcpp.memory cimport unique_ptr - -from rmm._lib.memory_resource cimport DeviceMemoryResource - -from cudf._lib.cpp.scalar.scalar cimport scalar - -from .types cimport DataType - - -cdef class Scalar: - cdef unique_ptr[scalar] c_obj - cdef DataType _data_type - - # Holds a reference to the DeviceMemoryResource used for allocation. - # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is - # needed for deallocation - cdef DeviceMemoryResource mr - - cdef const scalar* get(self) except * - - cpdef DataType type(self) - cpdef bool is_valid(self) - - @staticmethod - cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*) - - # TODO: Make sure I'm correct to avoid typing the metadata as - # ColumnMetadata, I assume that will cause circular cimport problems - cpdef to_pyarrow_scalar(self, metadata) diff --git a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx b/python/cudf/cudf/_lib/pylibcudf/scalar.pyx deleted file mode 100644 index d06bb9adeb9..00000000000 --- a/python/cudf/cudf/_lib/pylibcudf/scalar.pyx +++ /dev/null @@ -1,119 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -cimport pyarrow.lib -from cython cimport no_gc_clear -from libcpp.memory cimport unique_ptr - -import pyarrow.lib - -from rmm._lib.memory_resource cimport get_current_device_resource - -from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar -from cudf._lib.cpp.wrappers.decimals cimport ( - decimal32, - decimal64, - decimal128, - scale_type, -) - -from .types cimport DataType, type_id - - -# The DeviceMemoryResource attribute could be released prematurely -# by the gc if the DeviceScalar is in a reference cycle. Removing -# the tp_clear function with the no_gc_clear decoration prevents that. -# See https://github.com/rapidsai/rmm/pull/931 for details. -@no_gc_clear -cdef class Scalar: - """A scalar value in device memory.""" - # Unlike for columns, libcudf does not support scalar views. All APIs that - # accept scalar values accept references to the owning object rather than a - # special view type. As a result, pylibcudf.Scalar has a simpler structure - # than pylibcudf.Column because it can be a true wrapper around a libcudf - # column - - def __cinit__(self, *args, **kwargs): - self.mr = get_current_device_resource() - - def __init__(self, pyarrow.lib.Scalar value=None): - # TODO: This case is not something we really want to - # support, but it here for now to ease the transition of - # DeviceScalar. - if value is not None: - raise ValueError("Scalar should be constructed with a factory") - - @staticmethod - def from_pyarrow_scalar(pyarrow.lib.Scalar value, DataType data_type=None): - # Allow passing a dtype, but only for the purpose of decimals for now - - # Need a local import here to avoid a circular dependency because - # from_arrow_scalar returns a Scalar. - from .interop import from_arrow_scalar - - cdef Scalar s = from_arrow_scalar(value) - if s.type().id() != type_id.DECIMAL128: - if data_type is not None: - raise ValueError( - "dtype may not be passed for non-decimal types" - ) - return s - - if data_type is None: - raise ValueError( - "Decimal scalars must be constructed with a dtype" - ) - - cdef type_id tid = data_type.id() - if tid not in (type_id.DECIMAL32, type_id.DECIMAL64, type_id.DECIMAL128): - raise ValueError( - "Decimal scalars may only be cast to decimals" - ) - - if tid == type_id.DECIMAL128: - return s - - if tid == type_id.DECIMAL32: - s.c_obj.reset( - new fixed_point_scalar[decimal32]( - ( s.c_obj.get()).value(), - scale_type(-value.type.scale), - s.c_obj.get().is_valid() - ) - ) - elif tid == type_id.DECIMAL64: - s.c_obj.reset( - new fixed_point_scalar[decimal64]( - ( s.c_obj.get()).value(), - scale_type(-value.type.scale), - s.c_obj.get().is_valid() - ) - ) - return s - - cpdef to_pyarrow_scalar(self, metadata): - from .interop import to_arrow_scalar - return to_arrow_scalar(self, metadata) - - cdef const scalar* get(self) except *: - return self.c_obj.get() - - cpdef DataType type(self): - """The type of data in the column.""" - return self._data_type - - cpdef bool is_valid(self): - """True if the scalar is valid, false if not""" - return self.get().is_valid() - - @staticmethod - cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=None): - """Construct a Scalar object from a libcudf scalar. - - This method is for pylibcudf's functions to use to ingest outputs of - calling libcudf algorithms, and should generally not be needed by users - (even direct pylibcudf Cython users). - """ - cdef Scalar s = Scalar.__new__(Scalar) - s.c_obj.swap(libcudf_scalar) - s._data_type = DataType.from_libcudf(s.get().type()) - return s diff --git a/python/cudf/cudf/_lib/scalar.pxd b/python/cudf/cudf/_lib/scalar.pxd index ae1d350edc6..1deed60d67d 100644 --- a/python/cudf/cudf/_lib/scalar.pxd +++ b/python/cudf/cudf/_lib/scalar.pxd @@ -1,16 +1,20 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. from libcpp cimport bool from libcpp.memory cimport unique_ptr from rmm._lib.memory_resource cimport DeviceMemoryResource -from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport scalar cdef class DeviceScalar: - cdef pylibcudf.Scalar c_value + cdef unique_ptr[scalar] c_value + + # Holds a reference to the DeviceMemoryResource used for allocation. + # Ensures the MR does not get destroyed before this DeviceBuffer. `mr` is + # needed for deallocation + cdef DeviceMemoryResource mr cdef object _dtype diff --git a/python/cudf/cudf/_lib/scalar.pyx b/python/cudf/cudf/_lib/scalar.pyx index 30d46bd3fe2..0407785b2d8 100644 --- a/python/cudf/cudf/_lib/scalar.pyx +++ b/python/cudf/cudf/_lib/scalar.pyx @@ -1,6 +1,7 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -import copy +cimport cython + import decimal import numpy as np @@ -21,17 +22,14 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr from libcpp.utility cimport move -import cudf -from cudf._lib.pylibcudf.interop import ColumnMetadata - -from cudf._lib.pylibcudf.types cimport type_id +from rmm._lib.memory_resource cimport get_current_device_resource +import cudf from cudf._lib.types import ( LIBCUDF_TO_SUPPORTED_NUMPY_TYPES, datetime_unit_map, duration_unit_map, ) -from cudf.api.types import is_list_dtype, is_struct_dtype from cudf.core.dtypes import ListDtype, StructDtype from cudf.core.missing import NA, NaT @@ -40,10 +38,9 @@ from cudf._lib.cpp.column.column_view cimport column_view from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.types cimport dtype_from_column_view, underlying_type_t_type_id -from cudf._lib.interop import to_arrow +from cudf._lib.interop import from_arrow, to_arrow cimport cudf._lib.cpp.types as libcudf_types -from cudf._lib cimport pylibcudf from cudf._lib.cpp.scalar.scalar cimport ( duration_scalar, fixed_point_scalar, @@ -54,7 +51,12 @@ from cudf._lib.cpp.scalar.scalar cimport ( struct_scalar, timestamp_scalar, ) -from cudf._lib.cpp.wrappers.decimals cimport decimal32, decimal64, decimal128 +from cudf._lib.cpp.wrappers.decimals cimport ( + decimal32, + decimal64, + decimal128, + scale_type, +) from cudf._lib.cpp.wrappers.durations cimport ( duration_ms, duration_ns, @@ -67,67 +69,18 @@ from cudf._lib.cpp.wrappers.timestamps cimport ( timestamp_s, timestamp_us, ) -from cudf._lib.utils cimport columns_from_table_view - - -# TODO: Check if this could replace _nested_na_replace -def _replace_nested_nulls(obj): - if isinstance(obj, list): - for i, item in enumerate(obj): - # TODO: Check if this should use _is_null_host_scalar - if cudf.utils.utils.is_na_like(item): - obj[i] = None - elif isinstance(item, (dict, list)): - _replace_nested_nulls(item) - elif isinstance(obj, dict): - for k, v in obj.items(): - if cudf.utils.utils.is_na_like(v): - obj[k] = None - elif isinstance(v, (dict, list)): - _replace_nested_nulls(v) - - -def _replace_nested_none(obj): - if isinstance(obj, list): - for i, item in enumerate(obj): - if item is None: - obj[i] = NA - elif isinstance(item, (dict, list)): - _replace_nested_none(item) - elif isinstance(obj, dict): - for k, v in obj.items(): - if v is None: - obj[k] = NA - elif isinstance(v, (dict, list)): - _replace_nested_none(v) - - -def gather_metadata(dtypes): - # dtypes is a dict mapping names to column dtypes - # This interface is a bit clunky, but it matches libcudf. May want to - # consider better approaches to building up the metadata eventually. - out = [] - for name, dtype in dtypes.items(): - v = ColumnMetadata(name) - if is_struct_dtype(dtype): - v.children_meta = gather_metadata(dtype.fields) - elif is_list_dtype(dtype): - # Offsets column is unnamed and has no children - v.children_meta.append(ColumnMetadata("")) - v.children_meta.extend( - gather_metadata({"": dtype.element_type}) - ) - out.append(v) - return out +from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns +# The DeviceMemoryResource attribute could be released prematurely +# by the gc if the DeviceScalar is in a reference cycle. Removing +# the tp_clear function with the no_gc_clear decoration prevents that. +# See https://github.com/rapidsai/rmm/pull/931 for details. +@cython.no_gc_clear cdef class DeviceScalar: - # I think this should be removable, except that currently the way that - # from_unique_ptr is implemented is probably dereferencing this in an - # invalid state. See what the best way to fix that is. def __cinit__(self, *args, **kwargs): - self.c_value = pylibcudf.Scalar() + self.mr = get_current_device_resource() def __init__(self, value, dtype): """ @@ -143,102 +96,63 @@ cdef class DeviceScalar: dtype : dtype A NumPy dtype. """ - dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') + self._dtype = dtype if dtype.kind != 'U' else cudf.dtype('object') + self._set_value(value, self._dtype) - if cudf.utils.utils.is_na_like(value): - value = None - else: - # TODO: For now we always deepcopy the input value to avoid - # overwriting the input values when replacing nulls. Since it's - # just host values it's not that expensive, but we could consider - # alternatives. - value = copy.deepcopy(value) - _replace_nested_nulls(value) - - if isinstance(dtype, cudf.core.dtypes._BaseDtype): - pa_type = dtype.to_arrow() - elif pd.api.types.is_string_dtype(dtype): - # Have to manually convert object types, which we use internally - # for strings but pyarrow only supports as unicode 'U' - pa_type = pa.string() - else: - pa_type = pa.from_numpy_dtype(dtype) - - pa_scalar = pa.scalar(value, type=pa_type) + def _set_value(self, value, dtype): + # IMPORTANT: this should only ever be called from __init__ + valid = not _is_null_host_scalar(value) - cdef type_id tid - data_type = None if isinstance(dtype, cudf.core.dtypes.DecimalDtype): - tid = type_id.DECIMAL128 - if isinstance(dtype, cudf.core.dtypes.Decimal32Dtype): - tid = type_id.DECIMAL32 - elif isinstance(dtype, cudf.core.dtypes.Decimal64Dtype): - tid = type_id.DECIMAL64 - data_type = pylibcudf.DataType(tid, -dtype.scale) - - self.c_value = pylibcudf.Scalar.from_pyarrow_scalar(pa_scalar, data_type) - self._dtype = dtype + _set_decimal_from_scalar( + self.c_value, value, dtype, valid) + elif isinstance(dtype, cudf.ListDtype): + _set_list_from_pylist( + self.c_value, value, dtype, valid) + elif isinstance(dtype, cudf.StructDtype): + _set_struct_from_pydict(self.c_value, value, dtype, valid) + elif pd.api.types.is_string_dtype(dtype): + _set_string_from_np_string(self.c_value, value, valid) + elif pd.api.types.is_numeric_dtype(dtype): + _set_numeric_from_np_scalar(self.c_value, + value, + dtype, + valid) + elif pd.api.types.is_datetime64_dtype(dtype): + _set_datetime64_from_np_scalar( + self.c_value, value, dtype, valid + ) + elif pd.api.types.is_timedelta64_dtype(dtype): + _set_timedelta64_from_np_scalar( + self.c_value, value, dtype, valid + ) + else: + raise ValueError( + f"Cannot convert value of type " + f"{type(value).__name__} to cudf scalar" + ) def _to_host_scalar(self): - metadata = gather_metadata({"": self.dtype})[0] if isinstance(self.dtype, cudf.core.dtypes.DecimalDtype): - ps = self.c_value.to_pyarrow_scalar(metadata) - if not ps.is_valid: - return NA - return ps.as_py() - # For nested types we should eventually account for the special cases - # that we handle for other types, e.g. numerics being casted to numpy - # types or datetime/timedelta needing to be cast to int64 to handle - # overflow. However, the old implementation didn't handle these cases - # either, so we can leave that for a follow-up PR. + result = _get_py_decimal_from_fixed_point(self.c_value) elif cudf.api.types.is_struct_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar(metadata) - if not ps.is_valid: - return NA - ret = ps.as_py() - _replace_nested_none(ret) - return ret + result = _get_py_dict_from_struct(self.c_value, self.dtype) elif cudf.api.types.is_list_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar(metadata) - if not ps.is_valid: - return NA - ret = ps.as_py() - _replace_nested_none(ret) - return ret + result = _get_py_list_from_list(self.c_value, self.dtype) elif pd.api.types.is_string_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar(metadata) - if not ps.is_valid: - return NA - return ps.as_py() + result = _get_py_string_from_string(self.c_value) elif pd.api.types.is_numeric_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar(metadata) - if not ps.is_valid: - return NA - return ps.type.to_pandas_dtype()(ps.as_py()) + result = _get_np_scalar_from_numeric(self.c_value) elif pd.api.types.is_datetime64_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar(metadata) - if not ps.is_valid: - return NaT - time_unit, _ = np.datetime_data(self.dtype) - # Cast to int64 to avoid overflow - return np.datetime64(ps.cast('int64').as_py(), time_unit) + result = _get_np_scalar_from_timestamp64(self.c_value) elif pd.api.types.is_timedelta64_dtype(self.dtype): - ps = self.c_value.to_pyarrow_scalar(metadata) - if not ps.is_valid: - return NaT - time_unit, _ = np.datetime_data(self.dtype) - # Cast to int64 to avoid overflow - return np.timedelta64(ps.cast('int64').as_py(), time_unit) + result = _get_np_scalar_from_timedelta64(self.c_value) else: raise ValueError( "Could not convert cudf::scalar to a Python value" ) return result - # TODO: This is just here for testing and should be removed. - def get(self): - return self.c_value - @property def dtype(self): """ @@ -255,13 +169,13 @@ cdef class DeviceScalar: return self._to_host_scalar() cdef const scalar* get_raw_ptr(self) except *: - return self.c_value.c_obj.get() + return self.c_value.get() cpdef bool is_valid(self): """ Returns if the Scalar is valid or not(i.e., ). """ - return self.c_value.is_valid() + return self.get_raw_ptr()[0].is_valid() def __repr__(self): if cudf.utils.utils.is_na_like(self.value): @@ -280,7 +194,7 @@ cdef class DeviceScalar: cdef DeviceScalar s = DeviceScalar.__new__(DeviceScalar) cdef libcudf_types.data_type cdtype - s.c_value = pylibcudf.Scalar.from_libcudf(move(ptr)) + s.c_value = move(ptr) cdtype = s.get_raw_ptr()[0].type() if dtype is not None: @@ -322,6 +236,42 @@ cdef class DeviceScalar: return s +cdef _set_string_from_np_string(unique_ptr[scalar]& s, value, bool valid=True): + value = value if valid else "" + s.reset(new string_scalar(value.encode(), valid)) + + +cdef _set_numeric_from_np_scalar(unique_ptr[scalar]& s, + object value, + object dtype, + bool valid=True): + value = value if valid else 0 + if dtype == "int8": + s.reset(new numeric_scalar[int8_t](value, valid)) + elif dtype == "int16": + s.reset(new numeric_scalar[int16_t](value, valid)) + elif dtype == "int32": + s.reset(new numeric_scalar[int32_t](value, valid)) + elif dtype == "int64": + s.reset(new numeric_scalar[int64_t](value, valid)) + elif dtype == "uint8": + s.reset(new numeric_scalar[uint8_t](value, valid)) + elif dtype == "uint16": + s.reset(new numeric_scalar[uint16_t](value, valid)) + elif dtype == "uint32": + s.reset(new numeric_scalar[uint32_t](value, valid)) + elif dtype == "uint64": + s.reset(new numeric_scalar[uint64_t](value, valid)) + elif dtype == "float32": + s.reset(new numeric_scalar[float](value, valid)) + elif dtype == "float64": + s.reset(new numeric_scalar[double](value, valid)) + elif dtype == "bool": + s.reset(new numeric_scalar[bool](value, valid)) + else: + raise ValueError(f"dtype not supported: {dtype}") + + cdef _set_datetime64_from_np_scalar(unique_ptr[scalar]& s, object value, object dtype, @@ -374,6 +324,62 @@ cdef _set_timedelta64_from_np_scalar(unique_ptr[scalar]& s, else: raise ValueError(f"dtype not supported: {dtype}") +cdef _set_decimal_from_scalar(unique_ptr[scalar]& s, + object value, + object dtype, + bool valid=True): + value = cudf.utils.dtypes._decimal_to_int64(value) if valid else 0 + if isinstance(dtype, cudf.Decimal64Dtype): + s.reset( + new fixed_point_scalar[decimal64]( + np.int64(value), scale_type(-dtype.scale), valid + ) + ) + elif isinstance(dtype, cudf.Decimal32Dtype): + s.reset( + new fixed_point_scalar[decimal32]( + np.int32(value), scale_type(-dtype.scale), valid + ) + ) + elif isinstance(dtype, cudf.Decimal128Dtype): + s.reset( + new fixed_point_scalar[decimal128]( + value, scale_type(-dtype.scale), valid + ) + ) + else: + raise ValueError(f"dtype not supported: {dtype}") + +cdef _set_struct_from_pydict(unique_ptr[scalar]& s, + object value, + object dtype, + bool valid=True): + arrow_schema = dtype.to_arrow() + columns = [str(i) for i in range(len(arrow_schema))] + if valid: + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([value[f.name]], from_pandas=True, type=f.type) + for f in arrow_schema + ], + names=columns + ) + else: + pyarrow_table = pa.Table.from_arrays( + [ + pa.array([NA], from_pandas=True, type=f.type) + for f in arrow_schema + ], + names=columns + ) + + data = from_arrow(pyarrow_table) + cdef table_view struct_view = table_view_from_columns(data) + + s.reset( + new struct_scalar(struct_view, valid) + ) + cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype): if not s.get()[0].is_valid(): return NA @@ -389,6 +395,25 @@ cdef _get_py_dict_from_struct(unique_ptr[scalar]& s, dtype): python_dict = table.to_pydict()["None"][0] return {k: _nested_na_replace([python_dict[k]])[0] for k in python_dict} +cdef _set_list_from_pylist(unique_ptr[scalar]& s, + object value, + object dtype, + bool valid=True): + + value = value if valid else [NA] + cdef Column col + if isinstance(dtype.element_type, ListDtype): + pa_type = dtype.element_type.to_arrow() + else: + pa_type = dtype.to_arrow().value_type + col = cudf.core.column.as_column( + pa.array(value, from_pandas=True, type=pa_type) + ) + cdef column_view col_view = col.view() + s.reset( + new list_scalar(col_view, valid) + ) + cdef _get_py_list_from_list(unique_ptr[scalar]& s, dtype): @@ -576,9 +601,9 @@ def _create_proxy_nat_scalar(dtype): if dtype.char in 'mM': nat = dtype.type('NaT').astype(dtype) if dtype.type == np.datetime64: - _set_datetime64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) + _set_datetime64_from_np_scalar(result.c_value, nat, dtype, True) elif dtype.type == np.timedelta64: - _set_timedelta64_from_np_scalar(result.c_value.c_obj, nat, dtype, True) + _set_timedelta64_from_np_scalar(result.c_value, nat, dtype, True) return result else: raise TypeError('NAT only valid for datetime and timedelta') diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index ac10dd97c56..5dd58d8a875 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -895,14 +895,14 @@ def test_memory_usage(): "data, idx", [ ( - [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": NA}]], + [[{"f2": {"a": 100}, "f1": "a"}, {"f1": "sf12", "f2": None}]], 0, ), ( [ [ {"f2": {"a": 100, "c": 90, "f2": 10}, "f1": "a"}, - {"f1": "sf12", "f2": NA}, + {"f1": "sf12", "f2": None}, ] ], 0, diff --git a/python/cudf/cudf/tests/test_struct.py b/python/cudf/cudf/tests/test_struct.py index ce6dc587320..a3593e55b97 100644 --- a/python/cudf/cudf/tests/test_struct.py +++ b/python/cudf/cudf/tests/test_struct.py @@ -150,7 +150,9 @@ def test_struct_setitem(data, item): "data", [ {"a": 1, "b": "rapids", "c": [1, 2, 3, 4]}, + {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, {"a": "Hello"}, + {"b": [], "c": [1, 2, 3]}, ], ) def test_struct_scalar_host_construction(data): @@ -159,39 +161,6 @@ def test_struct_scalar_host_construction(data): assert list(slr.device_value.value.values()) == list(data.values()) -@pytest.mark.parametrize( - ("data", "dtype"), - [ - ( - {"a": 1, "b": "rapids", "c": [1, 2, 3, 4], "d": cudf.NA}, - cudf.StructDtype( - { - "a": np.dtype(np.int64), - "b": np.dtype(np.str_), - "c": cudf.ListDtype(np.dtype(np.int64)), - "d": np.dtype(np.int64), - } - ), - ), - ( - {"b": [], "c": [1, 2, 3]}, - cudf.StructDtype( - { - "b": cudf.ListDtype(np.dtype(np.int64)), - "c": cudf.ListDtype(np.dtype(np.int64)), - } - ), - ), - ], -) -def test_struct_scalar_host_construction_no_dtype_inference(data, dtype): - # cudf cannot infer the dtype of the scalar when it contains only nulls or - # is empty. - slr = cudf.Scalar(data, dtype=dtype) - assert slr.value == data - assert list(slr.device_value.value.values()) == list(data.values()) - - def test_struct_scalar_null(): slr = cudf.Scalar(cudf.NA, dtype=StructDtype) assert slr.device_value.value is cudf.NA diff --git a/python/cudf/cudf/utils/dtypes.py b/python/cudf/cudf/utils/dtypes.py index 73ea8e2cfc4..1b94db75340 100644 --- a/python/cudf/cudf/utils/dtypes.py +++ b/python/cudf/cudf/utils/dtypes.py @@ -463,6 +463,24 @@ def _get_nan_for_dtype(dtype): return np.float64("nan") +def _decimal_to_int64(decimal: Decimal) -> int: + """ + Scale a Decimal such that the result is the integer + that would result from removing the decimal point. + + Examples + -------- + >>> _decimal_to_int64(Decimal('1.42')) + 142 + >>> _decimal_to_int64(Decimal('0.0042')) + 42 + >>> _decimal_to_int64(Decimal('-1.004201')) + -1004201 + + """ + return int(f"{decimal:0f}".replace(".", "")) + + def get_allowed_combinations_for_operator(dtype_l, dtype_r, op): error = TypeError( f"{op} not supported between {dtype_l} and {dtype_r} scalars" From 9b7f2402a3ca3db193a99cec7a92a0474c4b9b8c Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Sun, 17 Sep 2023 18:06:35 -0700 Subject: [PATCH 48/54] Switch back to older implementation of writing decimal bytes directly without needing intermediates --- cpp/src/interop/to_arrow.cu | 80 +++++++++++++++++-------------------- 1 file changed, 36 insertions(+), 44 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 186f3798154..0673d8e8ec2 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -143,18 +143,22 @@ struct dispatch_to_arrow { } }; -template <> -std::shared_ptr dispatch_to_arrow::operator()( - column_view input, - cudf::type_id, - column_metadata const&, - arrow::MemoryPool* ar_mr, - rmm::cuda_stream_view stream) +// Convert decimal types from libcudf to arrow where those types are not +// directly supported by Arrow. These types must be fit into 128 bits, the +// smallest decimal resolution supported by Arrow. +template +std::shared_ptr unsupported_decimals_to_arrow(column_view input, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) { - using DeviceType = int32_t; - size_type const BIT_WIDTH_RATIO = 4; // Array::Type:type::DECIMAL (128) / int32_t + constexpr size_type BIT_WIDTH_RATIO = size_type{128} / sizeof(DeviceType); + + // libcudf decimals are always converted to the highest Arrow precision + // because libcudf does not support precision. + constexpr auto decimal128_max_precision = int32_t{38}; + constexpr auto precision = decimal128_max_precision / BIT_WIDTH_RATIO; - rmm::device_uvector<__int128_t> buf(input.size() * BIT_WIDTH_RATIO, stream); + rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); auto count = thrust::make_counting_iterator(0); @@ -162,11 +166,15 @@ std::shared_ptr dispatch_to_arrow::operator()( count, count + input.size(), [in = input.begin(), out = buf.data()] __device__(auto in_idx) { - auto const out_idx = in_idx; - auto unsigned_value = in[in_idx] < 0 ? -in[in_idx] : in[in_idx]; - auto unsigned_128bit = static_cast<__int128_t>(unsigned_value); - auto signed_128bit = in[in_idx] < 0 ? -unsigned_128bit : unsigned_128bit; - out[out_idx] = signed_128bit; + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply match the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } }); auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); @@ -175,7 +183,7 @@ std::shared_ptr dispatch_to_arrow::operator()( CUDF_CUDA_TRY(cudaMemcpyAsync( data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - auto type = arrow::decimal(9, -input.type().scale()); + auto type = arrow::decimal(precision, -input.type().scale()); auto mask = fetch_mask_buffer(input, ar_mr, stream); auto buffers = std::vector>{mask, std::move(data_buffer)}; auto data = std::make_shared(type, input.size(), buffers); @@ -184,41 +192,25 @@ std::shared_ptr dispatch_to_arrow::operator()( } template <> -std::shared_ptr dispatch_to_arrow::operator()( +std::shared_ptr dispatch_to_arrow::operator()( column_view input, cudf::type_id, column_metadata const&, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - using DeviceType = int64_t; - size_type const BIT_WIDTH_RATIO = 2; // Array::Type:type::DECIMAL (128) / int64_t - - rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); - - auto count = thrust::make_counting_iterator(0); - - thrust::for_each(rmm::exec_policy(cudf::get_default_stream()), - count, - count + input.size(), - [in = input.begin(), out = buf.data()] __device__(auto in_idx) { - auto const out_idx = in_idx * 2; - out[out_idx] = in[in_idx]; - out[out_idx + 1] = in[in_idx] < 0 ? -1 : 0; - }); - - auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); - auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); - - CUDF_CUDA_TRY(cudaMemcpyAsync( - data_buffer->mutable_data(), buf.data(), buf_size_in_bytes, cudaMemcpyDefault, stream.value())); - - auto type = arrow::decimal(18, -input.type().scale()); - auto mask = fetch_mask_buffer(input, ar_mr, stream); - auto buffers = std::vector>{mask, std::move(data_buffer)}; - auto data = std::make_shared(type, input.size(), buffers); + return unsupported_decimals_to_arrow(input, ar_mr, stream); +} - return std::make_shared(data); +template <> +std::shared_ptr dispatch_to_arrow::operator()( + column_view input, + cudf::type_id, + column_metadata const&, + arrow::MemoryPool* ar_mr, + rmm::cuda_stream_view stream) +{ + return unsupported_decimals_to_arrow(input, ar_mr, stream); } template <> From 3d4ac776296ff70f7b2ff220e4359e8e57417a0b Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Sep 2023 08:39:38 -0700 Subject: [PATCH 49/54] Specify unroll --- cpp/src/interop/to_arrow.cu | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 0673d8e8ec2..6524d58815d 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -162,20 +162,21 @@ std::shared_ptr unsupported_decimals_to_arrow(column_view input, auto count = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(cudf::get_default_stream()), - count, - count + input.size(), - [in = input.begin(), out = buf.data()] __device__(auto in_idx) { - auto const out_idx = in_idx * BIT_WIDTH_RATIO; - // The lowest order bits are the value, the remainder - // simply match the sign bit to satisfy the two's - // complement integer representation of negative numbers. - out[out_idx] = in[in_idx]; -#pragma unroll - for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { - out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; - } - }); + thrust::for_each( + rmm::exec_policy(cudf::get_default_stream()), + count, + count + input.size(), + [in = input.begin(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { + auto const out_idx = in_idx * BIT_WIDTH_RATIO; + // The lowest order bits are the value, the remainder + // simply match the sign bit to satisfy the two's + // complement integer representation of negative numbers. + out[out_idx] = in[in_idx]; +#pragma unroll BIT_WIDTH_RATIO - 1 + for (auto i = 1; i < BIT_WIDTH_RATIO; ++i) { + out[out_idx + i] = in[in_idx] < 0 ? -1 : 0; + } + }); auto const buf_size_in_bytes = buf.size() * sizeof(DeviceType); auto data_buffer = allocate_arrow_buffer(buf_size_in_bytes, ar_mr); From 93116633a15de46e74c7ee060896bea96334b614 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Sep 2023 08:45:10 -0700 Subject: [PATCH 50/54] Remove unnecessary includes --- cpp/src/interop/from_arrow.cu | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 6e0ef462ac9..ecdb0933b2b 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -13,10 +13,6 @@ * See the License for the specific language governing permissions and * limitations under the License. */ -#include -#include -#include -#include #include #include #include @@ -39,7 +35,6 @@ #include #include -#include #include namespace cudf { From 431321298f53bcc8882c8b425b3d864c26430e0d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Sep 2023 10:26:21 -0700 Subject: [PATCH 51/54] Pass in precision instead of computing --- cpp/src/interop/to_arrow.cu | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 6524d58815d..7beec1b9879 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -148,15 +148,11 @@ struct dispatch_to_arrow { // smallest decimal resolution supported by Arrow. template std::shared_ptr unsupported_decimals_to_arrow(column_view input, + int32_t precision, arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - constexpr size_type BIT_WIDTH_RATIO = size_type{128} / sizeof(DeviceType); - - // libcudf decimals are always converted to the highest Arrow precision - // because libcudf does not support precision. - constexpr auto decimal128_max_precision = int32_t{38}; - constexpr auto precision = decimal128_max_precision / BIT_WIDTH_RATIO; + constexpr size_type BIT_WIDTH_RATIO = sizeof(__int128_t) / sizeof(DeviceType); rmm::device_uvector buf(input.size() * BIT_WIDTH_RATIO, stream); @@ -200,7 +196,7 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return unsupported_decimals_to_arrow(input, ar_mr, stream); + return unsupported_decimals_to_arrow(input, 9, ar_mr, stream); } template <> @@ -211,7 +207,7 @@ std::shared_ptr dispatch_to_arrow::operator()( arrow::MemoryPool* ar_mr, rmm::cuda_stream_view stream) { - return unsupported_decimals_to_arrow(input, ar_mr, stream); + return unsupported_decimals_to_arrow(input, 18, ar_mr, stream); } template <> From 2aade20da1f0b6ecd1d4beabd04baf1bc737423d Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Sep 2023 17:31:43 -0700 Subject: [PATCH 52/54] Address reviews --- cpp/include/cudf/detail/interop.hpp | 3 +- cpp/src/interop/from_arrow.cu | 2 +- cpp/src/interop/to_arrow.cu | 12 ++--- cpp/tests/interop/from_arrow_test.cpp | 47 ++++++++++---------- cpp/tests/interop/to_arrow_test.cpp | 63 ++++++++++++++------------- 5 files changed, 64 insertions(+), 63 deletions(-) diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp index 6a294b1d360..44024333239 100644 --- a/cpp/include/cudf/detail/interop.hpp +++ b/cpp/include/cudf/detail/interop.hpp @@ -109,7 +109,7 @@ std::shared_ptr to_arrow_array(cudf::type_id id, Ts&&... args) * the specified `arrow::DataType`'s `id()`. * * This function is analogous to libcudf's type_dispatcher, but instead applies - * to Arrow functions. It's primary use case is to leverage Arrow's + * to Arrow functions. Its primary use case is to leverage Arrow's * metaprogramming facilities like arrow::TypeTraits that require translating * the runtime dtype information into compile-time types. */ @@ -177,7 +177,6 @@ std::shared_ptr to_arrow(table_view input, */ std::shared_ptr to_arrow(cudf::scalar const& input, column_metadata const& metadata, - rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr); /** diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index ecdb0933b2b..aa7a70719a6 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -434,7 +434,7 @@ struct BuilderGenerator { std::is_same_v)> std::shared_ptr operator()(std::shared_ptr const& type) { - CUDF_FAIL("Type not support for BuilderGenerator"); + CUDF_FAIL("Type not supported by BuilderGenerator"); } }; diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 7beec1b9879..66c4bd2cb6c 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -165,7 +165,7 @@ std::shared_ptr unsupported_decimals_to_arrow(column_view input, [in = input.begin(), out = buf.data(), BIT_WIDTH_RATIO] __device__(auto in_idx) { auto const out_idx = in_idx * BIT_WIDTH_RATIO; // The lowest order bits are the value, the remainder - // simply match the sign bit to satisfy the two's + // simply matches the sign bit to satisfy the two's // complement integer representation of negative numbers. out[out_idx] = in[in_idx]; #pragma unroll BIT_WIDTH_RATIO - 1 @@ -442,11 +442,11 @@ std::shared_ptr to_arrow(cudf::scalar const& input, rmm::cuda_stream_view stream, arrow::MemoryPool* ar_mr) { - auto column = cudf::make_column_from_scalar(input, 1, stream); - cudf::table_view tv{{column->view()}}; - auto arrow_table = cudf::to_arrow(tv, {metadata}, stream); - auto ac = arrow_table->column(0); - auto maybe_scalar = ac->GetScalar(0); + auto const column = cudf::make_column_from_scalar(input, 1, stream); + cudf::table_view const tv{{column->view()}}; + auto const arrow_table = cudf::to_arrow(tv, {metadata}, stream); + auto const ac = arrow_table->column(0); + auto const maybe_scalar = ac->GetScalar(0); if (!maybe_scalar.ok()) { CUDF_FAIL("Failed to produce a scalar"); } return maybe_scalar.ValueOrDie(); } diff --git a/cpp/tests/interop/from_arrow_test.cpp b/cpp/tests/interop/from_arrow_test.cpp index 5305b0103d8..a898106a5b2 100644 --- a/cpp/tests/interop/from_arrow_test.cpp +++ b/cpp/tests/interop/from_arrow_test.cpp @@ -467,9 +467,10 @@ TYPED_TEST_SUITE(FromArrowNumericScalarTest, NumericTypesNotBool); TYPED_TEST(FromArrowNumericScalarTest, Basic) { TypeParam const value{42}; - auto arrow_scalar = arrow::MakeScalar(value); - auto cudf_scalar = cudf::from_arrow(*arrow_scalar); - auto cudf_numeric_scalar = dynamic_cast*>(cudf_scalar.get()); + auto const arrow_scalar = arrow::MakeScalar(value); + auto const cudf_scalar = cudf::from_arrow(*arrow_scalar); + auto const cudf_numeric_scalar = + dynamic_cast*>(cudf_scalar.get()); if (cudf_numeric_scalar == nullptr) { CUDF_FAIL("Attempted to test with a non-numeric type."); } EXPECT_EQ(cudf_numeric_scalar->type(), cudf::data_type(cudf::type_to_id())); EXPECT_EQ(cudf_numeric_scalar->value(), value); @@ -487,7 +488,7 @@ TEST_F(FromArrowDecimalScalarTest, Basic) auto cudf_scalar = cudf::from_arrow(arrow_scalar); // Arrow offers a minimum of 128 bits for the Decimal type. - auto cudf_decimal_scalar = + auto const cudf_decimal_scalar = dynamic_cast*>(cudf_scalar.get()); EXPECT_EQ(cudf_decimal_scalar->type(), cudf::data_type(cudf::type_to_id(), scale)); @@ -498,11 +499,11 @@ struct FromArrowStringScalarTest : public cudf::test::BaseFixture {}; TEST_F(FromArrowStringScalarTest, Basic) { - auto value = std::string("hello world"); - auto arrow_scalar = arrow::StringScalar(value); - auto cudf_scalar = cudf::from_arrow(arrow_scalar); + auto const value = std::string("hello world"); + auto const arrow_scalar = arrow::StringScalar(value); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); - auto cudf_string_scalar = dynamic_cast(cudf_scalar.get()); + auto const cudf_string_scalar = dynamic_cast(cudf_scalar.get()); EXPECT_EQ(cudf_string_scalar->type(), cudf::data_type(cudf::type_id::STRING)); EXPECT_EQ(cudf_string_scalar->to_string(), value); } @@ -515,17 +516,17 @@ TEST_F(FromArrowListScalarTest, Basic) std::vector host_validity = {true, true, true, false, true, true, true}; arrow::Int64Builder builder; - auto status = builder.AppendValues(host_values, host_validity); - auto maybe_array = builder.Finish(); - auto array = *maybe_array; + auto const status = builder.AppendValues(host_values, host_validity); + auto const maybe_array = builder.Finish(); + auto const array = *maybe_array; - auto arrow_scalar = arrow::ListScalar(array); - auto cudf_scalar = cudf::from_arrow(arrow_scalar); + auto const arrow_scalar = arrow::ListScalar(array); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); - auto cudf_list_scalar = dynamic_cast(cudf_scalar.get()); + auto const cudf_list_scalar = dynamic_cast(cudf_scalar.get()); EXPECT_EQ(cudf_list_scalar->type(), cudf::data_type(cudf::type_id::LIST)); - cudf::test::fixed_width_column_wrapper lhs( + cudf::test::fixed_width_column_wrapper const lhs( host_values.begin(), host_values.end(), host_validity.begin()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(lhs, cudf_list_scalar->view()); } @@ -535,18 +536,18 @@ struct FromArrowStructScalarTest : public cudf::test::BaseFixture {}; TEST_F(FromArrowStructScalarTest, Basic) { int64_t const value{42}; - auto underlying_arrow_scalar = arrow::MakeScalar(value); + auto const underlying_arrow_scalar = arrow::MakeScalar(value); - auto field = arrow::field("", underlying_arrow_scalar->type); - auto arrow_type = arrow::struct_({field}); - auto arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); - auto cudf_scalar = cudf::from_arrow(arrow_scalar); + auto const field = arrow::field("", underlying_arrow_scalar->type); + auto const arrow_type = arrow::struct_({field}); + auto const arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + auto const cudf_scalar = cudf::from_arrow(arrow_scalar); - auto cudf_struct_scalar = dynamic_cast(cudf_scalar.get()); + auto const cudf_struct_scalar = dynamic_cast(cudf_scalar.get()); EXPECT_EQ(cudf_struct_scalar->type(), cudf::data_type(cudf::type_id::STRUCT)); - cudf::test::fixed_width_column_wrapper col({value}); - cudf::table_view lhs({col}); + cudf::test::fixed_width_column_wrapper const col({value}); + cudf::table_view const lhs({col}); CUDF_TEST_EXPECT_TABLES_EQUAL(lhs, cudf_struct_scalar->view()); } diff --git a/cpp/tests/interop/to_arrow_test.cpp b/cpp/tests/interop/to_arrow_test.cpp index 2982cdb2760..6bb4cdfd747 100644 --- a/cpp/tests/interop/to_arrow_test.cpp +++ b/cpp/tests/interop/to_arrow_test.cpp @@ -589,12 +589,12 @@ TYPED_TEST_SUITE(ToArrowNumericScalarTest, NumericTypesNotBool); TYPED_TEST(ToArrowNumericScalarTest, Basic) { TypeParam const value{42}; - auto cudf_scalar = cudf::make_fixed_width_scalar(value); + auto const cudf_scalar = cudf::make_fixed_width_scalar(value); - cudf::column_metadata metadata{""}; - auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); - auto ref_arrow_scalar = arrow::MakeScalar(value); + auto const ref_arrow_scalar = arrow::MakeScalar(value); EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); } @@ -607,15 +607,16 @@ TEST_F(ToArrowDecimalScalarTest, Basic) auto const precision{18}; // cudf will convert to the widest-precision Arrow scalar of the type int32_t const scale{4}; - auto cudf_scalar = + auto const cudf_scalar = cudf::make_fixed_point_scalar(value, numeric::scale_type{scale}); - cudf::column_metadata metadata{""}; - auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); - auto maybe_ref_arrow_scalar = arrow::MakeScalar(arrow::decimal128(precision, -scale), value); + auto const maybe_ref_arrow_scalar = + arrow::MakeScalar(arrow::decimal128(precision, -scale), value); if (!maybe_ref_arrow_scalar.ok()) { CUDF_FAIL("Failed to construct reference scalar"); } - auto ref_arrow_scalar = *maybe_ref_arrow_scalar; + auto const ref_arrow_scalar = *maybe_ref_arrow_scalar; EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); } @@ -624,11 +625,11 @@ struct ToArrowStringScalarTest : public cudf::test::BaseFixture {}; TEST_F(ToArrowStringScalarTest, Basic) { std::string const value{"hello world"}; - auto cudf_scalar = cudf::make_string_scalar(value); - cudf::column_metadata metadata{""}; - auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + auto const cudf_scalar = cudf::make_string_scalar(value); + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); - auto ref_arrow_scalar = arrow::MakeScalar(value); + auto const ref_arrow_scalar = arrow::MakeScalar(value); EXPECT_TRUE(arrow_scalar->Equals(*ref_arrow_scalar)); } @@ -636,23 +637,23 @@ struct ToArrowListScalarTest : public cudf::test::BaseFixture {}; TEST_F(ToArrowListScalarTest, Basic) { - std::vector host_values = {1, 2, 3, 5, 6, 7, 8}; - std::vector host_validity = {true, true, true, false, true, true, true}; + std::vector const host_values = {1, 2, 3, 5, 6, 7, 8}; + std::vector const host_validity = {true, true, true, false, true, true, true}; - cudf::test::fixed_width_column_wrapper col( + cudf::test::fixed_width_column_wrapper const col( host_values.begin(), host_values.end(), host_validity.begin()); - auto cudf_scalar = cudf::make_list_scalar(col); + auto const cudf_scalar = cudf::make_list_scalar(col); - cudf::column_metadata metadata{""}; - auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + cudf::column_metadata const metadata{""}; + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); arrow::Int64Builder builder; - auto status = builder.AppendValues(host_values, host_validity); - auto maybe_array = builder.Finish(); - auto array = *maybe_array; + auto const status = builder.AppendValues(host_values, host_validity); + auto const maybe_array = builder.Finish(); + auto const array = *maybe_array; - auto ref_arrow_scalar = arrow::ListScalar(array); + auto const ref_arrow_scalar = arrow::ListScalar(array); EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); } @@ -664,18 +665,18 @@ TEST_F(ToArrowStructScalarTest, Basic) int64_t const value{42}; auto const field_name{"a"}; - cudf::test::fixed_width_column_wrapper col{value}; - cudf::table_view tbl({col}); - auto cudf_scalar = cudf::make_struct_scalar(tbl); + cudf::test::fixed_width_column_wrapper const col{value}; + cudf::table_view const tbl({col}); + auto const cudf_scalar = cudf::make_struct_scalar(tbl); cudf::column_metadata metadata{""}; metadata.children_meta.emplace_back(field_name); - auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); + auto const arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata); - auto underlying_arrow_scalar = arrow::MakeScalar(value); - auto field = arrow::field(field_name, underlying_arrow_scalar->type, false); - auto arrow_type = arrow::struct_({field}); - auto ref_arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); + auto const underlying_arrow_scalar = arrow::MakeScalar(value); + auto const field = arrow::field(field_name, underlying_arrow_scalar->type, false); + auto const arrow_type = arrow::struct_({field}); + auto const ref_arrow_scalar = arrow::StructScalar({underlying_arrow_scalar}, arrow_type); EXPECT_TRUE(arrow_scalar->Equals(ref_arrow_scalar)); } From 37376e0cbc05afbccdd64721c95b76f39ea8d6c3 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 21 Sep 2023 09:44:48 -0700 Subject: [PATCH 53/54] PR reviews --- cpp/include/cudf/interop.hpp | 8 ++++---- cpp/src/interop/from_arrow.cu | 8 +++----- cpp/src/interop/to_arrow.cu | 7 ++++--- cpp/tests/streams/interop_test.cpp | 4 ++-- 4 files changed, 13 insertions(+), 14 deletions(-) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index b1514699e21..50e1a2dbf5c 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -126,7 +126,7 @@ struct column_metadata { * * @param input table_view that needs to be converted to arrow Table * @param metadata Contains hierarchy of names of columns and children - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Table * @return arrow Table generated from `input` */ @@ -142,7 +142,7 @@ std::shared_ptr to_arrow(table_view input, * * @param input scalar that needs to be converted to arrow Scalar * @param metadata Contains hierarchy of names of columns and children - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param stream CUDA stream used for device memory operations and kernel launches * @param ar_mr arrow memory pool to allocate memory for arrow Scalar * @return arrow Scalar generated from `input` */ @@ -154,7 +154,7 @@ std::shared_ptr to_arrow(cudf::scalar const& input, * @brief Create `cudf::table` from given arrow Table input * * @param input arrow:Table that needs to be converted to `cudf::table` - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::table` * @return cudf table generated from given arrow Table */ @@ -168,7 +168,7 @@ std::unique_ptr
from_arrow( * @brief Create `cudf::scalar` from given arrow Scalar input * * @param input arrow:Scalar that needs to be converted to `cudf::scalar` - * @param stream CUDA stream used for device memory operations and kernel launches. + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::scalar` * @return cudf scalar generated from given arrow Scalar */ diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index aa7a70719a6..e39625c92e7 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -444,8 +444,8 @@ std::shared_ptr make_builder(std::shared_ptr> field_builders; - for (auto i = 0; i < type->num_fields(); ++i) { - auto const vt = type->field(i)->type(); + for (auto field : type->fields()) { + auto const vt = field->type(); if (vt->id() == arrow::Type::STRUCT || vt->id() == arrow::Type::LIST) { field_builders.push_back(make_builder(vt)); } else { @@ -535,9 +535,7 @@ std::unique_ptr from_arrow(arrow::Scalar const& input, auto cudf_table = detail::from_arrow(*table, stream, mr); - auto& col = cudf_table->get_column(0); - - auto cv = col.view(); + auto cv = cudf_table->view().column(0); return get_element(cv, 0, stream); } diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index 66c4bd2cb6c..0cd750bc947 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -22,11 +22,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -81,7 +79,10 @@ std::shared_ptr fetch_mask_buffer(column_view input_view, auto mask_buffer = allocate_arrow_bitmap(static_cast(input_view.size()), ar_mr); CUDF_CUDA_TRY(cudaMemcpyAsync( mask_buffer->mutable_data(), - (input_view.offset() > 0) ? cudf::copy_bitmask(input_view).data() : input_view.null_mask(), + (input_view.offset() > 0) + ? cudf::detail::copy_bitmask(input_view, stream, rmm::mr::get_current_device_resource()) + .data() + : input_view.null_mask(), mask_size_in_bytes, cudaMemcpyDefault, stream.value())); diff --git a/cpp/tests/streams/interop_test.cpp b/cpp/tests/streams/interop_test.cpp index 5235714c342..7eac9e016eb 100644 --- a/cpp/tests/streams/interop_test.cpp +++ b/cpp/tests/streams/interop_test.cpp @@ -31,7 +31,7 @@ TEST_F(ArrowTest, ToArrow) cudf::table_view tbl{{col}}; std::vector metadata{{""}}; - auto arrow_scalar = cudf::to_arrow(tbl, metadata, cudf::test::get_default_stream()); + cudf::to_arrow(tbl, metadata, cudf::test::get_default_stream()); } TEST_F(ArrowTest, FromArrow) @@ -57,7 +57,7 @@ TEST_F(ArrowTest, ToArrowScalar) cudf::make_fixed_width_scalar(value, cudf::test::get_default_stream()); cudf::column_metadata metadata{""}; - auto arrow_scalar = cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream()); + cudf::to_arrow(*cudf_scalar, metadata, cudf::test::get_default_stream()); } TEST_F(ArrowTest, FromArrowScalar) From 637e0a43d2d311ae96ac174a3e2d35099011fefb Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Fri, 22 Sep 2023 16:17:35 -0700 Subject: [PATCH 54/54] Update cpp/include/cudf/interop.hpp Co-authored-by: Bradley Dice --- cpp/include/cudf/interop.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp index 50e1a2dbf5c..865cc004107 100644 --- a/cpp/include/cudf/interop.hpp +++ b/cpp/include/cudf/interop.hpp @@ -167,7 +167,7 @@ std::unique_ptr
from_arrow( /** * @brief Create `cudf::scalar` from given arrow Scalar input * - * @param input arrow:Scalar that needs to be converted to `cudf::scalar` + * @param input `arrow::Scalar` that needs to be converted to `cudf::scalar` * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate `cudf::scalar` * @return cudf scalar generated from given arrow Scalar