Skip to content

Commit

Permalink
Add pylibcudf.Scalar that interoperates with Arrow scalars (#14133)
Browse files Browse the repository at this point in the history
This PR adds a new Scalar object to pylibcudf that functions as the pylibcudf equivalent of cudf::scalar. Unlike columns, which libcudf typically operates on through views rather than owning types, scalars are accepted by libcudf APIs by (const) reference to the owning type, and no corresponding view type exists. Therefore, pylibcudf.Scalar differs from pylibcudf.Column by actually owning an instance of the underlying libcudf type (cudf::scalar). pylibcudf Scalars are expected to be constructed from Arrow scalars.

This PR relies on #14124 and should not be merged until after that one.

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #14133
  • Loading branch information
vyasr authored Oct 6, 2023
1 parent fc36947 commit 96664ec
Show file tree
Hide file tree
Showing 18 changed files with 378 additions and 138 deletions.
8 changes: 6 additions & 2 deletions python/cudf/cudf/_lib/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,12 @@ if(${PYARROW_RESULT})
message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}")
endif()

set(targets_using_arrow_headers interop avro csv orc json parquet)
foreach(target IN LISTS targets_using_arrow_headers)
# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers.
# These requirements will go away once all scalar-related Cython code is removed from cudf.
foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
endforeach()

Expand Down
6 changes: 4 additions & 2 deletions python/cudf/cudf/_lib/datetime.pyx
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

from cudf.core.buffer import acquire_spill_lock

Expand All @@ -10,6 +10,7 @@ from cudf._lib.column cimport Column
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.filling cimport calendrical_month_sequence
from cudf._lib.cpp.scalar.scalar cimport scalar
from cudf._lib.cpp.types cimport size_type
from cudf._lib.scalar cimport DeviceScalar

Expand Down Expand Up @@ -166,10 +167,11 @@ def date_range(DeviceScalar start, size_type n, offset):
+ offset.kwds.get("months", 0)
)

cdef const scalar* c_start = start.c_value.get()
with nogil:
c_result = move(calendrical_month_sequence(
n,
start.c_value.get()[0],
c_start[0],
months
))
return Column.from_unique_ptr(move(c_result))
Expand Down
95 changes: 1 addition & 94 deletions python/cudf/cudf/_lib/interop.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -4,14 +4,7 @@ from cpython cimport pycapsule
from libcpp.memory cimport shared_ptr, unique_ptr
from libcpp.utility cimport move
from libcpp.vector cimport vector
from pyarrow.lib cimport (
CScalar,
CTable,
pyarrow_unwrap_scalar,
pyarrow_unwrap_table,
pyarrow_wrap_scalar,
pyarrow_wrap_table,
)
from pyarrow.lib cimport CTable, pyarrow_unwrap_table, pyarrow_wrap_table

from cudf._lib.cpp.interop cimport (
DLManagedTensor,
Expand All @@ -21,22 +14,12 @@ from cudf._lib.cpp.interop cimport (
to_arrow as cpp_to_arrow,
to_dlpack as cpp_to_dlpack,
)
from cudf._lib.cpp.scalar.scalar cimport fixed_point_scalar, scalar
from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport type_id
from cudf._lib.cpp.wrappers.decimals cimport (
decimal32,
decimal64,
decimal128,
scale_type,
)
from cudf._lib.scalar cimport DeviceScalar
from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns

from cudf.api.types import is_list_dtype, is_struct_dtype
from cudf.core.buffer import acquire_spill_lock
from cudf.core.dtypes import Decimal32Dtype, Decimal64Dtype


def from_dlpack(dlpack_capsule):
Expand Down Expand Up @@ -199,79 +182,3 @@ def from_arrow(object input_table):
c_result = move(cpp_from_arrow(cpp_arrow_table.get()[0]))

return columns_from_unique_ptr(move(c_result))


@acquire_spill_lock()
def to_arrow_scalar(DeviceScalar source_scalar):
    """Convert a cudf device scalar into a PyArrow scalar.

    Parameters
    ----------
    source_scalar : DeviceScalar
        The scalar to convert.

    Returns
    -------
    pyarrow.lib.Scalar
    """
    # A scalar has no name, so the metadata is built from a single
    # (empty name, dtype) pair and only the first entry is used below.
    cdef vector[column_metadata] metadata = gather_metadata(
        [("", source_scalar.dtype)]
    )
    cdef const scalar* raw_scalar = source_scalar.get_raw_ptr()
    cdef shared_ptr[CScalar] arrow_result

    # Only the C++ conversion runs without the GIL; everything touching
    # Python objects has already happened above.
    with nogil:
        arrow_result = cpp_to_arrow(raw_scalar[0], metadata[0])

    return pyarrow_wrap_scalar(arrow_result)


@acquire_spill_lock()
def from_arrow_scalar(object input_scalar, output_dtype=None):
    """Build a cudf device scalar from a PyArrow scalar.

    Parameters
    ----------
    input_scalar : pyarrow.lib.Scalar
        The Arrow scalar to convert.
    output_dtype : optional
        Forwarded to ``DeviceScalar.from_unique_ptr``. For decimal inputs it
        is required and selects the decimal width of the result.

    Returns
    -------
    cudf._lib.DeviceScalar

    Raises
    ------
    ValueError
        If the converted scalar is a decimal and ``output_dtype`` is None.
    """
    cdef shared_ptr[CScalar] arrow_scalar = (
        pyarrow_unwrap_scalar(input_scalar)
    )
    cdef unique_ptr[scalar] converted
    cdef type_id converted_id

    with nogil:
        converted = move(cpp_from_arrow(arrow_scalar.get()[0]))

    converted_id = converted.get().type().id()
    if converted_id == type_id.DECIMAL128:
        # The conversion produced a DECIMAL128 scalar; it has to be narrowed
        # to the width requested through output_dtype.
        if output_dtype is None:
            raise ValueError(
                "Decimal scalars must be constructed with a dtype"
            )

        # In both narrowing branches the constructor arguments are read from
        # the existing scalar before reset() replaces (and frees) it.
        if isinstance(output_dtype, Decimal32Dtype):
            converted.reset(
                new fixed_point_scalar[decimal32](
                    (<fixed_point_scalar[decimal128]*> converted.get()).value(),
                    scale_type(-input_scalar.type.scale),
                    converted.get().is_valid()
                )
            )
        elif isinstance(output_dtype, Decimal64Dtype):
            converted.reset(
                new fixed_point_scalar[decimal64](
                    (<fixed_point_scalar[decimal128]*> converted.get()).value(),
                    scale_type(-input_scalar.type.scale),
                    converted.get().is_valid()
                )
            )
        # Any other dtype (e.g. Decimal128Dtype) keeps the scalar unchanged.

    return DeviceScalar.from_unique_ptr(move(converted), output_dtype)
8 changes: 8 additions & 0 deletions python/cudf/cudf/_lib/nvtext/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -22,3 +22,11 @@ rapids_cython_create_modules(
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX nvtext_ ASSOCIATED_TARGETS cudf
)
# TODO: cudf's scalar.pyx cimports pylibcudf's scalar.pyx (some cudf Cython code needs direct access
# to the c_obj held by the pylibcudf Scalar), so every cudf Cython target ends up needing the Arrow
# headers, and through them the NumPy headers. Drop these include directories once all
# scalar-related Cython code has been removed from cudf.
foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
  target_include_directories(
    ${target} PRIVATE "${NumPy_INCLUDE_DIRS}" "${PYARROW_INCLUDE_DIR}"
  )
endforeach()
25 changes: 24 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,33 @@
# the License.
# =============================================================================

set(cython_sources column.pyx copying.pyx gpumemoryview.pyx table.pyx types.pyx utils.pyx)
set(cython_sources column.pyx copying.pyx gpumemoryview.pyx interop.pyx scalar.pyx table.pyx
types.pyx utils.pyx
)
set(linked_libraries cudf::cudf)
rapids_cython_create_modules(
CXX
SOURCE_FILES "${cython_sources}"
LINKED_LIBRARIES "${linked_libraries}" MODULE_PREFIX pylibcudf_ ASSOCIATED_TARGETS cudf
)

find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)

# Ask the pyarrow installation visible to the build interpreter where its headers live. The exit
# status and stderr are captured so that a missing or broken pyarrow fails the configure step with
# a clear message instead of silently producing an empty include directory.
execute_process(
  COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())"
  OUTPUT_VARIABLE PYARROW_INCLUDE_DIR
  ERROR_VARIABLE PYARROW_ERROR
  RESULT_VARIABLE PYARROW_RESULT
  OUTPUT_STRIP_TRAILING_WHITESPACE
)

# RESULT_VARIABLE is 0 on success; anything else (non-zero exit code or an error string) is fatal.
if(NOT PYARROW_RESULT EQUAL 0)
  message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}")
endif()

foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
endforeach()

# TODO: Clean up this include when switching to scikit-build-core. See cudf/_lib/CMakeLists.txt for
# more info
find_package(NumPy REQUIRED)
foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
  # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24.
  # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
endforeach()
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.pxd
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

# TODO: Verify consistent usage of relative/absolute imports in pylibcudf.
from . cimport copying
from . cimport copying, interop
from .column cimport Column
from .gpumemoryview cimport gpumemoryview
from .scalar cimport Scalar
from .table cimport Table
# TODO: cimport type_id once
# https://github.com/cython/cython/issues/5609 is resolved
Expand All @@ -12,7 +13,9 @@ from .types cimport DataType
# Names exposed by the package-level .pxd; these mirror the cimports above.
# NOTE(review): the type_id enum is not listed here, presumably because of the
# cimport TODO above -- confirm before adding it.
__all__ = [
    "Column",
    "DataType",
    "Scalar",
    "Table",
    "copying",
    "gpumemoryview",
    "interop",
]
5 changes: 4 additions & 1 deletion python/cudf/cudf/_lib/pylibcudf/__init__.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,19 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from . import copying
from . import copying, interop
from .column import Column
from .gpumemoryview import gpumemoryview
from .scalar import Scalar
from .table import Table
from .types import DataType, TypeId

# Public API of the pylibcudf package: the classes and submodules imported
# above. Keep this list in sync with those imports.
__all__ = [
    "Column",
    "DataType",
    "Scalar",
    "Table",
    "TypeId",
    "copying",
    "gpumemoryview",
    "interop",
]
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/interop.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from cudf._lib.cpp.interop cimport column_metadata


# Python-visible counterpart of libcudf's column_metadata.
cdef class ColumnMetadata:
    # Name of the column; encoded to bytes when converted to libcudf.
    cdef public object name
    # Child ColumnMetadata objects, converted recursively by to_libcudf.
    cdef public object children_meta
    # Build the equivalent C++ column_metadata from the current attributes.
    cdef column_metadata to_libcudf(self)
23 changes: 23 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/interop.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from cudf._lib.cpp.interop cimport column_metadata


cdef class ColumnMetadata:
    """Python-side description of a column's name and its children's metadata.

    Instances start with an empty ``children_meta`` list; callers append
    further ``ColumnMetadata`` objects to describe nested children.
    """

    def __init__(self, name):
        self.children_meta = []
        self.name = name

    cdef column_metadata to_libcudf(self):
        """Build the equivalent C++ ``column_metadata``.

        The class is mutable and cheap to convert, so the C++ object is
        created on demand here instead of serving as the backing storage
        of the Cython class.
        """
        cdef column_metadata result
        cdef ColumnMetadata child

        result.name = self.name.encode()
        # Children are converted recursively, preserving their order.
        for child in self.children_meta:
            result.children_meta.push_back(child.to_libcudf())
        return result
32 changes: 32 additions & 0 deletions python/cudf/cudf/_lib/pylibcudf/scalar.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# Copyright (c) 2023, NVIDIA CORPORATION.

from libcpp cimport bool
from libcpp.memory cimport unique_ptr
from pyarrow cimport lib as pa

from rmm._lib.memory_resource cimport DeviceMemoryResource

from cudf._lib.cpp.scalar.scalar cimport scalar

from .interop cimport ColumnMetadata
from .types cimport DataType


# Owning wrapper around a libcudf scalar (pylibcudf equivalent of cudf::scalar).
cdef class Scalar:
    # The owned libcudf scalar.
    cdef unique_ptr[scalar] c_obj
    # NOTE(review): presumably the cached DataType returned by type() -- confirm
    # against scalar.pyx.
    cdef DataType _data_type

    # Holds a reference to the DeviceMemoryResource used for allocation.
    # Ensures the MR does not get destroyed before this Scalar. `mr` is
    # needed for deallocation
    cdef DeviceMemoryResource mr

    # Returns a const pointer to a libcudf scalar (presumably the one held in
    # c_obj). `except *` makes Cython check for a raised Python exception
    # after every call.
    cdef const scalar* get(self) except *

    cpdef DataType type(self)
    cpdef bool is_valid(self)

    # Alternate constructor taking ownership of an existing libcudf scalar;
    # the default for `dtype` is defined in the implementation file.
    @staticmethod
    cdef Scalar from_libcudf(unique_ptr[scalar] libcudf_scalar, dtype=*)

    # Converts to a pyarrow Scalar using the supplied column metadata.
    cpdef pa.Scalar to_arrow(self, ColumnMetadata metadata)
Loading

0 comments on commit 96664ec

Please sign in to comment.