Skip to content

Commit

Permalink
feat: Add to_cudf (scikit-hep#3051)
Browse files Browse the repository at this point in the history
* Start

* style: pre-commit fixes

* results of chatting

* style: pre-commit fixes

* Add toplevel func

* fix

* fix in indexedoptionarray

* Add tests and fixes

* style: pre-commit fixes

* use direct np module (for now)

* style: pre-commit fixes

* Don't accidentally iterate cupy with CPU

* style: pre-commit fixes

* Update src/awkward/_errors.py

* add docstring

* Add string

* style: pre-commit fixes

* Simpler for numerics

This works also for time types, without going via cupy

---------

Co-authored-by: Martin Durant <[email protected]>
Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
  • Loading branch information
3 people authored Sep 13, 2024
1 parent 1420121 commit 1c368f1
Show file tree
Hide file tree
Showing 15 changed files with 210 additions and 1 deletion.
2 changes: 1 addition & 1 deletion awkward-cpp/rapidjson
19 changes: 19 additions & 0 deletions src/awkward/contents/bitmaskedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from awkward._backends.backend import Backend
from awkward._meta.bitmaskedmeta import BitMaskedMeta
from awkward._nplikes.array_like import ArrayLike
from awkward._nplikes.cupy import Cupy
from awkward._nplikes.numpy import Numpy
from awkward._nplikes.numpy_like import IndexType, NumpyMetadata
from awkward._nplikes.placeholder import PlaceholderArray
Expand Down Expand Up @@ -687,6 +688,24 @@ def _to_arrow(
pyarrow, mask_node, validbytes, length, options
)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
cp = Cupy.instance()._module

assert mask is None # this class has its own mask
if not self.lsb_order:
m = cp.flip(
cp.packbits(cp.flip(cp.unpackbits(cp.asarray(self._mask.data))))
)
else:
m = self._mask.data

if m.nbytes % 64:
m = cp.resize(m, ((m.nbytes // 64) + 1) * 64)
m = cudf.core.buffer.as_buffer(m)
inner = self._content._to_cudf(cudf, mask=None, length=length)
inner.set_base_mask(m)
return inner

def _to_backend_array(self, allow_missing, backend):
return self.to_ByteMaskedArray()._to_backend_array(allow_missing, backend)

Expand Down
13 changes: 13 additions & 0 deletions src/awkward/contents/bytemaskedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
from awkward._layout import maybe_posaxis
from awkward._meta.bytemaskedmeta import ByteMaskedMeta
from awkward._nplikes.array_like import ArrayLike
from awkward._nplikes.cupy import Cupy
from awkward._nplikes.numpy import Numpy
from awkward._nplikes.numpy_like import IndexType, NumpyMetadata
from awkward._nplikes.placeholder import PlaceholderArray
Expand Down Expand Up @@ -1051,6 +1052,18 @@ def _to_arrow(
options,
)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
cp = Cupy.instance()._module

assert mask is None # this class has its own mask
m = cp.packbits(cp.asarray(self._mask), bitorder="little")
if m.nbytes % 64:
m = cp.resize(m, ((m.nbytes // 64) + 1) * 64)
m = cudf.core.buffer.as_buffer(m)
inner = self._content._to_cudf(cudf, mask=None, length=length)
inner.set_base_mask(m)
return inner

def _to_backend_array(self, allow_missing, backend):
return self.to_IndexedOptionArray64()._to_backend_array(allow_missing, backend)

Expand Down
4 changes: 4 additions & 0 deletions src/awkward/contents/content.py
Original file line number Diff line number Diff line change
Expand Up @@ -1010,6 +1010,10 @@ def _to_arrow(
):
raise NotImplementedError

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
# prototype abstract signature
raise NotImplementedError

def to_backend_array(
self, allow_missing: bool = True, *, backend: Backend | str | None = None
):
Expand Down
9 changes: 9 additions & 0 deletions src/awkward/contents/emptyarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -387,6 +387,15 @@ def _to_arrow(
)
return next._to_arrow(pyarrow, mask_node, validbytes, length, options)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
dtype = np.dtype("float64")
next = ak.contents.NumpyArray(
numpy.empty(length, dtype=dtype),
parameters=self._parameters,
backend=self._backend,
)
return next._to_cudf(cudf, None, 0)

@classmethod
def _arrow_needs_option_type(cls):
return True # This overrides Content._arrow_needs_option_type
Expand Down
10 changes: 10 additions & 0 deletions src/awkward/contents/indexedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1049,6 +1049,16 @@ def _to_arrow(
)
return next2._to_arrow(pyarrow, mask_node, validbytes, length, options)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
if self._content.length == 0:
# IndexedOptionArray._to_arrow replaces -1 in the index with 0. So behind
# every masked value is self._content[0], unless self._content.length == 0.
# In that case, don't call self._content[index]; it's empty anyway.
next = self._content
else:
next = self._content._carry(self._index, False)
return next._to_cudf(cudf, None, len(next))

def _to_backend_array(self, allow_missing, backend):
return self.project()._to_backend_array(allow_missing, backend)

Expand Down
3 changes: 3 additions & 0 deletions src/awkward/contents/indexedoptionarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1576,6 +1576,9 @@ def _to_arrow(
options,
)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
return self.to_ByteMaskedArray(True)._to_cudf(cudf, mask, length)

def _to_backend_array(self, allow_missing, backend):
nplike = backend.nplike
index_nplike = backend.index_nplike
Expand Down
3 changes: 3 additions & 0 deletions src/awkward/contents/listarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1498,6 +1498,9 @@ def _to_arrow(
pyarrow, mask_node, validbytes, length, options
)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
return self.to_ListOffsetArray64(False)._to_cudf(cudf, mask, length)

def _to_backend_array(self, allow_missing, backend):
array_param = self.parameter("__array__")
if array_param in {"bytestring", "string"}:
Expand Down
34 changes: 34 additions & 0 deletions src/awkward/contents/listoffsetarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from awkward._layout import maybe_posaxis
from awkward._meta.listoffsetmeta import ListOffsetMeta
from awkward._nplikes.array_like import ArrayLike
from awkward._nplikes.cupy import Cupy
from awkward._nplikes.numpy import Numpy
from awkward._nplikes.numpy_like import IndexType, NumpyMetadata
from awkward._nplikes.placeholder import PlaceholderArray
Expand Down Expand Up @@ -1999,6 +2000,39 @@ def _to_arrow(
),
)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
cupy = Cupy.instance()
index = self._offsets.raw(cupy).astype("int32")
buf = cudf.core.buffer.as_buffer(index)
ind_buf = cudf.core.column.numerical.NumericalColumn(
buf, index.dtype, None, size=len(index)
)
cont = self._content._to_cudf(cudf, None, len(self._content))
if mask is not None:
m = np._module.packbits(mask, bitorder="little")
if m.nbytes % 64:
m = cupy.resize(m, ((m.nbytes // 64) + 1) * 64)
m = cudf.core.buffer.as_buffer(cupy.asarray(m))
else:
m = None
if self.parameters.get("__array__") == "string":
from cudf.core.column.string import StringColumn

data = cudf.core.buffer.as_buffer(cupy.asarray(self._content.data))
# docs for StringColumn says there should be two children instead of a data=
return StringColumn(
data=data,
children=(ind_buf,),
mask=m,
)

return cudf.core.column.lists.ListColumn(
length,
mask=m,
children=(ind_buf, cont),
dtype=cudf.core.dtypes.ListDtype(cont.dtype),
)

def _to_backend_array(self, allow_missing, backend):
array_param = self.parameter("__array__")
if array_param == "string":
Expand Down
15 changes: 15 additions & 0 deletions src/awkward/contents/numpyarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@
from awkward._meta.numpymeta import NumpyMeta
from awkward._nplikes import to_nplike
from awkward._nplikes.array_like import ArrayLike
from awkward._nplikes.cupy import Cupy
from awkward._nplikes.jax import Jax
from awkward._nplikes.numpy import Numpy
from awkward._nplikes.numpy_like import IndexType, NumpyMetadata
Expand Down Expand Up @@ -1220,6 +1221,20 @@ def _to_arrow(
),
)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
cupy = Cupy.instance()
from cudf.core.column.column import as_column

assert self._backend.nplike.known_data
data = as_column(self._data)
if mask is not None:
m = cupy.packbits(cupy.asarray(mask), bitorder="little")
if m.nbytes % 64:
m = cupy.resize(m, ((m.nbytes // 64) + 1) * 64)
m = cudf.core.buffer.as_buffer(m)
data.set_base_data(m)
return data

def _to_backend_array(self, allow_missing, backend):
return to_nplike(self.data, backend.nplike, from_nplike=self._backend.nplike)

Expand Down
17 changes: 17 additions & 0 deletions src/awkward/contents/recordarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -1101,6 +1101,23 @@ def _to_arrow(
children=values,
)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
children = tuple(
c._to_cudf(cudf, mask=None, length=length) for c in self.contents
)
dt = cudf.core.dtypes.StructDtype(
{field: c.dtype for field, c in zip(self.fields, children)}
)
m = mask._to_cudf(cudf, None, length) if mask else None
return cudf.core.column.struct.StructColumn(
data=None,
children=children,
dtype=dt,
mask=m,
size=length,
offset=0,
)

def _to_backend_array(self, allow_missing, backend):
if self.fields is None:
return backend.nplike.empty(self.length, dtype=[])
Expand Down
3 changes: 3 additions & 0 deletions src/awkward/contents/unmaskedarray.py
Original file line number Diff line number Diff line change
Expand Up @@ -498,6 +498,9 @@ def _to_arrow(
):
return self._content._to_arrow(pyarrow, self, None, length, options)

def _to_cudf(self, cudf: Any, mask: Content | None, length: int):
return self._content._to_cudf(cudf, mask, length)

def _to_backend_array(self, allow_missing, backend):
content = self.content._to_backend_array(allow_missing, backend)
if allow_missing:
Expand Down
1 change: 1 addition & 0 deletions src/awkward/operations/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@
from awkward.operations.ak_to_arrow_table import *
from awkward.operations.ak_to_backend import *
from awkward.operations.ak_to_buffers import *
from awkward.operations.ak_to_cudf import *
from awkward.operations.ak_to_cupy import *
from awkward.operations.ak_to_dataframe import *
from awkward.operations.ak_to_feather import *
Expand Down
21 changes: 21 additions & 0 deletions src/awkward/operations/ak_to_cudf.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# BSD 3-Clause License; see https://github.com/scikit-hep/awkward/blob/main/LICENSE
from __future__ import annotations

import awkward as ak
from awkward._dispatch import high_level_function

__all__ = ("to_cudf",)


@high_level_function()
def to_cudf(
array: ak.Array,
):
"""Create a cuDF.Series out of the given ak array
Buffers that are not already in GPU memory will be transferred, and some
structural reformatting may happen to account for differences in architecture.
"""
import cudf

return cudf.Series(array.layout._to_cudf(cudf, None, len(array)))
57 changes: 57 additions & 0 deletions tests-cuda/test_3051_to_cuda.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
from __future__ import annotations

import pytest

import awkward as ak

cudf = pytest.importorskip("cudf")
cupy = pytest.importorskip("cupy")


def test_jagged():
arr = ak.Array([[[1, 2, 3], [], [3, 4]], []])
out = ak.to_cudf(arr)
assert isinstance(out, cudf.Series)
assert out.to_arrow().tolist() == [[[1, 2, 3], [], [3, 4]], []]


def test_nested():
arr = ak.Array(
[{"a": 0, "b": 1.0, "c": {"d": 0}}, {"a": 1, "b": 0.0, "c": {"d": 1}}]
)
out = ak.to_cudf(arr)
assert isinstance(out, cudf.Series)
assert out.to_arrow().tolist() == [
{"a": 0, "b": 1.0, "c": {"d": 0}},
{"a": 1, "b": 0.0, "c": {"d": 1}},
]


def test_null():
arr = ak.Array([12, None, 21, 12])
# calls ByteMaskedArray._to_cudf not NumpyArray
out = ak.to_cudf(arr)
assert isinstance(out, cudf.Series)
assert out.to_arrow().tolist() == [12, None, 21, 12]

# True is valid, LSB order
arr2 = ak.Array(arr.layout.to_BitMaskedArray(True, True))
out = ak.to_cudf(arr2)
assert isinstance(out, cudf.Series)
assert out.to_arrow().tolist() == [12, None, 21, 12]

# reversed LSB (should be rare, involves extra work!)
arr3 = ak.Array(arr.layout.to_BitMaskedArray(True, False))
out = ak.to_cudf(arr3)
assert isinstance(out, cudf.Series)
assert out.to_arrow().tolist() == [12, None, 21, 12]


def test_strings():
arr = ak.Array(["hey", "hi", "hum"])
out = ak.to_cudf(arr)
assert out.to_arrow().tolist() == ["hey", "hi", "hum"]

arr = ak.Array(["hey", "hi", None, "hum"])
out = ak.to_cudf(arr)
assert out.to_arrow().tolist() == ["hey", "hi", None, "hum"]

0 comments on commit 1c368f1

Please sign in to comment.