Skip to content

Commit

Permalink
GH-44651: [Python] Allow from_buffers to work with StringView on Pyth…
Browse files Browse the repository at this point in the history
…on (#44701)

### Rationale for this change

Currently `from_buffers` is not working with StringView on Python because we validate against num_buffers. This only take into account the mandatory buffers but does not take into account the variadic_spec that can be present for both string_view and binary_view

### What changes are included in this PR?

Take into account whether the type contains a variadic_spec for the non-mandatory buffers and only check lower_bound number of buffers.

### Are these changes tested?

Yes, I've added a couple of tests.

### Are there any user-facing changes?

We are exposing a new method on the Python DataType. `has_variadic_buffers` which tells us whether the number of buffers expected is only lower-bounded by num_buffers.
* GitHub Issue: #44651

Authored-by: Raúl Cumplido <[email protected]>
Signed-off-by: Raúl Cumplido <[email protected]>
  • Loading branch information
raulcd authored Nov 18, 2024
1 parent 4c2aef7 commit aa26f28
Show file tree
Hide file tree
Showing 5 changed files with 57 additions and 1 deletion.
7 changes: 6 additions & 1 deletion python/pyarrow/array.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -1174,7 +1174,12 @@ cdef class Array(_PandasConvertible):
"({0}) did not match the passed number "
"({1}).".format(type.num_fields, len(children)))

if type.num_buffers != len(buffers):
if type.has_variadic_buffers:
if type.num_buffers > len(buffers):
raise ValueError("Type's expected number of buffers is at least "
"{0}, but the passed number is "
"{1}.".format(type.num_buffers, len(buffers)))
elif type.num_buffers != len(buffers):
raise ValueError("Type's expected number of buffers "
"({0}) did not match the passed number "
"({1}).".format(type.num_buffers, len(buffers)))
Expand Down
1 change: 1 addition & 0 deletions python/pyarrow/includes/libarrow.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -158,6 +158,7 @@ cdef extern from "arrow/api.h" namespace "arrow" nogil:

cdef cppclass CDataTypeLayout" arrow::DataTypeLayout":
vector[CBufferSpec] buffers
optional[CBufferSpec] variadic_spec
c_bool has_dictionary

cdef cppclass CDataType" arrow::DataType":
Expand Down
26 changes: 26 additions & 0 deletions python/pyarrow/tests/test_array.py
Original file line number Diff line number Diff line change
Expand Up @@ -651,6 +651,32 @@ def test_string_binary_from_buffers():
assert copied.null_count == 0


def test_string_view_from_buffers():
array = pa.array(
[
"String longer than 12 characters",
None,
"short",
"Length is 12"
], type=pa.string_view())

buffers = array.buffers()
copied = pa.StringViewArray.from_buffers(
pa.string_view(), len(array), buffers)
copied.validate(full=True)
assert copied.to_pylist() == [
"String longer than 12 characters",
None,
"short",
"Length is 12"
]

match = r"number of buffers is at least 2"
with pytest.raises(ValueError, match=match):
pa.StringViewArray.from_buffers(
pa.string_view(), len(array), buffers[0:1])


@pytest.mark.parametrize('list_type_factory', [
pa.list_, pa.large_list, pa.list_view, pa.large_list_view])
def test_list_from_buffers(list_type_factory):
Expand Down
8 changes: 8 additions & 0 deletions python/pyarrow/tests/test_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -887,6 +887,14 @@ def test_types_weakref():
assert wr() is None # not a singleton


def test_types_has_variadic_buffers():
for ty in get_many_types():
if ty in (pa.string_view(), pa.binary_view()):
assert ty.has_variadic_buffers
else:
assert not ty.has_variadic_buffers


def test_fields_hashable():
in_dict = {}
fields = [pa.field('a', pa.int32()),
Expand Down
16 changes: 16 additions & 0 deletions python/pyarrow/types.pxi
Original file line number Diff line number Diff line change
Expand Up @@ -326,6 +326,22 @@ cdef class DataType(_Weakrefable):
"""
return self.type.layout().buffers.size()

@property
def has_variadic_buffers(self):
"""
If True, the number of expected buffers is only
lower-bounded by num_buffers.
Examples
--------
>>> import pyarrow as pa
>>> pa.int64().has_variadic_buffers
False
>>> pa.string_view().has_variadic_buffers
True
"""
return self.type.layout().variadic_spec.has_value()

def __str__(self):
return frombytes(self.type.ToString(), safe=True)

Expand Down

0 comments on commit aa26f28

Please sign in to comment.