Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

GH-37484: [Python] Add a FixedSizeTensorScalar class #37533

Merged
merged 45 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
45 commits
Select commit Hold shift + click to select a range
ac59961
Add FixedShapeTensorScalarType
rok Sep 4, 2023
c4bb734
Update python/pyarrow/array.pxi
rok Sep 24, 2023
3b53b45
Apply suggestions from code review
rok Sep 24, 2023
ffe044e
Use GetTensor to get numpy values.
rok Sep 24, 2023
31ab786
Document get_tensor
rok Oct 29, 2023
0a915ad
Minor doc change
rok Oct 30, 2023
68c1684
Review feedback
rok Dec 2, 2023
4a98c8d
Docs
rok Dec 2, 2023
8f95171
linter
rok Dec 2, 2023
d30947c
from_numpy should accept first-row-major
rok Dec 10, 2023
a3cb910
some fixes, more tests
rok Dec 11, 2023
6d31d6c
add array.ToTensor test, simplify logic
rok Dec 12, 2023
7a1a641
Review feedback
rok Dec 12, 2023
18146b2
Review feedback
rok Dec 12, 2023
345ea09
Apply suggestions from code review
rok Dec 12, 2023
078f135
Review feedback
rok Dec 12, 2023
c4ad7fe
add check and test for bad scalar values
rok Dec 13, 2023
f49476a
split out test
rok Dec 13, 2023
0c27892
Minor issue
rok Dec 13, 2023
eda5dde
minor
rok Dec 13, 2023
514a95d
linter
rok Dec 13, 2023
17f9330
change slicing
rok Dec 13, 2023
a5d5d6a
permutation test
rok Dec 13, 2023
8b0b0fa
Review feedback
rok Dec 14, 2023
bf34367
Apply suggestions from code review
rok Dec 22, 2023
451f759
Review feedback
rok Dec 22, 2023
2654829
Review feedback
rok Dec 22, 2023
fcef6e5
Better permutation checks and test
rok Dec 22, 2023
06a81c0
Review feedback
rok Dec 22, 2023
6b49e25
Review feedback
rok Dec 22, 2023
76874e5
Review feedback
rok Dec 22, 2023
14e2e09
Review feedback
rok Dec 23, 2023
00c5d47
Review feedback
rok Dec 23, 2023
6a40b4b
Rename cell->element
rok Dec 23, 2023
a7f9b31
test
rok Dec 23, 2023
da35639
work
rok Jan 29, 2024
977a92b
Update cpp/src/arrow/extension/fixed_shape_tensor_test.cc
rok Jan 29, 2024
cc1ba4c
review feedback
rok Jan 30, 2024
92e3b55
Review feedback
rok Feb 2, 2024
3593031
Apply suggestions from code review
rok Feb 7, 2024
bfdeafd
lint
rok Feb 7, 2024
83d2934
changes to from_numpy_ndarray
rok Feb 7, 2024
e1a1d28
change to checked casts
rok Feb 7, 2024
052eec2
cast to ExtensionArray instead
rok Feb 8, 2024
bf2ca0e
Nit: remove extraneous empty line.
pitrou Feb 8, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
130 changes: 96 additions & 34 deletions cpp/src/arrow/extension/fixed_shape_tensor.cc
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@
#include <sstream>

#include "arrow/extension/fixed_shape_tensor.h"
#include "arrow/extension/tensor_internal.h"
#include "arrow/scalar.h"

#include "arrow/array/array_nested.h"
#include "arrow/array/array_primitive.h"
Expand Down Expand Up @@ -86,7 +88,7 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const {
if (extension_name() != other.extension_name()) {
return false;
}
const auto& other_ext = static_cast<const FixedShapeTensorType&>(other);
const auto& other_ext = internal::checked_cast<const FixedShapeTensorType&>(other);

auto is_permutation_trivial = [](const std::vector<int64_t>& permutation) {
for (size_t i = 1; i < permutation.size(); ++i) {
Expand Down Expand Up @@ -143,7 +145,7 @@ std::string FixedShapeTensorType::Serialize() const {

if (!dim_names_.empty()) {
rj::Value dim_names(rj::kArrayType);
for (std::string v : dim_names_) {
for (const std::string& v : dim_names_) {
dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator);
}
document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator);
Expand Down Expand Up @@ -199,10 +201,52 @@ std::shared_ptr<Array> FixedShapeTensorType::MakeArray(
std::shared_ptr<ArrayData> data) const {
DCHECK_EQ(data->type->id(), Type::EXTENSION);
DCHECK_EQ("arrow.fixed_shape_tensor",
static_cast<const ExtensionType&>(*data->type).extension_name());
internal::checked_cast<const ExtensionType&>(*data->type).extension_name());
return std::make_shared<ExtensionArray>(data);
}

/// \brief Convert a single ExtensionScalar of a FixedShapeTensorArray into a Tensor.
///
/// The result shares the scalar's underlying value buffer (no copy); shape and
/// dim_names are permuted per the type's stored permutation, and strides are
/// derived from them. Only fixed-width value types are supported and the scalar
/// must contain no nulls.
Result<std::shared_ptr<Tensor>> FixedShapeTensorType::MakeTensor(
    const std::shared_ptr<ExtensionScalar>& scalar) {
  // `scalar` is already an ExtensionScalar; no re-cast needed. Its type carries
  // the shape/permutation/dim_names metadata we use below.
  const auto ext_type =
      internal::checked_pointer_cast<FixedShapeTensorType>(scalar->type);
  if (!is_fixed_width(*ext_type->value_type())) {
    return Status::TypeError("Cannot convert non-fixed-width values to Tensor.");
  }
  // Storage of a fixed-shape tensor scalar is a FixedSizeListScalar whose
  // `value` is the flat values array for this one element.
  const auto array =
      internal::checked_pointer_cast<const FixedSizeListScalar>(scalar->value)->value;
  if (array->null_count() > 0) {
    return Status::Invalid("Cannot convert data with nulls to Tensor.");
  }
  const auto value_type =
      internal::checked_pointer_cast<FixedWidthType>(ext_type->value_type());
  const auto byte_width = value_type->byte_width();

  // An empty permutation means identity; materialize it so Permute/ComputeStrides
  // can treat both cases uniformly.
  std::vector<int64_t> permutation = ext_type->permutation();
  if (permutation.empty()) {
    permutation.resize(ext_type->ndim());
    std::iota(permutation.begin(), permutation.end(), 0);
  }

  std::vector<int64_t> shape = ext_type->shape();
  internal::Permute<int64_t>(permutation, &shape);

  std::vector<std::string> dim_names = ext_type->dim_names();
  if (!dim_names.empty()) {
    internal::Permute<std::string>(permutation, &dim_names);
  }

  std::vector<int64_t> strides;
  RETURN_NOT_OK(ComputeStrides(*value_type, shape, permutation, &strides));
  // Respect a possible non-zero offset of the underlying values array: slice the
  // values buffer (buffers[1]) to exactly this element's bytes.
  const auto start_position = array->offset() * byte_width;
  const auto size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
                                    std::multiplies<>());
  const auto buffer =
      SliceBuffer(array->data()->buffers[1], start_position, size * byte_width);

  return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names);
}

Result<std::shared_ptr<FixedShapeTensorArray>> FixedShapeTensorArray::FromTensor(
const std::shared_ptr<Tensor>& tensor) {
auto permutation = internal::ArgSort(tensor->strides(), std::greater<>());
Expand Down Expand Up @@ -293,53 +337,71 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const {
// To convert an array of n dimensional tensors to a n+1 dimensional tensor we
// interpret the array's length as the first dimension of the new tensor.

auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage());
auto ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()),
Status::Invalid(ext_arr->value_type()->ToString(),
" is not valid data type for a tensor"));
auto permutation = ext_type->permutation();

std::vector<std::string> dim_names;
if (!ext_type->dim_names().empty()) {
for (auto i : permutation) {
dim_names.emplace_back(ext_type->dim_names()[i]);
}
dim_names.insert(dim_names.begin(), 1, "");
const auto ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
const auto value_type = ext_type->value_type();
ARROW_RETURN_IF(
!is_fixed_width(*value_type),
Status::TypeError(value_type->ToString(), " is not valid data type for a tensor"));

// ext_type->permutation() gives us permutation for a single row with values in
// range [0, ndim). Here we want to create a ndim + 1 dimensional tensor from the entire
// array and we assume the first dimension will always have the greatest stride, so it
// will get permutation index 0 and remaining values from ext_type->permutation() need
// to be shifted to fill the [1, ndim+1) range. Computed permutation will be used to
// generate the new tensor's shape, strides and dim_names.
rok marked this conversation as resolved.
Show resolved Hide resolved
std::vector<int64_t> permutation = ext_type->permutation();
if (permutation.empty()) {
permutation.resize(ext_type->ndim() + 1);
std::iota(permutation.begin(), permutation.end(), 0);
} else {
dim_names = {};
for (auto i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) {
permutation[i] += 1;
}
permutation.insert(permutation.begin(), 1, 0);
}

std::vector<int64_t> shape;
for (int64_t& i : permutation) {
shape.emplace_back(ext_type->shape()[i]);
++i;
rok marked this conversation as resolved.
Show resolved Hide resolved
std::vector<std::string> dim_names = ext_type->dim_names();
if (!dim_names.empty()) {
dim_names.insert(dim_names.begin(), 1, "");
internal::Permute<std::string>(permutation, &dim_names);
}

std::vector<int64_t> shape = ext_type->shape();
auto cell_size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
std::multiplies<>());
shape.insert(shape.begin(), 1, this->length());
permutation.insert(permutation.begin(), 1, 0);
internal::Permute<int64_t>(permutation, &shape);

std::vector<int64_t> tensor_strides;
auto value_type = internal::checked_pointer_cast<FixedWidthType>(ext_arr->value_type());
const auto fw_value_type = internal::checked_pointer_cast<FixedWidthType>(value_type);
ARROW_RETURN_NOT_OK(
ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides));
ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten());
ComputeStrides(*fw_value_type.get(), shape, permutation, &tensor_strides));

const auto raw_buffer = this->storage()->data()->child_data[0]->buffers[1];
ARROW_ASSIGN_OR_RAISE(
auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape,
tensor_strides, dim_names));
return tensor;
const auto buffer,
SliceBufferSafe(raw_buffer, this->offset() * cell_size * value_type->byte_width()));

return Tensor::Make(value_type, buffer, shape, tensor_strides, dim_names);
}

Result<std::shared_ptr<DataType>> FixedShapeTensorType::Make(
const std::shared_ptr<DataType>& value_type, const std::vector<int64_t>& shape,
const std::vector<int64_t>& permutation, const std::vector<std::string>& dim_names) {
if (!permutation.empty() && shape.size() != permutation.size()) {
return Status::Invalid("permutation size must match shape size. Expected: ",
shape.size(), " Got: ", permutation.size());
const auto ndim = shape.size();
if (!permutation.empty() && ndim != permutation.size()) {
return Status::Invalid("permutation size must match shape size. Expected: ", ndim,
" Got: ", permutation.size());
}
if (!dim_names.empty() && ndim != dim_names.size()) {
return Status::Invalid("dim_names size must match shape size. Expected: ", ndim,
" Got: ", dim_names.size());
}
if (!dim_names.empty() && shape.size() != dim_names.size()) {
return Status::Invalid("dim_names size must match shape size. Expected: ",
shape.size(), " Got: ", dim_names.size());
if (!permutation.empty()) {
RETURN_NOT_OK(internal::IsPermutationValid(permutation));
}

const auto size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
std::multiplies<>());
return std::make_shared<FixedShapeTensorType>(value_type, static_cast<int32_t>(size),
Expand Down
11 changes: 10 additions & 1 deletion cpp/src/arrow/extension/fixed_shape_tensor.h
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType {
std::string ToString() const override;

/// Number of dimensions of tensor elements
size_t ndim() { return shape_.size(); }
size_t ndim() const { return shape_.size(); }

/// Shape of tensor elements
const std::vector<int64_t> shape() const { return shape_; }
Expand Down Expand Up @@ -94,6 +94,15 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType {
/// Create a FixedShapeTensorArray from ArrayData
std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

/// \brief Create a Tensor from an ExtensionScalar from a FixedShapeTensorArray
///
/// This method will return a Tensor from ExtensionScalar with strides
/// derived from shape and permutation of FixedShapeTensorType. Shape and
/// dim_names will be permuted according to permutation stored in the
/// FixedShapeTensorType metadata.
static Result<std::shared_ptr<Tensor>> MakeTensor(
const std::shared_ptr<ExtensionScalar>& scalar);

/// \brief Create a FixedShapeTensorType instance
static Result<std::shared_ptr<DataType>> Make(
const std::shared_ptr<DataType>& value_type, const std::vector<int64_t>& shape,
Expand Down
Loading
Loading