apacheGH-37484: [Python] Add a FixedSizeTensorScalar class (apache#37533)

### Rationale for this change

When working with `FixedSizeTensorArray` we want to access individual tensors. This would be enabled by adding:
```python
class FixedSizeTensorScalar(pa.ExtensionScalar):
    def to_numpy_ndarray(self): ...
```

See apache#37484.

### What changes are included in this PR?

This adds `FixedSizeTensorScalar` and tests for it.

### Are there any user-facing changes?

Yes. When indexing a `FixedSizeTensorArray` (e.g. `FixedSizeTensorArray[i]`), we now get back a `FixedSizeTensorScalar` instead of a plain `ExtensionScalar`.
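
For illustration, here is a minimal usage sketch of this behaviour (assuming a PyArrow build that includes this change; `pa.fixed_shape_tensor` and `pa.ExtensionArray.from_storage` are existing API, while the `to_numpy_ndarray` method name simply follows the sketch in the rationale above):

```python
import pyarrow as pa

# Two 2x3 int32 tensors stored as a fixed-size-list-backed extension array.
tensor_type = pa.fixed_shape_tensor(pa.int32(), (2, 3))
storage = pa.array(
    [[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]],
    type=pa.list_(pa.int32(), 6),
)
arr = pa.ExtensionArray.from_storage(tensor_type, storage)

# Indexing yields the tensor-aware scalar, so a single element can be
# converted to a NumPy array directly.
scalar = arr[0]
print(type(scalar))
print(scalar.to_numpy_ndarray())  # expected: a (2, 3) ndarray [[1 2 3] [4 5 6]]
```
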
* Closes: apache#37484

Lead-authored-by: Rok Mihevc <[email protected]>
Co-authored-by: Antoine Pitrou <[email protected]>
Co-authored-by: Joris Van den Bossche <[email protected]>
Co-authored-by: Alenka Frim <[email protected]>
Signed-off-by: Antoine Pitrou <[email protected]>
4 people authored and zanmato1984 committed Feb 28, 2024
1 parent 72773fe commit a94253c
Showing 9 changed files with 566 additions and 122 deletions.
130 changes: 96 additions & 34 deletions cpp/src/arrow/extension/fixed_shape_tensor.cc
@@ -19,6 +19,8 @@
#include <sstream>

#include "arrow/extension/fixed_shape_tensor.h"
#include "arrow/extension/tensor_internal.h"
#include "arrow/scalar.h"

#include "arrow/array/array_nested.h"
#include "arrow/array/array_primitive.h"
@@ -86,7 +88,7 @@ bool FixedShapeTensorType::ExtensionEquals(const ExtensionType& other) const {
if (extension_name() != other.extension_name()) {
return false;
}
const auto& other_ext = static_cast<const FixedShapeTensorType&>(other);
const auto& other_ext = internal::checked_cast<const FixedShapeTensorType&>(other);

auto is_permutation_trivial = [](const std::vector<int64_t>& permutation) {
for (size_t i = 1; i < permutation.size(); ++i) {
@@ -143,7 +145,7 @@ std::string FixedShapeTensorType::Serialize() const {

if (!dim_names_.empty()) {
rj::Value dim_names(rj::kArrayType);
for (std::string v : dim_names_) {
for (const std::string& v : dim_names_) {
dim_names.PushBack(rj::Value{}.SetString(v.c_str(), allocator), allocator);
}
document.AddMember(rj::Value("dim_names", allocator), dim_names, allocator);
@@ -199,10 +201,52 @@ std::shared_ptr<Array> FixedShapeTensorType::MakeArray(
std::shared_ptr<ArrayData> data) const {
DCHECK_EQ(data->type->id(), Type::EXTENSION);
DCHECK_EQ("arrow.fixed_shape_tensor",
static_cast<const ExtensionType&>(*data->type).extension_name());
internal::checked_cast<const ExtensionType&>(*data->type).extension_name());
return std::make_shared<ExtensionArray>(data);
}

Result<std::shared_ptr<Tensor>> FixedShapeTensorType::MakeTensor(
const std::shared_ptr<ExtensionScalar>& scalar) {
const auto ext_scalar = internal::checked_pointer_cast<ExtensionScalar>(scalar);
const auto ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(scalar->type);
if (!is_fixed_width(*ext_type->value_type())) {
return Status::TypeError("Cannot convert non-fixed-width values to Tensor.");
}
const auto array =
internal::checked_pointer_cast<const FixedSizeListScalar>(ext_scalar->value)->value;
if (array->null_count() > 0) {
return Status::Invalid("Cannot convert data with nulls to Tensor.");
}
const auto value_type =
internal::checked_pointer_cast<FixedWidthType>(ext_type->value_type());
const auto byte_width = value_type->byte_width();

std::vector<int64_t> permutation = ext_type->permutation();
if (permutation.empty()) {
permutation.resize(ext_type->ndim());
std::iota(permutation.begin(), permutation.end(), 0);
}

std::vector<int64_t> shape = ext_type->shape();
internal::Permute<int64_t>(permutation, &shape);

std::vector<std::string> dim_names = ext_type->dim_names();
if (!dim_names.empty()) {
internal::Permute<std::string>(permutation, &dim_names);
}

std::vector<int64_t> strides;
RETURN_NOT_OK(ComputeStrides(*value_type.get(), shape, permutation, &strides));
const auto start_position = array->offset() * byte_width;
const auto size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
std::multiplies<>());
const auto buffer =
SliceBuffer(array->data()->buffers[1], start_position, size * byte_width);

return Tensor::Make(ext_type->value_type(), buffer, shape, strides, dim_names);
}

Result<std::shared_ptr<FixedShapeTensorArray>> FixedShapeTensorArray::FromTensor(
const std::shared_ptr<Tensor>& tensor) {
auto permutation = internal::ArgSort(tensor->strides(), std::greater<>());
@@ -293,53 +337,71 @@ const Result<std::shared_ptr<Tensor>> FixedShapeTensorArray::ToTensor() const {
// To convert an array of n dimensional tensors to a n+1 dimensional tensor we
// interpret the array's length as the first dimension the new tensor.

auto ext_arr = std::static_pointer_cast<FixedSizeListArray>(this->storage());
auto ext_type = internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
ARROW_RETURN_IF(!is_fixed_width(*ext_arr->value_type()),
Status::Invalid(ext_arr->value_type()->ToString(),
" is not valid data type for a tensor"));
auto permutation = ext_type->permutation();

std::vector<std::string> dim_names;
if (!ext_type->dim_names().empty()) {
for (auto i : permutation) {
dim_names.emplace_back(ext_type->dim_names()[i]);
}
dim_names.insert(dim_names.begin(), 1, "");
const auto ext_type =
internal::checked_pointer_cast<FixedShapeTensorType>(this->type());
const auto value_type = ext_type->value_type();
ARROW_RETURN_IF(
!is_fixed_width(*value_type),
Status::TypeError(value_type->ToString(), " is not valid data type for a tensor"));

// ext_type->permutation() gives us permutation for a single row with values in
// range [0, ndim). Here want to create a ndim + 1 dimensional tensor from the entire
// array and we assume the first dimension will always have the greatest stride, so it
// will get permutation index 0 and remaining values from ext_type->permutation() need
// to be shifted to fill the [1, ndim+1) range. Computed permutation will be used to
// generate the new tensor's shape, strides and dim_names.
std::vector<int64_t> permutation = ext_type->permutation();
if (permutation.empty()) {
permutation.resize(ext_type->ndim() + 1);
std::iota(permutation.begin(), permutation.end(), 0);
} else {
dim_names = {};
for (auto i = 0; i < static_cast<int64_t>(ext_type->ndim()); i++) {
permutation[i] += 1;
}
permutation.insert(permutation.begin(), 1, 0);
}

std::vector<int64_t> shape;
for (int64_t& i : permutation) {
shape.emplace_back(ext_type->shape()[i]);
++i;
std::vector<std::string> dim_names = ext_type->dim_names();
if (!dim_names.empty()) {
dim_names.insert(dim_names.begin(), 1, "");
internal::Permute<std::string>(permutation, &dim_names);
}

std::vector<int64_t> shape = ext_type->shape();
auto cell_size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
std::multiplies<>());
shape.insert(shape.begin(), 1, this->length());
permutation.insert(permutation.begin(), 1, 0);
internal::Permute<int64_t>(permutation, &shape);

std::vector<int64_t> tensor_strides;
auto value_type = internal::checked_pointer_cast<FixedWidthType>(ext_arr->value_type());
const auto fw_value_type = internal::checked_pointer_cast<FixedWidthType>(value_type);
ARROW_RETURN_NOT_OK(
ComputeStrides(*value_type.get(), shape, permutation, &tensor_strides));
ARROW_ASSIGN_OR_RAISE(auto buffers, ext_arr->Flatten());
ComputeStrides(*fw_value_type.get(), shape, permutation, &tensor_strides));

const auto raw_buffer = this->storage()->data()->child_data[0]->buffers[1];
ARROW_ASSIGN_OR_RAISE(
auto tensor, Tensor::Make(ext_arr->value_type(), buffers->data()->buffers[1], shape,
tensor_strides, dim_names));
return tensor;
const auto buffer,
SliceBufferSafe(raw_buffer, this->offset() * cell_size * value_type->byte_width()));

return Tensor::Make(value_type, buffer, shape, tensor_strides, dim_names);
}

Result<std::shared_ptr<DataType>> FixedShapeTensorType::Make(
const std::shared_ptr<DataType>& value_type, const std::vector<int64_t>& shape,
const std::vector<int64_t>& permutation, const std::vector<std::string>& dim_names) {
if (!permutation.empty() && shape.size() != permutation.size()) {
return Status::Invalid("permutation size must match shape size. Expected: ",
shape.size(), " Got: ", permutation.size());
const auto ndim = shape.size();
if (!permutation.empty() && ndim != permutation.size()) {
return Status::Invalid("permutation size must match shape size. Expected: ", ndim,
" Got: ", permutation.size());
}
if (!dim_names.empty() && ndim != dim_names.size()) {
return Status::Invalid("dim_names size must match shape size. Expected: ", ndim,
" Got: ", dim_names.size());
}
if (!dim_names.empty() && shape.size() != dim_names.size()) {
return Status::Invalid("dim_names size must match shape size. Expected: ",
shape.size(), " Got: ", dim_names.size());
if (!permutation.empty()) {
RETURN_NOT_OK(internal::IsPermutationValid(permutation));
}

const auto size = std::accumulate(shape.begin(), shape.end(), static_cast<int64_t>(1),
std::multiplies<>());
return std::make_shared<FixedShapeTensorType>(value_type, static_cast<int32_t>(size),
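
To complement the comments in the reworked `ToTensor` above, here is a small C++ sketch of the ndim to ndim+1 round trip it implements (the helper name `RoundTripExample` and the concrete values are illustrative assumptions; the Arrow calls are public API):

```cpp
#include <cstdint>
#include <memory>
#include <vector>

#include "arrow/buffer.h"
#include "arrow/extension/fixed_shape_tensor.h"
#include "arrow/result.h"
#include "arrow/status.h"
#include "arrow/tensor.h"
#include "arrow/type.h"

// Wrap one 2x2x3 row-major tensor into a length-2 FixedShapeTensorArray of
// 2x3 tensors, then rebuild the 3-dimensional tensor with ToTensor().
arrow::Status RoundTripExample() {
  std::vector<int32_t> values = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12};
  // Buffer::Wrap does not copy, so `values` must outlive all uses below.
  std::shared_ptr<arrow::Buffer> buffer = arrow::Buffer::Wrap(values);
  ARROW_ASSIGN_OR_RAISE(auto tensor,
                        arrow::Tensor::Make(arrow::int32(), buffer, {2, 2, 3}));
  ARROW_ASSIGN_OR_RAISE(
      auto arr, arrow::extension::FixedShapeTensorArray::FromTensor(tensor));
  // arr->length() == 2; each element is a 2x3 tensor.
  ARROW_ASSIGN_OR_RAISE(auto roundtripped, arr->ToTensor());
  // roundtripped->shape() == {2, 2, 3}; the array length is the first dimension.
  return arrow::Status::OK();
}
```
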
11 changes: 10 additions & 1 deletion cpp/src/arrow/extension/fixed_shape_tensor.h
@@ -64,7 +64,7 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType {
std::string ToString() const override;

/// Number of dimensions of tensor elements
size_t ndim() { return shape_.size(); }
size_t ndim() const { return shape_.size(); }

/// Shape of tensor elements
const std::vector<int64_t> shape() const { return shape_; }
@@ -94,6 +94,15 @@ class ARROW_EXPORT FixedShapeTensorType : public ExtensionType {
/// Create a FixedShapeTensorArray from ArrayData
std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

/// \brief Create a Tensor from an ExtensionScalar from a FixedShapeTensorArray
///
/// This method will return a Tensor from ExtensionScalar with strides
/// derived from shape and permutation of FixedShapeTensorType. Shape and
/// dim_names will be permuted according to permutation stored in the
/// FixedShapeTensorType metadata.
static Result<std::shared_ptr<Tensor>> MakeTensor(
const std::shared_ptr<ExtensionScalar>& scalar);

/// \brief Create a FixedShapeTensorType instance
static Result<std::shared_ptr<DataType>> Make(
const std::shared_ptr<DataType>& value_type, const std::vector<int64_t>& shape,
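
A brief sketch of calling the new `MakeTensor` declared above from C++ (the wrapper name `TensorForRow` is hypothetical; `GetScalar` and `checked_pointer_cast` are existing Arrow API):

```cpp
#include <cstdint>
#include <memory>

#include "arrow/array/array_base.h"
#include "arrow/extension/fixed_shape_tensor.h"
#include "arrow/result.h"
#include "arrow/scalar.h"
#include "arrow/tensor.h"
#include "arrow/util/checked_cast.h"

// Convert the i-th element of a FixedShapeTensorArray into a Tensor whose
// shape and dim_names follow the permutation stored in the extension type.
arrow::Result<std::shared_ptr<arrow::Tensor>> TensorForRow(
    const arrow::extension::FixedShapeTensorArray& arr, int64_t i) {
  ARROW_ASSIGN_OR_RAISE(std::shared_ptr<arrow::Scalar> scalar, arr.GetScalar(i));
  auto ext_scalar =
      arrow::internal::checked_pointer_cast<arrow::ExtensionScalar>(scalar);
  return arrow::extension::FixedShapeTensorType::MakeTensor(ext_scalar);
}
```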