Skip to content

Commit

Permalink
apacheGH-43454: [C++][Python] Add Opaque canonical extension type (ap…
Browse files Browse the repository at this point in the history
…ache#43458)

### Rationale for this change

Add the newly ratified extension type.

### What changes are included in this PR?

The C++/Python implementation only.

### Are these changes tested?

Yes

### Are there any user-facing changes?

No.
* GitHub Issue: apache#43454

Lead-authored-by: David Li <[email protected]>
Co-authored-by: Weston Pace <[email protected]>
Signed-off-by: David Li <[email protected]>
  • Loading branch information
lidavidm and westonpace authored Aug 14, 2024
1 parent 4d200dc commit 6e7125b
Show file tree
Hide file tree
Showing 17 changed files with 627 additions and 3 deletions.
1 change: 1 addition & 0 deletions cpp/src/arrow/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -907,6 +907,7 @@ endif()
if(ARROW_JSON)
arrow_add_object_library(ARROW_JSON
extension/fixed_shape_tensor.cc
extension/opaque.cc
json/options.cc
json/chunked_builder.cc
json/chunker.cc
Expand Down
23 changes: 23 additions & 0 deletions cpp/src/arrow/compute/kernels/scalar_cast_numeric.cc
Original file line number Diff line number Diff line change
Expand Up @@ -865,6 +865,25 @@ std::shared_ptr<CastFunction> GetCastToHalfFloat() {
return func;
}

/// \brief Type matcher accepting any extension type whose storage type is null.
///
/// Used to register a kernel that only applies to extension arrays backed by
/// a null storage array.
struct NullExtensionTypeMatcher : public TypeMatcher {
  ~NullExtensionTypeMatcher() override = default;

  /// \brief Return true iff `type` is an extension type with Type::NA storage.
  bool Matches(const DataType& type) const override {
    if (type.id() != Type::EXTENSION) {
      return false;
    }
    // The type id was checked above, so the downcast targets an ExtensionType.
    const auto& extension_type = checked_cast<const ExtensionType&>(type);
    return extension_type.storage_id() == Type::NA;
  }

  /// \brief Human-readable description of what this matcher accepts.
  std::string ToString() const override { return "extension<storage_type: null>"; }

  /// \brief Matchers of this kind carry no state, so any two instances are equal.
  bool Equals(const TypeMatcher& other) const override {
    return this == &other ||
           dynamic_cast<const NullExtensionTypeMatcher*>(&other) != nullptr;
  }
};

} // namespace

std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
Expand All @@ -875,6 +894,10 @@ std::vector<std::shared_ptr<CastFunction>> GetNumericCasts() {
auto cast_null = std::make_shared<CastFunction>("cast_null", Type::NA);
DCHECK_OK(cast_null->AddKernel(Type::DICTIONARY, {InputType(Type::DICTIONARY)}, null(),
OutputAllNull));
// Explicitly allow casting extension type with null backing array to null
DCHECK_OK(cast_null->AddKernel(
Type::EXTENSION, {InputType(std::make_shared<NullExtensionTypeMatcher>())}, null(),
OutputAllNull));
functions.push_back(cast_null);

functions.push_back(GetCastToInteger<Int8Type>("cast_int8"));
Expand Down
6 changes: 6 additions & 0 deletions cpp/src/arrow/extension/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,4 +21,10 @@ add_arrow_test(test
PREFIX
"arrow-fixed-shape-tensor")

# Test target built from opaque_test.cc, registered under the
# "arrow-extension-opaque" prefix.
add_arrow_test(test
SOURCES
opaque_test.cc
PREFIX
"arrow-extension-opaque")

arrow_install_all_headers("arrow/extension")
109 changes: 109 additions & 0 deletions cpp/src/arrow/extension/opaque.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include "arrow/extension/opaque.h"

#include <sstream>

#include "arrow/json/rapidjson_defs.h" // IWYU pragma: keep
#include "arrow/util/logging.h"

#include <rapidjson/document.h>
#include <rapidjson/error/en.h>
#include <rapidjson/writer.h>

namespace arrow::extension {

// Render the type as
//   extension<NAME[storage_type=..., type_name=..., vendor_name=...]>
// where NAME is the extension name. `show_metadata` is forwarded to the
// storage type's own ToString().
std::string OpaqueType::ToString(bool show_metadata) const {
  std::string repr = "extension<";
  repr += this->extension_name();
  repr += "[storage_type=";
  repr += storage_type_->ToString(show_metadata);
  repr += ", type_name=";
  repr += type_name_;
  repr += ", vendor_name=";
  repr += vendor_name_;
  repr += "]>";
  return repr;
}

// Two opaque types are equal when they share the same extension name,
// storage type, type name and vendor name.
bool OpaqueType::ExtensionEquals(const ExtensionType& other) const {
  // The downcast below is only attempted once the extension names match.
  if (other.extension_name() != extension_name()) {
    return false;
  }
  const auto& rhs = internal::checked_cast<const OpaqueType&>(other);
  if (!storage_type()->Equals(*rhs.storage_type())) {
    return false;
  }
  if (type_name() != rhs.type_name()) {
    return false;
  }
  return vendor_name() == rhs.vendor_name();
}

std::string OpaqueType::Serialize() const {
rapidjson::Document document;
document.SetObject();
rapidjson::Document::AllocatorType& allocator = document.GetAllocator();

rapidjson::Value type_name(rapidjson::StringRef(type_name_));
document.AddMember(rapidjson::Value("type_name", allocator), type_name, allocator);
rapidjson::Value vendor_name(rapidjson::StringRef(vendor_name_));
document.AddMember(rapidjson::Value("vendor_name", allocator), vendor_name, allocator);

rapidjson::StringBuffer buffer;
rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
document.Accept(writer);
return buffer.GetString();
}

// Rebuild an OpaqueType from its serialized JSON parameters.
//
// `serialized_data` must be a JSON object with string members "type_name"
// and "vendor_name". `storage_type` becomes the storage type of the result.
//
// Returns Status::Invalid if the data is not valid JSON, is not an object,
// lacks either member, or either member is not a string.
Result<std::shared_ptr<DataType>> OpaqueType::Deserialize(
    std::shared_ptr<DataType> storage_type, const std::string& serialized_data) const {
  rapidjson::Document document;
  const auto& parsed = document.Parse(serialized_data.data(), serialized_data.length());
  if (parsed.HasParseError()) {
    return Status::Invalid("Invalid serialized JSON data for OpaqueType: ",
                           rapidjson::GetParseError_En(parsed.GetParseError()), ": ",
                           serialized_data);
  } else if (!document.IsObject()) {
    return Status::Invalid("Invalid serialized JSON data for OpaqueType: not an object");
  }
  if (!document.HasMember("type_name")) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: missing type_name");
  } else if (!document.HasMember("vendor_name")) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: missing vendor_name");
  }

  const auto& type_name = document["type_name"];
  const auto& vendor_name = document["vendor_name"];
  if (!type_name.IsString()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: type_name is not a string");
  } else if (!vendor_name.IsString()) {
    return Status::Invalid(
        "Invalid serialized JSON data for OpaqueType: vendor_name is not a string");
  }

  // Build the strings with their explicit lengths. A JSON string may contain
  // an escaped NUL (\u0000); converting the bare `const char*` returned by
  // GetString() would stop at that NUL and silently truncate the name.
  return opaque(std::move(storage_type),
                std::string(type_name.GetString(), type_name.GetStringLength()),
                std::string(vendor_name.GetString(), vendor_name.GetStringLength()));
}

// Wrap `data` in an OpaqueArray.
//
// `data->type` is expected to be an extension type named "arrow.opaque";
// this is only verified through DCHECKs.
std::shared_ptr<Array> OpaqueType::MakeArray(std::shared_ptr<ArrayData> data) const {
DCHECK_EQ(data->type->id(), Type::EXTENSION);
DCHECK_EQ("arrow.opaque",
internal::checked_cast<const ExtensionType&>(*data->type).extension_name());
return std::make_shared<OpaqueArray>(data);
}

// Factory for OpaqueType: builds a new instance from the given storage type,
// external type name and vendor (external system) name.
std::shared_ptr<DataType> opaque(std::shared_ptr<DataType> storage_type,
                                 std::string type_name, std::string vendor_name) {
  std::shared_ptr<DataType> result = std::make_shared<OpaqueType>(
      std::move(storage_type), std::move(type_name), std::move(vendor_name));
  return result;
}

} // namespace arrow::extension
69 changes: 69 additions & 0 deletions cpp/src/arrow/extension/opaque.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#pragma once

#include <memory>
#include <string>
#include <string_view>
#include <utility>

#include "arrow/extension_type.h"
#include "arrow/type.h"

namespace arrow::extension {

/// \brief Opaque is a placeholder for a type from an external (usually
/// non-Arrow) system that could not be interpreted.
///
/// Besides its storage type, an OpaqueType records the name of the type in
/// the external system and the name of that system (the "vendor").
class ARROW_EXPORT OpaqueType : public ExtensionType {
 public:
  /// \brief Construct an OpaqueType.
  ///
  /// \param[in] storage_type The underlying storage type. Should be
  /// arrow::null if there is no data.
  /// \param[in] type_name The name of the type in the external system.
  /// \param[in] vendor_name The name of the external system.
  explicit OpaqueType(std::shared_ptr<DataType> storage_type, std::string type_name,
                      std::string vendor_name)
      : ExtensionType(std::move(storage_type)),
        type_name_{std::move(type_name)},
        vendor_name_{std::move(vendor_name)} {}

  /// \brief The name of the type in the external system.
  std::string_view type_name() const { return type_name_; }
  /// \brief The name of the external system.
  std::string_view vendor_name() const { return vendor_name_; }

  /// \brief The extension name, "arrow.opaque".
  std::string extension_name() const override { return "arrow.opaque"; }
  /// \brief Describe the type, including storage type, type name and vendor name.
  std::string ToString(bool show_metadata) const override;
  /// \brief Compare extension name, storage type, type name and vendor name.
  bool ExtensionEquals(const ExtensionType& other) const override;
  /// \brief Serialize the type name and vendor name as a JSON object.
  std::string Serialize() const override;
  /// \brief Create an OpaqueType from a storage type and serialized JSON
  /// parameters; fails with Status::Invalid on malformed input.
  Result<std::shared_ptr<DataType>> Deserialize(
      std::shared_ptr<DataType> storage_type,
      const std::string& serialized_data) const override;
  /// \brief Create an OpaqueArray from ArrayData.
  std::shared_ptr<Array> MakeArray(std::shared_ptr<ArrayData> data) const override;

 private:
  std::string type_name_;
  std::string vendor_name_;
};

/// \brief Array class for OpaqueType: a wrapper for (usually binary) data
/// from an external (often non-Arrow) system that could not be interpreted.
///
/// Adds no members of its own; it only inherits ExtensionArray's constructors.
class ARROW_EXPORT OpaqueArray : public ExtensionArray {
public:
using ExtensionArray::ExtensionArray;
};

/// \brief Return an OpaqueType instance.
///
/// \param[in] storage_type The underlying storage type.
/// \param[in] type_name The name of the type in the external system.
/// \param[in] vendor_name The name of the external system.
ARROW_EXPORT std::shared_ptr<DataType> opaque(std::shared_ptr<DataType> storage_type,
std::string type_name,
std::string vendor_name);

} // namespace arrow::extension
Loading

0 comments on commit 6e7125b

Please sign in to comment.