GH-43956: [C++][Format] Add initial Decimal32/Decimal64 implementatio…

…ns (#43957)  ### Rationale for this change Widening the Decimal128/256 type to allow for bitwidths of 32 and 64 allows for more interoperability with other libraries and utilities which already support these types. This provides even more opportunities for zero-copy interactions between things such as libcudf and various databases.  ### What changes are included in this PR? This PR contains the basic C++ implementations for Decimal32/Decimal64 types, arrays, builders and scalars. It also includes the minimum necessary to get everything compiling and tests passing without also extending the acero kernels and parquet handling (both of which will be handled in follow-up PRs).  ### Are these changes tested? Yes, tests were extended where applicable to add decimal32/decimal64 cases.  ### Are there any user-facing changes? Currently if a user is using `decimal(precision, scale)` rather than `decimal128(precision, scale)` they will get a `Decimal128Type` if the precision is <= 38 (max precision for Decimal128) and `Decimal256Type` if the precision is higher. Following the same pattern, this change means that using `decimal(precision, scale)` instead of the specific `decimal32`/`decimal64`/`decimal128`/`decimal256` functions results in the following functionality: - for precisions [1 : 9] => `Decimal32Type` - for precisions [10 : 18] => `Decimal64Type` - for precisions [19 : 38] => `Decimal128Type` - for precisions [39 : 76] => `Decimal256Type` While many of our tests currently make the assumption that `decimal` with a low precision would be `Decimal128` and had to be updated, this may cause an initial surprise if users are making the same assumptions.      * GitHub Issue: #43956 --------- Co-authored-by: Felipe Oliveira Carvalho <[email protected]> Co-authored-by: Benjamin Kietzman <[email protected]> Co-authored-by: Antoine Pitrou <[email protected]>
apache · Sep 30, 2024 · d55d4c6 · d55d4c6
1 parent 6ee74c2
commit d55d4c6
Show file tree

Hide file tree

Showing 85 changed files with 3,331 additions and 627 deletions.
diff --git a/cpp/src/arrow/acero/tpch_benchmark.cc b/cpp/src/arrow/acero/tpch_benchmark.cc
@@ -58,7 +58,7 @@ std::shared_ptr<ExecPlan> Plan_Q1(AsyncGenerator<std::optional<ExecBatch>>* sink
   Expression base_price = field_ref("L_EXTENDEDPRICE");
 
   std::shared_ptr<Decimal128Scalar> decimal_1 =
-      std::make_shared<Decimal128Scalar>(Decimal128{0, 100}, decimal(12, 2));
+      std::make_shared<Decimal128Scalar>(Decimal128{0, 100}, decimal128(12, 2));
   Expression discount_multiplier =
       call("subtract", {literal(decimal_1), field_ref("L_DISCOUNT")});
   Expression tax_multiplier = call("add", {literal(decimal_1), field_ref("L_TAX")});
@@ -68,7 +68,7 @@ std::shared_ptr<ExecPlan> Plan_Q1(AsyncGenerator<std::optional<ExecBatch>>* sink
       call("multiply",
            {call("cast",
                  {call("multiply", {field_ref("L_EXTENDEDPRICE"), discount_multiplier})},
-                 compute::CastOptions::Unsafe(decimal(12, 2))),
+                 compute::CastOptions::Unsafe(decimal128(12, 2))),
             tax_multiplier});
   Expression discount = field_ref("L_DISCOUNT");
 

diff --git a/cpp/src/arrow/acero/tpch_node.cc b/cpp/src/arrow/acero/tpch_node.cc
@@ -838,12 +838,12 @@ class PartAndPartSupplierGenerator {
 
   const std::vector<std::shared_ptr<DataType>> kPartTypes = {
       int32(), utf8(),  fixed_size_binary(25), fixed_size_binary(10),
-      utf8(),  int32(), fixed_size_binary(10), decimal(12, 2),
+      utf8(),  int32(), fixed_size_binary(10), decimal128(12, 2),
       utf8(),
   };
 
   const std::vector<std::shared_ptr<DataType>> kPartsuppTypes = {
-      int32(), int32(), int32(), decimal(12, 2), utf8(),
+      int32(), int32(), int32(), decimal128(12, 2), utf8(),
   };
 
   Status AllocatePartBatch(size_t thread_index, int column) {
@@ -1527,7 +1527,7 @@ class OrdersAndLineItemGenerator {
   const std::vector<std::shared_ptr<DataType>> kOrdersTypes = {int32(),
                                                                int32(),
                                                                fixed_size_binary(1),
-                                                               decimal(12, 2),
+                                                               decimal128(12, 2),
                                                                date32(),
                                                                fixed_size_binary(15),
                                                                fixed_size_binary(15),
@@ -1539,10 +1539,10 @@ class OrdersAndLineItemGenerator {
       int32(),
       int32(),
       int32(),
-      decimal(12, 2),
-      decimal(12, 2),
-      decimal(12, 2),
-      decimal(12, 2),
+      decimal128(12, 2),
+      decimal128(12, 2),
+      decimal128(12, 2),
+      decimal128(12, 2),
       fixed_size_binary(1),
       fixed_size_binary(1),
       date32(),
@@ -2489,7 +2489,7 @@ class SupplierGenerator : public TpchTableGenerator {
 
   std::vector<std::shared_ptr<DataType>> kTypes = {
       int32(), fixed_size_binary(25), utf8(),
-      int32(), fixed_size_binary(15), decimal(12, 2),
+      int32(), fixed_size_binary(15), decimal128(12, 2),
       utf8(),
   };
 
@@ -2872,7 +2872,7 @@ class CustomerGenerator : public TpchTableGenerator {
       utf8(),
       int32(),
       fixed_size_binary(15),
-      decimal(12, 2),
+      decimal128(12, 2),
       fixed_size_binary(10),
       utf8(),
   };

diff --git a/cpp/src/arrow/array/array_base.cc b/cpp/src/arrow/array/array_base.cc
@@ -74,6 +74,10 @@ struct ScalarFromArraySlotImpl {
     return Finish(a.Value(index_));
   }
 
+  Status Visit(const Decimal32Array& a) { return Finish(Decimal32(a.GetValue(index_))); }
+
+  Status Visit(const Decimal64Array& a) { return Finish(Decimal64(a.GetValue(index_))); }
+
   Status Visit(const Decimal128Array& a) {
     return Finish(Decimal128(a.GetValue(index_)));
   }

diff --git a/cpp/src/arrow/array/array_decimal.cc b/cpp/src/arrow/array/array_decimal.cc
@@ -32,6 +32,34 @@ namespace arrow {
 
 using internal::checked_cast;
 
+// ----------------------------------------------------------------------
+// Decimal32
+
+Decimal32Array::Decimal32Array(const std::shared_ptr<ArrayData>& data)
+    : FixedSizeBinaryArray(data) {
+  ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL32);
+}
+
+std::string Decimal32Array::FormatValue(int64_t i) const {
+  const auto& type_ = checked_cast<const Decimal32Type&>(*type());
+  const Decimal32 value(GetValue(i));
+  return value.ToString(type_.scale());
+}
+
+// ----------------------------------------------------------------------
+// Decimal64
+
+Decimal64Array::Decimal64Array(const std::shared_ptr<ArrayData>& data)
+    : FixedSizeBinaryArray(data) {
+  ARROW_CHECK_EQ(data->type->id(), Type::DECIMAL64);
+}
+
+std::string Decimal64Array::FormatValue(int64_t i) const {
+  const auto& type_ = checked_cast<const Decimal64Type&>(*type());
+  const Decimal64 value(GetValue(i));
+  return value.ToString(type_.scale());
+}
+
 // ----------------------------------------------------------------------
 // Decimal128
 

diff --git a/cpp/src/arrow/array/array_decimal.h b/cpp/src/arrow/array/array_decimal.h
@@ -32,6 +32,38 @@ namespace arrow {
 ///
 /// @{
 
+// ----------------------------------------------------------------------
+// Decimal32Array
+
+/// Concrete Array class for 32-bit decimal data
+class ARROW_EXPORT Decimal32Array : public FixedSizeBinaryArray {
+ public:
+  using TypeClass = Decimal32Type;
+
+  using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+  /// \brief Construct Decimal32Array from ArrayData instance
+  explicit Decimal32Array(const std::shared_ptr<ArrayData>& data);
+
+  std::string FormatValue(int64_t i) const;
+};
+
+// ----------------------------------------------------------------------
+// Decimal64Array
+
+/// Concrete Array class for 64-bit decimal data
+class ARROW_EXPORT Decimal64Array : public FixedSizeBinaryArray {
+ public:
+  using TypeClass = Decimal64Type;
+
+  using FixedSizeBinaryArray::FixedSizeBinaryArray;
+
+  /// \brief Construct Decimal64Array from ArrayData instance
+  explicit Decimal64Array(const std::shared_ptr<ArrayData>& data);
+
+  std::string FormatValue(int64_t i) const;
+};
+
 // ----------------------------------------------------------------------
 // Decimal128Array
 

diff --git a/cpp/src/arrow/array/array_test.cc b/cpp/src/arrow/array/array_test.cc
@@ -442,7 +442,7 @@ static std::vector<std::shared_ptr<DataType>> TestArrayUtilitiesAgainstTheseType
       large_binary(),
       binary_view(),
       fixed_size_binary(3),
-      decimal(16, 4),
+      decimal128(16, 4),
       utf8(),
       large_utf8(),
       utf8_view(),
@@ -667,8 +667,10 @@ static ScalarVector GetScalars() {
       std::make_shared<BinaryViewScalar>(hello),
       std::make_shared<FixedSizeBinaryScalar>(
           hello, fixed_size_binary(static_cast<int32_t>(hello->size()))),
-      std::make_shared<Decimal128Scalar>(Decimal128(10), decimal(16, 4)),
-      std::make_shared<Decimal256Scalar>(Decimal256(10), decimal(76, 38)),
+      std::make_shared<Decimal32Scalar>(Decimal32(10), smallest_decimal(7, 4)),
+      std::make_shared<Decimal64Scalar>(Decimal64(10), smallest_decimal(12, 4)),
+      std::make_shared<Decimal128Scalar>(Decimal128(10), smallest_decimal(20, 4)),
+      std::make_shared<Decimal256Scalar>(Decimal256(10), smallest_decimal(76, 38)),
       std::make_shared<StringScalar>(hello),
       std::make_shared<LargeStringScalar>(hello),
       std::make_shared<StringViewScalar>(hello),
@@ -3092,6 +3094,98 @@ class DecimalTest : public ::testing::TestWithParam<int> {
   }
 };
 
+using Decimal32Test = DecimalTest<Decimal32Type>;
+
+TEST_P(Decimal32Test, NoNulls) {
+  int32_t precision = GetParam();
+  std::vector<Decimal32> draw = {Decimal32(1), Decimal32(-2), Decimal32(2389),
+                                 Decimal32(4), Decimal32(-12348)};
+  std::vector<uint8_t> valid_bytes = {true, true, true, true, true};
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
+}
+
+TEST_P(Decimal32Test, WithNulls) {
+  int32_t precision = GetParam();
+  std::vector<Decimal32> draw = {Decimal32(1),  Decimal32(2), Decimal32(-1), Decimal32(4),
+                                 Decimal32(-1), Decimal32(1), Decimal32(2)};
+  Decimal32 big;
+  ASSERT_OK_AND_ASSIGN(big, Decimal32::FromString("23034.234"));
+  draw.push_back(big);
+
+  Decimal32 big_negative;
+  ASSERT_OK_AND_ASSIGN(big_negative, Decimal32::FromString("-23049.235"));
+  draw.push_back(big_negative);
+
+  std::vector<uint8_t> valid_bytes = {true, true, false, true, false,
+                                      true, true, true,  true};
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
+}
+
+TEST_P(Decimal32Test, ValidateFull) {
+  int32_t precision = GetParam();
+  std::vector<Decimal32> draw;
+  Decimal32 val = Decimal32::GetMaxValue(precision) + 1;
+
+  draw = {Decimal32(), val};
+  auto arr = this->TestCreate(precision, draw, {true, false}, 0);
+  ASSERT_OK(arr->ValidateFull());
+
+  draw = {val, Decimal32()};
+  arr = this->TestCreate(precision, draw, {true, false}, 0);
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("does not fit in precision of"), arr->ValidateFull());
+}
+
+INSTANTIATE_TEST_SUITE_P(Decimal32Test, Decimal32Test, ::testing::Range(1, 9));
+
+using Decimal64Test = DecimalTest<Decimal64Type>;
+
+TEST_P(Decimal64Test, NoNulls) {
+  int32_t precision = GetParam();
+  std::vector<Decimal64> draw = {Decimal64(1), Decimal64(-2), Decimal64(2389),
+                                 Decimal64(4), Decimal64(-12348)};
+  std::vector<uint8_t> valid_bytes = {true, true, true, true, true};
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
+}
+
+TEST_P(Decimal64Test, WithNulls) {
+  int32_t precision = GetParam();
+  std::vector<Decimal64> draw = {Decimal64(1),  Decimal64(2), Decimal64(-1), Decimal64(4),
+                                 Decimal64(-1), Decimal64(1), Decimal64(2)};
+  Decimal64 big;
+  ASSERT_OK_AND_ASSIGN(big, Decimal64::FromString("23034.234234"));
+  draw.push_back(big);
+
+  Decimal64 big_negative;
+  ASSERT_OK_AND_ASSIGN(big_negative, Decimal64::FromString("-23049.235234"));
+  draw.push_back(big_negative);
+
+  std::vector<uint8_t> valid_bytes = {true, true, false, true, false,
+                                      true, true, true,  true};
+  this->TestCreate(precision, draw, valid_bytes, 0);
+  this->TestCreate(precision, draw, valid_bytes, 2);
+}
+
+TEST_P(Decimal64Test, ValidateFull) {
+  int32_t precision = GetParam();
+  std::vector<Decimal64> draw;
+  Decimal64 val = Decimal64::GetMaxValue(precision) + 1;
+
+  draw = {Decimal64(), val};
+  auto arr = this->TestCreate(precision, draw, {true, false}, 0);
+  ASSERT_OK(arr->ValidateFull());
+
+  draw = {val, Decimal64()};
+  arr = this->TestCreate(precision, draw, {true, false}, 0);
+  EXPECT_RAISES_WITH_MESSAGE_THAT(
+      Invalid, ::testing::HasSubstr("does not fit in precision of"), arr->ValidateFull());
+}
+
+INSTANTIATE_TEST_SUITE_P(Decimal64Test, Decimal64Test, ::testing::Range(1, 9));
+
 using Decimal128Test = DecimalTest<Decimal128Type>;
 
 TEST_P(Decimal128Test, NoNulls) {
@@ -3315,6 +3409,28 @@ TEST(TestSwapEndianArrayData, PrimitiveType) {
   expected_data = ArrayData::Make(uint64(), 1, {null_buffer, data_int64_buffer}, 0);
   AssertArrayDataEqualsWithSwapEndian(data, expected_data);
 
+  auto data_4byte_buffer = Buffer::FromString(
+      "\x01"
+      "12\x01");
+  data = ArrayData::Make(decimal32(9, 8), 1, {null_buffer, data_4byte_buffer});
+  auto data_decimal32_buffer = Buffer::FromString(
+      "\x01"
+      "21\x01");
+  expected_data =
+      ArrayData::Make(decimal32(9, 8), 1, {null_buffer, data_decimal32_buffer}, 0);
+  AssertArrayDataEqualsWithSwapEndian(data, expected_data);
+
+  auto data_8byte_buffer = Buffer::FromString(
+      "\x01"
+      "123456\x01");
+  data = ArrayData::Make(decimal64(18, 8), 1, {null_buffer, data_8byte_buffer});
+  auto data_decimal64_buffer = Buffer::FromString(
+      "\x01"
+      "654321\x01");
+  expected_data =
+      ArrayData::Make(decimal64(18, 8), 1, {null_buffer, data_decimal64_buffer}, 0);
+  AssertArrayDataEqualsWithSwapEndian(data, expected_data);
+
   auto data_16byte_buffer = Buffer::FromString(
       "\x01"
       "123456789abcde\x01");
@@ -3647,6 +3763,8 @@ DataTypeVector SwappableTypes() {
                         uint16(),
                         uint32(),
                         uint64(),
+                        decimal32(8, 1),
+                        decimal64(16, 2),
                         decimal128(19, 4),
                         decimal256(37, 8),
                         timestamp(TimeUnit::MICRO, ""),

diff --git a/cpp/src/arrow/array/array_view_test.cc b/cpp/src/arrow/array/array_view_test.cc
@@ -385,8 +385,32 @@ TEST(TestArrayView, SparseUnionAsStruct) {
   CheckView(expected, arr);
 }
 
-TEST(TestArrayView, DecimalRoundTrip) {
-  auto ty1 = decimal(10, 4);
+TEST(TestArrayView, Decimal32RoundTrip) {
+  auto ty1 = decimal32(9, 4);
+  auto arr = ArrayFromJSON(ty1, R"(["123.4567", "-78.9000", null])");
+
+  auto ty2 = fixed_size_binary(4);
+  ASSERT_OK_AND_ASSIGN(auto v, arr->View(ty2));
+  ASSERT_OK(v->ValidateFull());
+  ASSERT_OK_AND_ASSIGN(auto w, v->View(ty1));
+  ASSERT_OK(w->ValidateFull());
+  AssertArraysEqual(*arr, *w);
+}
+
+TEST(TestArrayView, Decimal64RoundTrip) {
+  auto ty1 = decimal64(10, 4);
+  auto arr = ArrayFromJSON(ty1, R"(["123.4567", "-78.9000", null])");
+
+  auto ty2 = fixed_size_binary(8);
+  ASSERT_OK_AND_ASSIGN(auto v, arr->View(ty2));
+  ASSERT_OK(v->ValidateFull());
+  ASSERT_OK_AND_ASSIGN(auto w, v->View(ty1));
+  ASSERT_OK(w->ValidateFull());
+  AssertArraysEqual(*arr, *w);
+}
+
+TEST(TestArrayView, Decimal128RoundTrip) {
+  auto ty1 = decimal128(20, 4);
   auto arr = ArrayFromJSON(ty1, R"(["123.4567", "-78.9000", null])");
 
   auto ty2 = fixed_size_binary(16);
@@ -397,6 +421,18 @@ TEST(TestArrayView, DecimalRoundTrip) {
   AssertArraysEqual(*arr, *w);
 }
 
+TEST(TestArrayView, Decimal256RoundTrip) {
+  auto ty1 = decimal256(10, 4);
+  auto arr = ArrayFromJSON(ty1, R"(["123.4567", "-78.9000", null])");
+
+  auto ty2 = fixed_size_binary(32);
+  ASSERT_OK_AND_ASSIGN(auto v, arr->View(ty2));
+  ASSERT_OK(v->ValidateFull());
+  ASSERT_OK_AND_ASSIGN(auto w, v->View(ty1));
+  ASSERT_OK(w->ValidateFull());
+  AssertArraysEqual(*arr, *w);
+}
+
 TEST(TestArrayView, Dictionaries) {
   // ARROW-6049
   auto ty1 = dictionary(int8(), float32());

diff --git a/cpp/src/arrow/array/builder_base.cc b/cpp/src/arrow/array/builder_base.cc
@@ -119,6 +119,8 @@ struct AppendScalarImpl {
   }
 
   Status Visit(const FixedSizeBinaryType& t) { return HandleFixedWidth(t); }
+  Status Visit(const Decimal32Type& t) { return HandleFixedWidth(t); }
+  Status Visit(const Decimal64Type& t) { return HandleFixedWidth(t); }
   Status Visit(const Decimal128Type& t) { return HandleFixedWidth(t); }
   Status Visit(const Decimal256Type& t) { return HandleFixedWidth(t); }