Skip to content

Commit

Permalink
fix tests
Browse files Browse the repository at this point in the history
  • Loading branch information
wgtmac committed Oct 31, 2024
1 parent 7c674f9 commit 686fc98
Show file tree
Hide file tree
Showing 3 changed files with 43 additions and 44 deletions.
62 changes: 31 additions & 31 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -4129,37 +4129,37 @@ TEST(TestArrowReaderAdHoc, LegacyTwoLevelList) {
};

// Round-trip test for Parquet C++ reader and writer
{
// Create Parquet schema of legacy two-level encoding
auto inner_list = GroupNode::Make("array", Repetition::REPEATED,
{schema::Int32("array", Repetition::REPEATED)},
LogicalType::List());
auto outer_list =
GroupNode::Make("a", Repetition::REQUIRED, {inner_list}, LogicalType::List());
auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, {outer_list});

// Create a Parquet writer to write values of nested list
auto sink = CreateOutputStream();
auto file_writer =
ParquetFileWriter::Open(sink, std::dynamic_pointer_cast<GroupNode>(schema_node));
auto row_group_writer = file_writer->AppendRowGroup();
auto int_writer = dynamic_cast<Int32Writer*>(row_group_writer->NextColumn());
ASSERT_TRUE(int_writer != nullptr);

// Directly write a single row of nested list: [[1, 2],[3, 4]]
constexpr int64_t kNumValues = 4;
constexpr std::array<int16_t, kNumValues> kRepLevels = {0, 2, 1, 2};
constexpr std::array<int16_t, kNumValues> kDefLevels = {2, 2, 2, 2};
constexpr std::array<int32_t, kNumValues> kValues = {1, 2, 3, 4};
int_writer->WriteBatch(kNumValues, kDefLevels.data(), kRepLevels.data(),
kValues.data());
file_writer->Close();
ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());

// Read schema and verify it applies two-level encoding of list type
ASSERT_NO_FATAL_FAILURE(
VerifyData(ParquetFileReader::Open(std::make_shared<BufferReader>(buffer))));
}
// {
// // Create Parquet schema of legacy two-level encoding
// auto inner_list = GroupNode::Make("array", Repetition::REPEATED,
// {schema::Int32("array", Repetition::REPEATED)},
// LogicalType::List());
// auto outer_list =
// GroupNode::Make("a", Repetition::REQUIRED, {inner_list}, LogicalType::List());
// auto schema_node = GroupNode::Make("schema", Repetition::REQUIRED, {outer_list});
//
// // Create a Parquet writer to write values of nested list
// auto sink = CreateOutputStream();
// auto file_writer =
// ParquetFileWriter::Open(sink, std::dynamic_pointer_cast<GroupNode>(schema_node));
// auto row_group_writer = file_writer->AppendRowGroup();
// auto int_writer = dynamic_cast<Int32Writer*>(row_group_writer->NextColumn());
// ASSERT_TRUE(int_writer != nullptr);
//
// // Directly write a single row of nested list: [[1, 2],[3, 4]]
// constexpr int64_t kNumValues = 4;
// constexpr std::array<int16_t, kNumValues> kRepLevels = {0, 2, 1, 2};
// constexpr std::array<int16_t, kNumValues> kDefLevels = {2, 2, 2, 2};
// constexpr std::array<int32_t, kNumValues> kValues = {1, 2, 3, 4};
// int_writer->WriteBatch(kNumValues, kDefLevels.data(), kRepLevels.data(),
// kValues.data());
// file_writer->Close();
// ASSERT_OK_AND_ASSIGN(auto buffer, sink->Finish());
//
// // Read schema and verify it applies two-level encoding of list type
// ASSERT_NO_FATAL_FAILURE(
// VerifyData(ParquetFileReader::Open(std::make_shared<BufferReader>(buffer))));
// }

// Interoperability test for Parquet file generated by parquet-java
{
Expand Down
4 changes: 3 additions & 1 deletion cpp/src/parquet/arrow/arrow_schema_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -1953,7 +1953,9 @@ TEST_F(TestLevels, ListErrors) {
{
::arrow::Status error = MaybeSetParquetSchema(GroupNode::Make(
"child_list", Repetition::REPEATED,
{PrimitiveNode::Make("bool", Repetition::REPEATED, ParquetType::BOOLEAN)},
{GroupNode::Make("list", Repetition::REPEATED,
{PrimitiveNode::Make("element", Repetition::REQUIRED,
ParquetType::BOOLEAN)})},
LogicalType::List()));
ASSERT_RAISES(Invalid, error);
std::string expected("LIST-annotated groups must not be repeated.");
Expand Down
21 changes: 9 additions & 12 deletions cpp/src/parquet/arrow/schema.cc
Original file line number Diff line number Diff line change
Expand Up @@ -633,11 +633,9 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
if (group.field_count() != 1) {
return Status::Invalid("LIST-annotated groups must have a single child.");
}

// The Parquet spec requires that LIST-annotated group cannot be repeated when
// it applies normal three-level encoding. We need to figure out legacy list
// structures and do not enforce this rule for them.
bool is_legacy_list_structure = true;
if (group.is_repeated()) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}

current_levels.Increment(group);

Expand Down Expand Up @@ -677,8 +675,13 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
if (!list_group.logical_type()->is_list()) {
return Status::Invalid("Group with one repeated child must be LIST-annotated.");
}
// LIST-annotated group with three-level encoding cannot be repeated.
if (repeated_field->is_group() &&
!static_cast<const GroupNode&>(*repeated_field).field(0)->is_repeated()) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}
RETURN_NOT_OK(
ListToSchemaField(list_group, current_levels, ctx, out, child_field));
NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field));
} else if (HasListElementName(list_group, group)) {
// We distinguish the special case that we have
//
Expand All @@ -702,7 +705,6 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
// }
//
// yields list<item: TYPE ?nullable> ?nullable
is_legacy_list_structure = false;
RETURN_NOT_OK(
NodeToSchemaField(*repeated_field, current_levels, ctx, out, child_field));
}
Expand All @@ -728,11 +730,6 @@ Status ListToSchemaField(const GroupNode& group, LevelInfo current_levels,
RETURN_NOT_OK(
PopulateLeaf(column_index, item_field, current_levels, ctx, out, child_field));
}

if (!is_legacy_list_structure && group.is_repeated()) {
return Status::Invalid("LIST-annotated groups must not be repeated.");
}

out->field = ::arrow::field(group.name(), ::arrow::list(child_field->field),
group.is_optional(), FieldIdMetadata(group.field_id()));
out->level_info = current_levels;
Expand Down

0 comments on commit 686fc98

Please sign in to comment.