From f5691d467c5638c6f6a07948de82369cc0ea282e Mon Sep 17 00:00:00 2001
From: mwish
Date: Thu, 31 Oct 2024 14:15:20 +0800
Subject: [PATCH] GH-44101: [C++][Parquet] Tools: Debug Print for Json should
 be valid JSON (#44532)

### Rationale for this change

The output of `JSONPrint` is currently not valid JSON. That is fine for human readers, but it prevents the output from being processed with JSON tools.

### What changes are included in this PR?

Change the output so that it is valid JSON.

Style: previously the closing `\"` of a value was emitted at the start of the next output statement; this patch emits it at the end of the statement that prints the value.

Before:
```
stream << "\", \"number\":\"" << number;
stream << "\"...";
```

After:
```
stream << ", \"number\":\"" << number << "\"";
```

### Are these changes tested?

Yes

### Are there any user-facing changes?

Minor format change

* GitHub Issue: #44101

Authored-by: mwish
Signed-off-by: mwish
---
 cpp/src/parquet/CMakeLists.txt |  2 +-
 cpp/src/parquet/printer.cc     | 56 ++++++++++++++++++----------------
 cpp/src/parquet/printer.h      |  2 +-
 cpp/src/parquet/reader_test.cc | 32 +++++++++++++++++++
 4 files changed, 64 insertions(+), 28 deletions(-)

diff --git a/cpp/src/parquet/CMakeLists.txt b/cpp/src/parquet/CMakeLists.txt
index b984ef77adbe0..e43a254fb616a 100644
--- a/cpp/src/parquet/CMakeLists.txt
+++ b/cpp/src/parquet/CMakeLists.txt
@@ -320,7 +320,7 @@ if(ARROW_TESTING)
   # "link" our dependencies so that include paths are configured
   # correctly
   target_link_libraries(parquet_testing PUBLIC ${ARROW_GTEST_GMOCK})
-  list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing)
+  list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing RapidJSON)
 endif()
 
 if(NOT ARROW_BUILD_SHARED)
diff --git a/cpp/src/parquet/printer.cc b/cpp/src/parquet/printer.cc
index 60adfc697f95c..3ce3e1da4bb09 100644
--- a/cpp/src/parquet/printer.cc
+++ b/cpp/src/parquet/printer.cc
@@ -220,7 +220,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selecte
   bool hasRow;
   do {
     hasRow = false;
-    for (auto scanner : scanners) {
+    for (const auto& scanner : scanners) {
       if (scanner->HasNext()) {
         hasRow = true;
         scanner->PrintNext(stream, COL_WIDTH);
@@ -246,7 +246,7 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
          << file_metadata->schema()->group_node()->field_count() << "\",\n";
   stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";
 
-  if (selected_columns.size() == 0) {
+  if (selected_columns.empty()) {
     for (int i = 0; i < file_metadata->num_columns(); i++) {
       selected_columns.push_back(i);
     }
@@ -299,29 +299,30 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
            << column_chunk->num_values() << "\", "
            << "\"StatsSet\": ";
     if (column_chunk->is_stats_set()) {
-      stream << "\"True\", \"Stats\": {";
+      stream << R"("True", "Stats": {)";
       if (stats->HasNullCount()) {
-        stream << "\"NumNulls\": \"" << stats->null_count();
+        stream << R"("NumNulls": ")" << stats->null_count() << "\"";
       }
       if (stats->HasDistinctCount()) {
-        stream << "\", "
-               << "\"DistinctValues\": \"" << stats->distinct_count();
+        stream << ", "
+               << R"("DistinctValues": ")" << stats->distinct_count() << "\"";
       }
       if (stats->HasMinMax()) {
         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
-        stream << "\", "
-               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max)
+        stream << ", "
+               << R"("Max": ")" << FormatStatValue(descr->physical_type(), max)
                << "\", "
-               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min);
+               << R"("Min": ")" << FormatStatValue(descr->physical_type(), min) << "\"";
       }
-      stream << "\" },";
+      stream << " },";
     } else {
       stream << "\"False\",";
     }
     stream << "\n \"Compression\": \""
            << ::arrow::internal::AsciiToUpper(
                   Codec::GetCodecAsString(column_chunk->compression()))
-           << "\", \"Encodings\": \"";
+           << R"(", "Encodings": )";
+    stream << "\"";
     if (column_chunk->encoding_stats().empty()) {
       for (auto encoding : column_chunk->encodings()) {
         stream << EncodingToString(encoding) << " ";
@@ -329,40 +330,43 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected
     } else {
       PrintPageEncodingStats(stream, column_chunk->encoding_stats());
     }
-    stream << "\", "
-           << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
-           << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
+    stream << "\"";
+    stream << ", "
+           << R"("UncompressedSize": ")" << column_chunk->total_uncompressed_size()
+           << R"(", "CompressedSize": ")" << column_chunk->total_compressed_size()
+           << "\"";
 
     if (column_chunk->bloom_filter_offset()) {
       // Output BloomFilter {offset, length}
-      stream << "\", BloomFilter {"
-             << "\"offset\": \"" << column_chunk->bloom_filter_offset().value();
+      stream << ", \"BloomFilter\": {"
+             << R"("offset": ")" << column_chunk->bloom_filter_offset().value() << "\"";
       if (column_chunk->bloom_filter_length()) {
-        stream << "\", \"length\": \"" << column_chunk->bloom_filter_length().value();
+        stream << R"(, "length": ")" << column_chunk->bloom_filter_length().value()
+               << "\"";
       }
-      stream << "\"}";
+      stream << "}";
     }
 
     if (column_chunk->GetColumnIndexLocation()) {
       auto location = column_chunk->GetColumnIndexLocation().value();
       // Output ColumnIndex {offset, length}
-      stream << "\", ColumnIndex {"
-             << "\"offset\": \"" << location.offset;
-      stream << "\", \"length\": \"" << location.length;
+      stream << ", \"ColumnIndex\": {"
+             << R"("offset": ")" << location.offset;
+      stream << R"(", "length": ")" << location.length;
       stream << "\"}";
     }
 
     if (column_chunk->GetOffsetIndexLocation()) {
       auto location = column_chunk->GetOffsetIndexLocation().value();
       // Output OffsetIndex {offset, length}
-      stream << "\", OffsetIndex {"
-             << "\"offset\": \"" << location.offset;
-      stream << "\", \"length\": \"" << location.length;
-      stream << "\"}";
+      stream << ", \"OffsetIndex\": {"
+             << R"("offset": ")" << location.offset << "\"";
+      stream << R"(, "length": ")" << location.length << "\"";
+      stream << "}";
    }
 
     // end of a ColumnChunk
-    stream << "\" }";
+    stream << " }";
     c1++;
     if (c1 != static_cast<int>(selected_columns.size())) {
       stream << ",\n";
diff --git a/cpp/src/parquet/printer.h b/cpp/src/parquet/printer.h
index 6bdf5b456fa6b..bb86b107f9f9b 100644
--- a/cpp/src/parquet/printer.h
+++ b/cpp/src/parquet/printer.h
@@ -32,7 +32,7 @@ class PARQUET_EXPORT ParquetFilePrinter {
  public:
   explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
 
-  ~ParquetFilePrinter() {}
+  ~ParquetFilePrinter() = default;
 
   void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
                   bool print_values = false, bool format_dump = false,
diff --git a/cpp/src/parquet/reader_test.cc b/cpp/src/parquet/reader_test.cc
index fb77ba6cbc178..688c875b9ec0f 100644
--- a/cpp/src/parquet/reader_test.cc
+++ b/cpp/src/parquet/reader_test.cc
@@ -27,6 +27,12 @@
 #include
 #include
 
+#include "arrow/json/rapidjson_defs.h"  // IWYU pragma: keep
+
+#include <rapidjson/document.h>
+#include <rapidjson/error/en.h>
+#include
+
 #include "arrow/array.h"
 #include "arrow/buffer.h"
 #include "arrow/io/file.h"
@@ -47,6 +53,8 @@
 #include "parquet/printer.h"
 #include "parquet/test_util.h"
 
+namespace rj = arrow::rapidjson;
+
 using arrow::internal::checked_pointer_cast;
 using arrow::internal::Zip;
@@ -1172,6 +1180,30 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) {
   EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
 }
 
+// GH-44101: Test that JSON output is valid JSON
+TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
+  auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status {
+    rj::Document json_doc;
+    constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag;
+    json_doc.Parse<kParseFlags>(json_string.data(), json_string.length());
+    if (json_doc.HasParseError()) {
+      return ::arrow::Status::Invalid("JSON parse error at offset ",
+                                      json_doc.GetErrorOffset(), ": ",
+                                      rj::GetParseError_En(json_doc.GetParseError()));
+    }
+    return ::arrow::Status::OK();
+  };
+  std::vector<std::string> check_file_lists = {
+      "data_index_bloom_encoding_with_length.parquet",
+      "data_index_bloom_encoding_stats.parquet", "alltypes_tiny_pages_plain.parquet",
+      "concatenated_gzip_members.parquet", "nulls.snappy.parquet"};
+  for (const auto& file : check_file_lists) {
+    std::string json_content = ReadFromLocalFile(file);
+    ASSERT_OK(check_json_valid(json_content))
+        << "Invalid JSON output for file: " << file << ", content:" << json_content;
+  }
+}
+
 TEST(TestFileReader, BufferedReadsWithDictionary) {
   const int num_rows = 1000;
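
For reference, here is a minimal usage sketch, not part of the patch, of how the JSON emitted by `ParquetFilePrinter::JSONPrint` can be checked with RapidJSON outside the test suite. It assumes the `JSONPrint(stream, selected_columns, filename)` overload declared in `printer.h`, a binary linked against Parquet and RapidJSON, and an illustrative input path `example.parquet`.

```
#include <iostream>
#include <list>
#include <memory>
#include <sstream>
#include <string>

#include <rapidjson/document.h>

#include "parquet/file_reader.h"
#include "parquet/printer.h"

int main() {
  // Illustrative path; point this at a real Parquet file.
  const std::string path = "example.parquet";
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile(path);

  // An empty selection makes JSONPrint emit every column (see printer.cc).
  std::stringstream json;
  parquet::ParquetFilePrinter printer(reader.get());
  printer.JSONPrint(json, std::list<int>{}, path.c_str());

  // With this patch the output should parse cleanly; NaN/Inf statistics need
  // kParseNanAndInfFlag, mirroring the new ValidJsonOutput test.
  constexpr unsigned kFlags =
      rapidjson::kParseFullPrecisionFlag | rapidjson::kParseNanAndInfFlag;
  rapidjson::Document doc;
  doc.Parse<kFlags>(json.str().c_str());
  std::cout << (doc.HasParseError() ? "invalid" : "valid") << " JSON" << std::endl;
  return 0;
}
```

The new `ValidJsonOutput` test automates the same check over several reference Parquet files.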