GH-44101: [C++][Parquet] Tools: Debug Print for Json should be valid JSON (#44532)

### Rationale for this change

The JSON debug print output is currently not valid JSON. That is fine for human reading, but it prevents the output from being analyzed with JSON tools or AI.

### What changes are included in this PR?

Change the output to be valid JSON.

Style:

Previously, the closing `\"` of a value was emitted at the start of the next statement; this patch moves it to the end of the statement that prints the value.

Before:
```
stream << "\", \"number\":\""  << number;
stream << "\"...";
```

After:

```
stream << ", \"number\":\""  << number << "\"";
```
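
Taken together, the changes fix places where the old code emitted bare tokens or left quotes dangling. As a sketch of the effect on the emitted text (the sizes and offsets below are hypothetical), the BloomFilter section used to print an unquoted key:

```
Before: ..., "CompressedSize": "1024", BloomFilter {"offset": "4", "length": "256"} ...
After:  ..., "CompressedSize": "1024", "BloomFilter": {"offset": "4", "length": "256"} ...
```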

### Are these changes tested?

Yes. A new test in `reader_test.cc` parses the JSON output for several test files with RapidJSON and asserts that it is valid.

### Are there any user-facing changes?

Minor change to the format of the JSON debug output, as described above.
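
Since the output now parses cleanly, downstream tooling can consume it directly. Below is a minimal sketch of a consumer, not part of this PR: the file name `example.parquet` is hypothetical, `JSONPrint`'s trailing parameters are assumed to keep their defaults (only the first two are visible in this diff), and the NaN/Inf parse flag mirrors the new test.

```cpp
#include <iostream>
#include <list>
#include <memory>
#include <sstream>

#include <rapidjson/document.h>

#include "parquet/file_reader.h"
#include "parquet/printer.h"

int main() {
  // Open a Parquet file and dump its metadata as JSON into a string.
  std::unique_ptr<parquet::ParquetFileReader> reader =
      parquet::ParquetFileReader::OpenFile("example.parquet");
  parquet::ParquetFilePrinter printer(reader.get());

  std::stringstream ss;
  printer.JSONPrint(ss, /*selected_columns=*/{});  // empty list => all columns

  // With this patch the dump is machine-readable: hand it to RapidJSON.
  rapidjson::Document doc;
  doc.Parse<rapidjson::kParseNanAndInfFlag>(ss.str().c_str());
  std::cout << (doc.HasParseError() ? "invalid" : "valid") << " JSON\n";
  return 0;
}
```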

* GitHub Issue: #44101

Authored-by: mwish <[email protected]>
Signed-off-by: mwish <[email protected]>
mapleFU authored Oct 31, 2024
1 parent 3917b60 commit f5691d4
Showing 4 changed files with 64 additions and 28 deletions.
2 changes: 1 addition & 1 deletion cpp/src/parquet/CMakeLists.txt

```diff
@@ -320,7 +320,7 @@ if(ARROW_TESTING)
   # "link" our dependencies so that include paths are configured
   # correctly
   target_link_libraries(parquet_testing PUBLIC ${ARROW_GTEST_GMOCK})
-  list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing)
+  list(APPEND PARQUET_TEST_LINK_LIBS parquet_testing RapidJSON)
 endif()

 if(NOT ARROW_BUILD_SHARED)
```
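
RapidJSON is linked into the Parquet test targets because the new validity test in `reader_test.cc` (below) uses it to parse the printer output.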
56 changes: 30 additions & 26 deletions cpp/src/parquet/printer.cc

```diff
@@ -220,7 +220,7 @@ void ParquetFilePrinter::DebugPrint(std::ostream& stream, std::list<int> selected_columns,
   bool hasRow;
   do {
     hasRow = false;
-    for (auto scanner : scanners) {
+    for (const auto& scanner : scanners) {
      if (scanner->HasNext()) {
        hasRow = true;
        scanner->PrintNext(stream, COL_WIDTH);
@@ -246,7 +246,7 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
          << file_metadata->schema()->group_node()->field_count() << "\",\n";
   stream << " \"NumberOfColumns\": \"" << file_metadata->num_columns() << "\",\n";

-  if (selected_columns.size() == 0) {
+  if (selected_columns.empty()) {
     for (int i = 0; i < file_metadata->num_columns(); i++) {
       selected_columns.push_back(i);
     }
@@ -299,70 +299,74 @@ void ParquetFilePrinter::JSONPrint(std::ostream& stream, std::list<int> selected_columns,
            << column_chunk->num_values() << "\", "
            << "\"StatsSet\": ";
     if (column_chunk->is_stats_set()) {
-      stream << "\"True\", \"Stats\": {";
+      stream << R"("True", "Stats": {)";
       if (stats->HasNullCount()) {
-        stream << "\"NumNulls\": \"" << stats->null_count();
+        stream << R"("NumNulls": ")" << stats->null_count() << "\"";
       }
       if (stats->HasDistinctCount()) {
-        stream << "\", "
-               << "\"DistinctValues\": \"" << stats->distinct_count();
+        stream << ", "
+               << R"("DistinctValues": ")" << stats->distinct_count() << "\"";
       }
       if (stats->HasMinMax()) {
         std::string min = stats->EncodeMin(), max = stats->EncodeMax();
-        stream << "\", "
-               << "\"Max\": \"" << FormatStatValue(descr->physical_type(), max)
+        stream << ", "
+               << R"("Max": ")" << FormatStatValue(descr->physical_type(), max)
                << "\", "
-               << "\"Min\": \"" << FormatStatValue(descr->physical_type(), min);
+               << R"("Min": ")" << FormatStatValue(descr->physical_type(), min) << "\"";
       }
-      stream << "\" },";
+      stream << " },";
     } else {
       stream << "\"False\",";
     }
     stream << "\n \"Compression\": \""
            << ::arrow::internal::AsciiToUpper(
                   Codec::GetCodecAsString(column_chunk->compression()))
-           << "\", \"Encodings\": \"";
+           << R"(", "Encodings": )";
+    stream << "\"";
     if (column_chunk->encoding_stats().empty()) {
       for (auto encoding : column_chunk->encodings()) {
         stream << EncodingToString(encoding) << " ";
       }
     } else {
       PrintPageEncodingStats(stream, column_chunk->encoding_stats());
     }
-    stream << "\", "
-           << "\"UncompressedSize\": \"" << column_chunk->total_uncompressed_size()
-           << "\", \"CompressedSize\": \"" << column_chunk->total_compressed_size();
+    stream << "\"";
+    stream << ", "
+           << R"("UncompressedSize": ")" << column_chunk->total_uncompressed_size()
+           << R"(", "CompressedSize": ")" << column_chunk->total_compressed_size()
+           << "\"";

     if (column_chunk->bloom_filter_offset()) {
       // Output BloomFilter {offset, length}
-      stream << "\", BloomFilter {"
-             << "\"offset\": \"" << column_chunk->bloom_filter_offset().value();
+      stream << ", \"BloomFilter\": {"
+             << R"("offset": ")" << column_chunk->bloom_filter_offset().value() << "\"";
       if (column_chunk->bloom_filter_length()) {
-        stream << "\", \"length\": \"" << column_chunk->bloom_filter_length().value();
+        stream << R"(, "length": ")" << column_chunk->bloom_filter_length().value()
+               << "\"";
       }
-      stream << "\"}";
+      stream << "}";
     }

     if (column_chunk->GetColumnIndexLocation()) {
       auto location = column_chunk->GetColumnIndexLocation().value();
       // Output ColumnIndex {offset, length}
-      stream << "\", ColumnIndex {"
-             << "\"offset\": \"" << location.offset;
-      stream << "\", \"length\": \"" << location.length;
+      stream << ", \"ColumnIndex\": {"
+             << R"("offset": ")" << location.offset;
+      stream << R"(", "length": ")" << location.length;
       stream << "\"}";
     }

     if (column_chunk->GetOffsetIndexLocation()) {
       auto location = column_chunk->GetOffsetIndexLocation().value();
       // Output OffsetIndex {offset, length}
-      stream << "\", OffsetIndex {"
-             << "\"offset\": \"" << location.offset;
-      stream << "\", \"length\": \"" << location.length;
-      stream << "\"}";
+      stream << ", \"OffsetIndex\": {"
+             << R"("offset": ")" << location.offset << "\"";
+      stream << R"(, "length": ")" << location.length << "\"";
+      stream << "}";
     }

     // end of a ColumnChunk
-    stream << "\" }";
+    stream << " }";
     c1++;
     if (c1 != static_cast<int>(selected_columns.size())) {
       stream << ",\n";
```
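
A note on the rewritten lines: they use C++ raw string literals so the embedded JSON quotes don't need backslash escaping. A small self-contained sketch of the equivalence (not from the patch itself):

```cpp
#include <iostream>

int main() {
  // Escaped form: every embedded JSON quote needs a backslash.
  std::cout << "\"NumNulls\": \"" << '\n';
  // Raw string literal: R"(...)" passes the text through verbatim.
  std::cout << R"("NumNulls": ")" << '\n';
  // Both lines print exactly: "NumNulls": "
  return 0;
}
```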
2 changes: 1 addition & 1 deletion cpp/src/parquet/printer.h

```diff
@@ -32,7 +32,7 @@ class PARQUET_EXPORT ParquetFilePrinter {

  public:
   explicit ParquetFilePrinter(ParquetFileReader* reader) : fileReader(reader) {}
-  ~ParquetFilePrinter() {}
+  ~ParquetFilePrinter() = default;

   void DebugPrint(std::ostream& stream, std::list<int> selected_columns,
                   bool print_values = false, bool format_dump = false,
```
32 changes: 32 additions & 0 deletions cpp/src/parquet/reader_test.cc

```diff
@@ -27,6 +27,12 @@
 #include <gmock/gmock.h>
 #include <gtest/gtest.h>

+#include "arrow/json/rapidjson_defs.h"  // IWYU pragma: keep
+
+#include <rapidjson/document.h>
+#include <rapidjson/error/en.h>
+#include <rapidjson/stringbuffer.h>
+
 #include "arrow/array.h"
 #include "arrow/buffer.h"
 #include "arrow/io/file.h"
@@ -47,6 +53,8 @@
 #include "parquet/printer.h"
 #include "parquet/test_util.h"

+namespace rj = arrow::rapidjson;
+
 using arrow::internal::checked_pointer_cast;
 using arrow::internal::Zip;

@@ -1172,6 +1180,30 @@ TEST_F(TestJSONWithLocalFile, JSONOutputFLBA) {
   EXPECT_THAT(json_content, testing::HasSubstr(json_contains));
 }

+// GH-44101: Test that JSON output is valid JSON
+TEST_F(TestJSONWithLocalFile, ValidJsonOutput) {
+  auto check_json_valid = [](std::string_view json_string) -> ::arrow::Status {
+    rj::Document json_doc;
+    constexpr auto kParseFlags = rj::kParseFullPrecisionFlag | rj::kParseNanAndInfFlag;
+    json_doc.Parse<kParseFlags>(json_string.data(), json_string.length());
+    if (json_doc.HasParseError()) {
+      return ::arrow::Status::Invalid("JSON parse error at offset ",
+                                      json_doc.GetErrorOffset(), ": ",
+                                      rj::GetParseError_En(json_doc.GetParseError()));
+    }
+    return ::arrow::Status::OK();
+  };
+  std::vector<std::string_view> check_file_lists = {
+      "data_index_bloom_encoding_with_length.parquet",
+      "data_index_bloom_encoding_stats.parquet", "alltypes_tiny_pages_plain.parquet",
+      "concatenated_gzip_members.parquet", "nulls.snappy.parquet"};
+  for (const auto& file : check_file_lists) {
+    std::string json_content = ReadFromLocalFile(file);
+    ASSERT_OK(check_json_valid(json_content))
+        << "Invalid JSON output for file: " << file << ", content:" << json_content;
+  }
+}
+
 TEST(TestFileReader, BufferedReadsWithDictionary) {
   const int num_rows = 1000;
```
