From 2635378e33b0ec355f7f842675ea5e645f15864f Mon Sep 17 00:00:00 2001
From: Antoine Pitrou
Date: Thu, 16 May 2024 12:51:25 +0200
Subject: [PATCH] DRAFT: Parquet 3 metadata with decoupled column metadata

---
 README.md                      |  57 ++++++++-
 src/main/thrift/parquet.thrift | 210 +++++++++++++++++++++++++++++++++
 2 files changed, 266 insertions(+), 1 deletion(-)

diff --git a/README.md b/README.md
index 18a75077..d4372e4e 100644
--- a/README.md
+++ b/README.md
@@ -81,7 +81,13 @@ more pages.
   - Encoding/Compression
   - Page
 
 ## File format
-This file and the [Thrift definition](src/main/thrift/parquet.thrift) should be read together to understand the format.
+
+This file and the [Thrift definition](src/main/thrift/parquet.thrift) should be read
+together to understand the format.
+
+## Overall file structure
+
+A Parquet file is a collection of binary blocks representing data and metadata.
 
     4-byte magic number "PAR1"
@@ -113,6 +119,55 @@ chunks they are interested in. The columns chunks should then be read sequentia
 
 ![File Layout](https://raw.github.com/apache/parquet-format/master/doc/images/FileLayout.gif)
 
+### Parquet 3
+
+Parquet 3 files have the following overall structure:
+
+```
+4-byte magic number "PAR1"
+4-byte magic number "PAR3"
+8-byte offset of File Metadata v3
+8-byte length of File Metadata v3
+
+
+
+...
+
+
+
+...
+
+...
+
+
+...
+
+
+
+...
+
+File Metadata v3
+
+File Metadata
+4-byte length in bytes of file metadata (little endian)
+4-byte magic number "PAR1"
+```
+
+The File Metadata v3 is designed to be light-weight to decode, regardless of
+the number of columns in the file. Individual column metadata can be opportunistically
+decoded depending on actual needs.
+
+This file structure is backwards-compatible. Parquet 1 readers will read the
+legacy File Metadata in the file footer, while Parquet 3 readers will notice
+the "PAR1PAR3" magic number (probably by reading the first 24 bytes in the file)
+and will instead read the File Metadata v3.
+
+## Encryption
+
+Encryption with footer encryption enabled changes the above file structure slightly.
+In particular, the "PAR1" magic number is replaced with "PARE".
+See the [encryption specification](Encryption.md) for details.
+
 ## Metadata
 There are three types of metadata: file metadata, column (chunk) metadata and page
 header metadata. All thrift structures are serialized using the TCompactProtocol.
diff --git a/src/main/thrift/parquet.thrift b/src/main/thrift/parquet.thrift
index 27d40437..4705f79d 100644
--- a/src/main/thrift/parquet.thrift
+++ b/src/main/thrift/parquet.thrift
@@ -467,6 +467,35 @@ struct SchemaElement {
   10: optional LogicalType logicalType
 }
 
+struct SchemaElementV3 {
+  /** Data type for this field. */
+  1: optional Type type;
+
+  /** If type is FIXED_LEN_BYTE_ARRAY, this is the byte length of the values.
+   *
+   * CHANGED from v1: this must be omitted for other column types.
+   */
+  2: optional i32 type_length;
+
+  /** Repetition of the field. */
+  3: optional FieldRepetitionType repetition_type;
+
+  /** Name of the field in the schema */
+  4: required string name;
+
+  /** Number of nested fields (children) of this element. */
+  5: optional i32 num_children;
+
+  /** CHANGED from v1: from i32 to i64
+   */
+  6: optional i64 field_id;
+
+  /** The logical type of this SchemaElement */
+  7: optional LogicalType logicalType
+
+  /** REMOVED from v1: converted_type, scale, precision */
+}
+
 /**
  * Encodings supported by Parquet. Not all encodings are valid for all types. These
  * enums are also used to specify the encoding of definition and repetition levels.
@@ -835,6 +864,63 @@ struct ColumnMetaData {
   16: optional SizeStatistics size_statistics;
 }
 
+struct ColumnChunkMetaDataV3 {
+  /** REMOVED from v1: type (unnecessary) */
+  /** REMOVED from v1: encodings (unnecessary and non-trivial to get right) */
+  /** REMOVED from v1: path_in_schema (unnecessary and wasteful) */
+  /** REMOVED from v1: index_page_offset (unused in practice?) */
+  /** REMOVED from v1: statistics (use ColumnIndex and/or page-level statistics instead) */
+
+  /** Compression codec **/
+  1: required CompressionCodec codec
+
+  /** Number of values in this column chunk **/
+  2: required i64 num_values
+
+  /** total byte size of all uncompressed pages in this column chunk (including the headers) **/
+  3: required i64 total_uncompressed_size
+
+  /** total byte size of all compressed, and potentially encrypted, pages
+   * in this column chunk (including the headers) **/
+  4: required i64 total_compressed_size
+
+  /** Optional key/value metadata for this column chunk.
+   ** CHANGED from v1: only use this for chunk-specific metadata, otherwise
+   ** use `FileColumnMetadataV3.key_value_metadata`.
+   **/
+  5: optional list<KeyValue> key_value_metadata
+
+  /** Byte offset from beginning of file to first data page **/
+  6: required i64 data_page_offset
+
+  /** Byte offset from the beginning of file to first (only) dictionary page **/
+  7: optional i64 dictionary_page_offset
+
+  /** Set of all encodings used for pages in this column chunk.
+   * This information can be used to determine if all data pages are
+   * dictionary encoded for example **/
+  8: optional list<PageEncodingStats> encoding_stats;
+
+  /** Byte offset from beginning of file to Bloom filter data. **/
+  9: optional i64 bloom_filter_offset;
+
+  /** Size of Bloom filter data including the serialized header, in bytes.
+   * Added in 2.10 so readers may not read this field from old files and
+   * it can be obtained after the BloomFilterHeader has been deserialized.
+   * Writers should write this field so readers can read the bloom filter
+   * in a single I/O.
+   */
+  10: optional i32 bloom_filter_length;
+
+  /**
+   * Optional statistics to help estimate total memory when converted to in-memory
+   * representations. The histograms contained in these statistics can
+   * also be useful in some cases for more fine-grained nullability/list length
+   * filter pushdown.
+   */
+  11: optional SizeStatistics size_statistics;
+}
+
 struct EncryptionWithFooterKey {
 }
 
@@ -885,6 +971,44 @@ struct ColumnChunk {
   9: optional binary encrypted_column_metadata
 }
 
+struct ColumnChunkV3 {
+  /** File where column data is stored. **/
+  1: optional string file_path
+
+  /** Byte offset in file_path to the ColumnChunkMetaDataV3, optionally encrypted
+   ** CHANGED from v1: renamed to metadata_file_offset
+   **/
+  2: required i64 metadata_file_offset
+
+  /** NEW from v1: Byte length in file_path of ColumnChunkMetaDataV3, optionally encrypted
+   **/
+  3: required i64 metadata_file_length
+
+  /** REMOVED from v1: meta_data, encrypted_column_metadata.
+   ** Use encoded_metadata instead.
+   **/
+
+  /** NEW from v1: Column metadata for this chunk, duplicated here from file_path.
+   ** This is a Thrift-encoded ColumnChunkMetaDataV3, optionally encrypted.
+   **/
+  4: optional binary encoded_metadata
+
+  /** CHANGED from v1: this is now required **/
+  5: required i64 offset_index_offset
+
+  /** CHANGED from v1: this is now required **/
+  6: required i32 offset_index_length
+
+  /** File offset of ColumnChunk's ColumnIndex **/
+  7: optional i64 column_index_offset
+
+  /** Size of ColumnChunk's ColumnIndex, in bytes **/
+  8: optional i32 column_index_length
+
+  /** Crypto metadata of encrypted columns **/
+  9: optional ColumnCryptoMetaData crypto_metadata
+}
+
 struct RowGroup {
   /** Metadata for each column chunk in this row group.
    * This list must have the same order as the SchemaElement list in FileMetaData.
@@ -914,6 +1038,32 @@ struct RowGroup {
   7: optional i16 ordinal
 }
 
+struct RowGroupV3 {
+  /** REMOVED from v1: columns.
+   * Instead, decode each FileColumnMetadataV3 individually as needed.
+   */
+
+  /** Total byte size of all the uncompressed column data in this row group **/
+  1: required i64 total_byte_size
+
+  /** Number of rows in this row group **/
+  2: required i64 num_rows
+
+  /** If set, specifies a sort ordering of the rows in this row group. */
+  3: optional list<SortingColumn> sorting_columns
+
+  /** REMOVED from v1: file_offset.
+   * Use the OffsetIndex for each column instead.
+   */
+
+  /** Total byte size of all compressed (and potentially encrypted) column data
+   * in this row group **/
+  4: optional i64 total_compressed_size
+
+  /** Row group ordinal in the file **/
+  5: optional i16 ordinal
+}
+
 /** Empty struct to signal the order defined by the physical or logical type */
 struct TypeDefinedOrder {}
 
@@ -1165,6 +1315,66 @@ struct FileMetaData {
   9: optional binary footer_signing_key_metadata
 }
 
+/** Metadata for a column in this file. */
+struct FileColumnMetadataV3 {
+  /** All column chunks in this file (one per row group)
+   **
+   ** TODO: make this a ColumnChunkV3 with reduced footprint (especially
+   ** around column metadata)
+   **/
+  1: required list<ColumnChunkV3> columns
+
+  /** Sort order used for the Statistics min_value and max_value fields
+   **/
+  2: optional ColumnOrder column_order;
+
+  /** NEW from v1: Optional key/value metadata for this column
+   **/
+  3: optional list<KeyValue> key_value_metadata
+}
+
+struct FileMetaDataV3 {
+  /** Version of this file **/
+  1: required i32 version
+
+  /** Parquet schema for this file **/
+  2: required list<SchemaElementV3> schema;
+
+  /** Number of rows in this file **/
+  3: required i64 num_rows
+
+  /** Row groups in this file **/
+  4: required list<RowGroupV3> row_groups
+
+  /** Optional key/value metadata for this file. **/
+  5: optional list<KeyValue> key_value_metadata
+
+  /** String for application that wrote this file. **/
+  6: optional string created_by
+
+  /** NEW from v1: byte offset of file column metadata **/
+  7: required list<i64> file_column_metadata_offset;
+  /** NEW from v1: byte length of file column metadata **/
+  8: required list<i32> file_column_metadata_length;
+
+  /** REMOVED from v1: column_orders.
+   ** Use `FileColumnMetadataV3.column_order` instead.
+   **/
+
+  /**
+   * Encryption algorithm. This field is set only in encrypted files
+   * with plaintext footer. Files with encrypted footer store algorithm id
+   * in FileCryptoMetaData structure.
+   */
+  9: optional EncryptionAlgorithm encryption_algorithm
+
+  /**
+   * Retrieval metadata of key used for signing the footer.
+   * Used only in encrypted files with plaintext footer.
+   */
+  10: optional binary footer_signing_key_metadata
+}
+
 /** Crypto metadata for files with encrypted footer **/
 struct FileCryptoMetaData {
   /**