Skip to content

Commit

Permalink
Add read_parquet_metadata libcudf API (#13663)
Browse files Browse the repository at this point in the history
Closes #11675
Adds `read_parquet_metadata` to libcudf.
The metadata contains the following information:
- schema - (type, name, children)
- num_rows
- num_rowgroups
- key-value string metadata in file footer

To reviewers: suggestions for additional information to expose in the metadata are welcome; see #11214.

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Vukasin Milovanovic (https://github.com/vuule)
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Divye Gala (https://github.com/divyegala)
  - Ray Douglass (https://github.com/raydouglass)

URL: #13663
  • Loading branch information
karthikeyann authored Jul 27, 2023
1 parent fa09cca commit abb59c8
Show file tree
Hide file tree
Showing 8 changed files with 426 additions and 6 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -210,6 +210,7 @@ outputs:
- test -f $PREFIX/include/cudf/io/orc_metadata.hpp
- test -f $PREFIX/include/cudf/io/orc_types.hpp
- test -f $PREFIX/include/cudf/io/parquet.hpp
- test -f $PREFIX/include/cudf/io/parquet_metadata.hpp
- test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp
- test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp
Expand Down
10 changes: 10 additions & 0 deletions cpp/include/cudf/io/detail/parquet.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
#pragma once

#include <cudf/io/detail/utils.hpp>
#include <cudf/io/parquet_metadata.hpp>
#include <cudf/io/types.hpp>
#include <cudf/table/table_view.hpp>

Expand Down Expand Up @@ -211,5 +212,14 @@ class writer {
std::vector<std::unique_ptr<std::vector<uint8_t>>> const& metadata_list);
};

/**
 * @brief Reads metadata of parquet dataset.
 *
 * The implementation accesses element 0 of the per-source key-value metadata unconditionally,
 * so `sources` must contain at least one datasource.
 *
 * @param sources Dataset sources to read from
 *
 * @return parquet_metadata with parquet schema, number of rows, number of row groups and key-value
 * metadata.
 */
parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> const> sources);
} // namespace detail::parquet
} // namespace cudf::io
231 changes: 231 additions & 0 deletions cpp/include/cudf/io/parquet_metadata.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,231 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

/**
* @file parquet_metadata.hpp
* @brief cuDF-IO API for reading Parquet file metadata
*/

#pragma once

#include <cudf/io/types.hpp>

#include <optional>
#include <string_view>
#include <variant>
#include <vector>

namespace cudf {
namespace io {

namespace parquet {
/**
 * @brief Basic data types in Parquet, determines how data is physically stored
 */
enum class TypeKind : int8_t {
  UNDEFINED_TYPE       = -1,  ///< Undefined for non-leaf nodes
  BOOLEAN              = 0,
  INT32                = 1,
  INT64                = 2,
  INT96                = 3,  ///< Deprecated
  FLOAT                = 4,
  DOUBLE               = 5,
  BYTE_ARRAY           = 6,
  FIXED_LEN_BYTE_ARRAY = 7,
};
}  // namespace parquet

/**
 * @brief Schema of a parquet column, including the nested columns.
 */
struct parquet_column_schema {
 public:
  /**
   * @brief constructor
   *
   * @param name column name
   * @param type parquet type
   * @param children child columns (empty for non-nested types)
   */
  parquet_column_schema(std::string_view name,
                        parquet::TypeKind type,
                        std::vector<parquet_column_schema> children)
    : _name{name}, _type_kind{type}, _children{std::move(children)}
  {
  }

  /**
   * @brief Returns parquet column name; can be empty.
   *
   * @return Column name (returned by value)
   */
  [[nodiscard]] auto name() const { return _name; }

  /**
   * @brief Returns parquet type of the column.
   *
   * @return Column parquet type
   */
  [[nodiscard]] auto type_kind() const { return _type_kind; }

  /**
   * @brief Returns schemas of all child columns.
   *
   * @return Children schemas
   */
  [[nodiscard]] auto const& children() const& { return _children; }

  /** @copydoc children
   * Children array is moved out of the object (rvalues only).
   *
   */
  [[nodiscard]] auto children() && { return std::move(_children); }

  /**
   * @brief Returns schema of the child with the given index.
   *
   * @throws std::out_of_range if `idx` is not a valid child index
   *
   * @param idx child index
   *
   * @return Child schema
   */
  [[nodiscard]] auto const& child(int idx) const& { return children().at(idx); }

  /** @copydoc child
   * Child is moved out of the object (rvalues only).
   *
   */
  [[nodiscard]] auto child(int idx) &&
  {
    // Access the member directly: inside a member function `*this` is an lvalue, so calling
    // `children()` here would select the `const&` overload and the "move" would silently
    // degrade to a deep copy of the child subtree.
    return std::move(_children.at(idx));
  }

  /**
   * @brief Returns the number of child columns.
   *
   * @return Children count
   */
  [[nodiscard]] auto num_children() const { return children().size(); }

 private:
  std::string _name;
  // 3 types available: Physical, Converted, Logical.
  parquet::TypeKind _type_kind;  // Physical
  std::vector<parquet_column_schema> _children;
};

/**
 * @brief Schema of a parquet file, held as a single root column.
 */
struct parquet_schema {
 public:
  /**
   * @brief Builds the schema by taking ownership of the root column.
   *
   * @param root_column_schema root column
   */
  parquet_schema(parquet_column_schema root_column_schema) : _root(std::move(root_column_schema))
  {
  }

  /**
   * @brief Gives read-only access to the struct column whose fields are all the file's columns.
   *
   * @return Root column schema
   */
  [[nodiscard]] parquet_column_schema const& root() const& { return _root; }

  /**
   * @brief Hands the root column schema over to the caller.
   *
   * Only selected for rvalue objects; the root is moved out of this schema.
   *
   * @return Root column schema
   */
  [[nodiscard]] parquet_column_schema root() && { return std::move(_root); }

 private:
  parquet_column_schema _root;
};

/**
 * @brief Describes the content of a parquet file: schema, row count, row group count and the
 * key-value metadata stored in the footer.
 */
class parquet_metadata {
 public:
  /// Key-value metadata in the file footer.
  using key_value_metadata = std::unordered_map<std::string, std::string>;

  /**
   * @brief Assembles the metadata from its already-extracted parts.
   *
   * @param schema parquet schema
   * @param num_rows number of rows
   * @param num_rowgroups number of row groups
   * @param file_metadata key-value metadata in the file footer
   */
  parquet_metadata(parquet_schema schema,
                   int64_t num_rows,
                   size_type num_rowgroups,
                   key_value_metadata file_metadata)
    : _schema(std::move(schema)),
      _num_rows(num_rows),
      _num_rowgroups(num_rowgroups),
      _file_metadata(std::move(file_metadata))
  {
  }

  /**
   * @brief Gives read-only access to the parquet schema.
   *
   * @return parquet schema
   */
  [[nodiscard]] parquet_schema const& schema() const { return _schema; }

  /**
   * @brief Reports the row count of the root column.
   *
   * Nested columns may hold a different number of rows when the file contains list columns.
   *
   * @return Number of rows
   */
  [[nodiscard]] int64_t num_rows() const { return _num_rows; }

  /**
   * @brief Reports how many row groups the file has.
   *
   * @return Number of row groups
   */
  [[nodiscard]] size_type num_rowgroups() const { return _num_rowgroups; }

  /**
   * @brief Gives read-only access to the key-value metadata from the file footer.
   *
   * @return Key value metadata as a map
   */
  [[nodiscard]] key_value_metadata const& metadata() const { return _file_metadata; }

 private:
  parquet_schema _schema;
  int64_t _num_rows;
  size_type _num_rowgroups;
  key_value_metadata _file_metadata;
};

/**
 * @brief Reads metadata of parquet dataset.
 *
 * The key-value metadata in the result is taken from the first source of the dataset.
 *
 * @ingroup io_readers
 *
 * @param src_info Dataset source
 *
 * @return parquet_metadata with parquet schema, number of rows, number of row groups and key-value
 * metadata.
 */
parquet_metadata read_parquet_metadata(source_info const& src_info);

} // namespace io
} // namespace cudf
9 changes: 9 additions & 0 deletions cpp/src/io/functions.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
#include <cudf/io/orc.hpp>
#include <cudf/io/orc_metadata.hpp>
#include <cudf/io/parquet.hpp>
#include <cudf/io/parquet_metadata.hpp>
#include <cudf/table/table.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/error.hpp>
Expand Down Expand Up @@ -484,6 +485,14 @@ table_with_metadata read_parquet(parquet_reader_options const& options,
return reader->read(options);
}

/**
 * @brief Reads metadata of a parquet dataset described by `src_info`.
 *
 * @throws cudf::logic_error if `src_info` does not resolve to at least one source
 *
 * @param src_info Dataset source
 *
 * @return parquet_metadata with parquet schema, number of rows, number of row groups and key-value
 * metadata.
 */
parquet_metadata read_parquet_metadata(source_info const& src_info)
{
  CUDF_FUNC_RANGE();

  auto datasources = make_datasources(src_info);
  // The detail implementation indexes the first source's key-value metadata unconditionally,
  // so an empty source list must be rejected here instead of running into undefined behavior.
  CUDF_EXPECTS(not datasources.empty(),
               "At least one source is required to read parquet metadata.");
  return detail_parquet::read_parquet_metadata(datasources);
}

/**
* @copydoc cudf::io::merge_row_group_metadata
*/
Expand Down
24 changes: 24 additions & 0 deletions cpp/src/io/parquet/reader_impl.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -477,4 +477,28 @@ bool reader::impl::has_next()
return _current_read_chunk < _chunk_read_info.size();
}

namespace {
/**
 * @brief Recursively converts the schema element at `idx`, together with all of its
 * descendants, into a parquet_column_schema tree.
 *
 * @param mt Aggregate metadata the schema elements are looked up in
 * @param idx Index of the schema element to convert
 *
 * @return Column schema holding the element's name, its type and its converted children
 */
parquet_column_schema walk_schema(aggregate_reader_metadata const* mt, int idx)
{
  SchemaElement const& sch = mt->get_schema(idx);
  std::vector<parquet_column_schema> children;
  // The child count is known up front; reserving once keeps the vector from regrowing and
  // re-moving every subtree that has already been converted.
  children.reserve(sch.children_idx.size());
  for (auto const child_idx : sch.children_idx) {
    children.push_back(walk_schema(mt, child_idx));
  }
  return parquet_column_schema{
    sch.name, static_cast<parquet::TypeKind>(sch.type), std::move(children)};
}
}  // namespace

/**
 * @brief Opens and parses the metadata of `sources` and packages it as parquet_metadata.
 *
 * The key-value metadata of the result is element 0 of the per-source key-value maps, so
 * `sources` must not be empty.
 *
 * @param sources Dataset sources to read from
 *
 * @return parquet_metadata with parquet schema, number of rows, number of row groups and key-value
 * metadata.
 */
parquet_metadata read_parquet_metadata(host_span<std::unique_ptr<datasource> const> sources)
{
  // Open and parse the source dataset metadata
  auto metadata = aggregate_reader_metadata(sources);

  return parquet_metadata{
    parquet_schema{walk_schema(&metadata, 0)},
    metadata.get_num_rows(),
    metadata.get_num_row_groups(),
    // `metadata` is a local that is destroyed on return and this is its last use, so the map
    // is moved out through the rvalue accessor instead of being deep-copied.
    std::move(std::move(metadata).get_key_value_metadata()[0])};
}

} // namespace cudf::io::detail::parquet
6 changes: 3 additions & 3 deletions cpp/src/io/parquet/reader_impl_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -198,11 +198,11 @@ metadata::metadata(datasource* source)
}

std::vector<metadata> aggregate_reader_metadata::metadatas_from_sources(
std::vector<std::unique_ptr<datasource>> const& sources)
host_span<std::unique_ptr<datasource> const> sources)
{
std::vector<metadata> metadatas;
std::transform(
sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) {
sources.begin(), sources.end(), std::back_inserter(metadatas), [](auto const& source) {
return metadata(source.get());
});
return metadatas;
Expand Down Expand Up @@ -252,7 +252,7 @@ size_type aggregate_reader_metadata::calc_num_row_groups() const
}

aggregate_reader_metadata::aggregate_reader_metadata(
std::vector<std::unique_ptr<datasource>> const& sources)
host_span<std::unique_ptr<datasource> const> sources)
: per_file_metadata(metadatas_from_sources(sources)),
keyval_maps(collect_keyval_metadata()),
num_rows(calc_num_rows()),
Expand Down
7 changes: 4 additions & 3 deletions cpp/src/io/parquet/reader_impl_helpers.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ class aggregate_reader_metadata {
* @brief Create a metadata object from each element in the source vector
*/
static std::vector<metadata> metadatas_from_sources(
std::vector<std::unique_ptr<datasource>> const& sources);
host_span<std::unique_ptr<datasource> const> sources);

/**
* @brief Collect the keyvalue maps from each per-file metadata object into a vector of maps.
Expand All @@ -102,7 +102,7 @@ class aggregate_reader_metadata {
[[nodiscard]] size_type calc_num_row_groups() const;

public:
aggregate_reader_metadata(std::vector<std::unique_ptr<datasource>> const& sources);
aggregate_reader_metadata(host_span<std::unique_ptr<datasource> const> sources);

[[nodiscard]] RowGroup const& get_row_group(size_type row_group_index, size_type src_idx) const;

Expand All @@ -119,8 +119,9 @@ class aggregate_reader_metadata {
return per_file_metadata[0].schema[schema_idx];
}

[[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; }
[[nodiscard]] auto const& get_key_value_metadata() const& { return keyval_maps; }

[[nodiscard]] auto&& get_key_value_metadata() && { return std::move(keyval_maps); }
/**
* @brief Gets the concrete nesting depth of output cudf columns
*
Expand Down
Loading

0 comments on commit abb59c8

Please sign in to comment.