Add Parquet reader benchmarks for row selection #14147

Merged: 8 commits, Sep 25, 2023
18 changes: 10 additions & 8 deletions cpp/benchmarks/io/cuio_common.cpp
@@ -15,6 +15,7 @@
  */

 #include <benchmarks/io/cuio_common.hpp>
+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/detail/utilities/logger.hpp>

 #include <cstdio>
@@ -141,17 +142,18 @@ std::vector<std::string> select_column_names(std::vector<std::string> const& col
   return col_names_to_read;
 }

-std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks, int chunk)
+std::vector<cudf::size_type> segments_in_chunk(int num_segments, int num_chunks, int chunk_idx)
 {
   CUDF_EXPECTS(num_segments >= num_chunks,
                "Number of chunks cannot be greater than the number of segments in the file");
-  auto start_segment = [num_segments, num_chunks](int chunk) {
-    return num_segments * chunk / num_chunks;
-  };
-  std::vector<cudf::size_type> selected_segments;
-  for (auto segment = start_segment(chunk); segment < start_segment(chunk + 1); ++segment) {
-    selected_segments.push_back(segment);
-  }
+  CUDF_EXPECTS(chunk_idx < num_chunks,
+               "Chunk index must be smaller than the number of chunks in the file");
+
+  auto const segments_in_chunk = cudf::util::div_rounding_up_unsafe(num_segments, num_chunks);
+  auto const begin_segment     = std::min(chunk_idx * segments_in_chunk, num_segments);
+  auto const end_segment       = std::min(begin_segment + segments_in_chunk, num_segments);
+  std::vector<cudf::size_type> selected_segments(end_segment - begin_segment);
+  std::iota(selected_segments.begin(), selected_segments.end(), begin_segment);

   return selected_segments;
 }
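
The rewritten helper assigns each chunk a contiguous run of ceil(num_segments / num_chunks) segments, clamped to the total segment count, so the last chunk simply receives whatever remains. A standalone sketch of the same partitioning logic with plain ints (my own illustration, not part of the diff):

#include <algorithm>
#include <numeric>
#include <vector>

// Mirrors the new segments_in_chunk() without the cudf types.
std::vector<int> segments_in_chunk_sketch(int num_segments, int num_chunks, int chunk_idx)
{
  int const per_chunk = (num_segments + num_chunks - 1) / num_chunks;  // div rounding up
  int const begin     = std::min(chunk_idx * per_chunk, num_segments);
  int const end       = std::min(begin + per_chunk, num_segments);
  std::vector<int> selected(end - begin);
  std::iota(selected.begin(), selected.end(), begin);  // begin, begin + 1, ..., end - 1
  return selected;
}

// For example, 10 segments split into 4 chunks yields {0,1,2}, {3,4,5}, {6,7,8}, {9}:
// the final chunk absorbs the remainder, so callers no longer need to append an
// "overflow" segment for the last read.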
12 changes: 5 additions & 7 deletions cpp/benchmarks/io/orc/orc_reader_options.cpp
@@ -19,6 +19,7 @@
 #include <benchmarks/io/cuio_common.hpp>
 #include <benchmarks/io/nvbench_helpers.hpp>

+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/orc.hpp>
 #include <cudf/io/orc_metadata.hpp>
 #include <cudf/utilities/default_stream.hpp>
@@ -30,7 +31,7 @@
 constexpr int64_t data_size = 512 << 20;
 // The number of separate read calls to use when reading files in multiple chunks
 // Each call reads roughly equal amounts of data
-constexpr int32_t chunked_read_num_chunks = 8;
+constexpr int32_t chunked_read_num_chunks = 4;

 std::vector<std::string> get_top_level_col_names(cudf::io::source_info const& source)
 {
@@ -88,7 +89,7 @@ void BM_orc_read_varying_options(nvbench::state& state,

   auto const num_stripes =
     cudf::io::read_orc_metadata(source_sink.make_source_info()).num_stripes();
-  cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks;
+  auto const chunk_row_cnt = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks);

   auto mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -99,7 +100,6 @@
     timer.start();
     cudf::size_type rows_read = 0;
     for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
-      auto const is_last_chunk = chunk == (num_chunks - 1);
       switch (RowSelection) {
         case row_selection::ALL: break;
         case row_selection::STRIPES:
@@ -108,7 +108,6 @@
         case row_selection::NROWS:
           read_options.set_skip_rows(chunk * chunk_row_cnt);
           read_options.set_num_rows(chunk_row_cnt);
-          if (is_last_chunk) read_options.set_num_rows(-1);
           break;
         default: CUDF_FAIL("Unsupported row selection method");
       }
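
Because chunk_row_cnt is now rounded up, the chunks cover every row without the removed is_last_chunk / set_num_rows(-1) special case: the final call may request more rows than remain, and the read stops at the end of the file. Worked numbers (hypothetical, for illustration only):

#include <cstdint>

constexpr int64_t num_rows      = 1'000'003;  // hypothetical row count
constexpr int64_t num_chunks    = 4;
constexpr int64_t chunk_row_cnt = (num_rows + num_chunks - 1) / num_chunks;  // 250'001
// chunk 0 reads rows [      0,   250'001)
// chunk 1 reads rows [250'001,   500'002)
// chunk 2 reads rows [500'002,   750'003)
// chunk 3 reads rows [750'003, 1'000'003): only 250'000 rows remain, fewer than
// chunk_row_cnt, which is fine without the old -1 override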
@@ -132,9 +131,6 @@ using col_selections = nvbench::enum_type_list<column_selection::ALL,
                                                column_selection::ALTERNATE,
                                                column_selection::FIRST_HALF,
                                                column_selection::SECOND_HALF>;
-using row_selections =
-  nvbench::enum_type_list<row_selection::ALL, row_selection::STRIPES, row_selection::NROWS>;
-
 NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
                     NVBENCH_TYPE_AXES(col_selections,
                                       nvbench::enum_type_list<row_selection::ALL>,
@@ -146,6 +142,8 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
   {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"})
   .set_min_samples(4);

+using row_selections =
+  nvbench::enum_type_list<row_selection::ALL, row_selection::NROWS, row_selection::STRIPES>;
 NVBENCH_BENCH_TYPES(BM_orc_read_varying_options,
                     NVBENCH_TYPE_AXES(nvbench::enum_type_list<column_selection::ALL>,
                                       row_selections,
65 changes: 38 additions & 27 deletions cpp/benchmarks/io/parquet/parquet_reader_options.cpp
@@ -19,28 +19,29 @@
 #include <benchmarks/io/cuio_common.hpp>
 #include <benchmarks/io/nvbench_helpers.hpp>

+#include <cudf/detail/utilities/integer_utils.hpp>
 #include <cudf/io/parquet.hpp>
 #include <cudf/utilities/default_stream.hpp>

 #include <nvbench/nvbench.cuh>

 // Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to
 // run on most GPUs, but large enough to allow highest throughput
-constexpr std::size_t data_size = 512 << 20;
 constexpr std::size_t row_group_size = 128 << 20;
+constexpr std::size_t data_size      = 512 << 20;
+// The number of separate read calls to use when reading files in multiple chunks
+// Each call reads roughly equal amounts of data
+constexpr int32_t chunked_read_num_chunks = 4;

 std::vector<std::string> get_top_level_col_names(cudf::io::source_info const& source)
 {
-  cudf::io::parquet_reader_options const read_options =
-    cudf::io::parquet_reader_options::builder(source);
-  auto const schema = cudf::io::read_parquet(read_options).metadata.schema_info;
-
-  std::vector<std::string> names;
-  names.reserve(schema.size());
-  std::transform(schema.cbegin(), schema.cend(), std::back_inserter(names), [](auto const& c) {
-    return c.name;
-  });
-  return names;
+  auto const top_lvl_cols = cudf::io::read_parquet_metadata(source).schema().root().children();
+  std::vector<std::string> col_names;
+  std::transform(top_lvl_cols.cbegin(),
+                 top_lvl_cols.cend(),
+                 std::back_inserter(col_names),
+                 [](auto const& col_meta) { return col_meta.name(); });
+
+  return col_names;
 }

 template <column_selection ColSelection,
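
The helper now reads only the file footer through cudf::io::read_parquet_metadata instead of decoding the whole table just to learn the schema. A minimal sketch of that metadata-only path, built from the calls that appear in this diff (the wrapper function and the exact header providing read_parquet_metadata are my assumptions):

#include <cudf/io/parquet_metadata.hpp>

#include <iostream>

void print_parquet_summary(cudf::io::source_info const& source)
{
  auto const metadata = cudf::io::read_parquet_metadata(source);
  // Row-group count, exact rather than estimated from data_size / row_group_size
  std::cout << "row groups: " << metadata.num_rowgroups() << '\n';
  // Top-level columns are the children of the schema root
  for (auto const& col_meta : metadata.schema().root().children()) {
    std::cout << "column: " << col_meta.name() << '\n';
  }
}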
@@ -55,6 +56,8 @@ void BM_parquet_read_options(nvbench::state& state,
                                                nvbench::enum_type<UsesPandasMetadata>,
                                                nvbench::enum_type<Timestamp>>)
 {
+  auto const num_chunks = RowSelection == row_selection::ALL ? 1 : chunked_read_num_chunks;
+
   auto constexpr str_to_categories = ConvertsStrings == converts_strings::YES;
   auto constexpr uses_pd_metadata  = UsesPandasMetadata == uses_pandas_metadata::YES;

@@ -87,9 +90,8 @@
       .use_pandas_metadata(uses_pd_metadata)
       .timestamp_type(ts_type);

-  // TODO: add read_parquet_metadata to properly calculate #row_groups
-  auto constexpr num_row_groups = data_size / row_group_size;
-  auto constexpr num_chunks     = 1;
+  auto const num_row_groups = read_parquet_metadata(source_sink.make_source_info()).num_rowgroups();
+  auto const chunk_row_cnt  = cudf::util::div_rounding_up_unsafe(view.num_rows(), num_chunks);

   auto mem_stats_logger = cudf::memory_stats_logger();
   state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
@@ -100,18 +102,15 @@
     timer.start();
     cudf::size_type rows_read = 0;
     for (int32_t chunk = 0; chunk < num_chunks; ++chunk) {
-      auto const is_last_chunk = chunk == (num_chunks - 1);
       switch (RowSelection) {
         case row_selection::ALL: break;
         case row_selection::ROW_GROUPS: {
-          auto row_groups_to_read = segments_in_chunk(num_row_groups, num_chunks, chunk);
-          if (is_last_chunk) {
-            // Need to assume that an additional "overflow" row group is present
-            row_groups_to_read.push_back(num_row_groups);
-          }
-          read_options.set_row_groups({row_groups_to_read});
+          read_options.set_row_groups({segments_in_chunk(num_row_groups, num_chunks, chunk)});
         } break;
-        case row_selection::NROWS: [[fallthrough]];
+        case row_selection::NROWS:
+          read_options.set_skip_rows(chunk * chunk_row_cnt);
+          read_options.set_num_rows(chunk_row_cnt);
+          break;
         default: CUDF_FAIL("Unsupported row selection method");
       }

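Both chunked modes now derive each call's options uniformly: ROW_GROUPS hands every call one slice of row groups from segments_in_chunk, while NROWS walks the file with a skip/count window. With four chunks and, say, four row groups and one million rows (hypothetical values), the per-chunk calls are:

// num_row_groups == 4, view.num_rows() == 1'000'000, num_chunks == 4
// ROW_GROUPS: chunk k -> set_row_groups({{k}});          // one row group per call
// NROWS:      chunk k -> set_skip_rows(k * 250'000);
//                        set_num_rows(250'000);          // one quarter of the rows
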
@@ -130,14 +129,26 @@
   state.add_buffer_size(source_sink.size(), "encoded_file_size", "encoded_file_size");
 }

+using row_selections =
+  nvbench::enum_type_list<row_selection::ALL, row_selection::NROWS, row_selection::ROW_GROUPS>;
+NVBENCH_BENCH_TYPES(BM_parquet_read_options,
+                    NVBENCH_TYPE_AXES(nvbench::enum_type_list<column_selection::ALL>,
+                                      row_selections,
+                                      nvbench::enum_type_list<converts_strings::YES>,
+                                      nvbench::enum_type_list<uses_pandas_metadata::YES>,
+                                      nvbench::enum_type_list<cudf::type_id::EMPTY>))
+  .set_name("parquet_read_row_selection")
+  .set_type_axes_names({"column_selection",
+                        "row_selection",
+                        "str_to_categories",
+                        "uses_pandas_metadata",
+                        "timestamp_type"})
+  .set_min_samples(4);
+
 using col_selections = nvbench::enum_type_list<column_selection::ALL,
                                                column_selection::ALTERNATE,
                                                column_selection::FIRST_HALF,
                                                column_selection::SECOND_HALF>;

-// TODO: row_selection::ROW_GROUPS disabled until we add an API to read metadata from a parquet file
-// and determine num row groups. https://github.com/rapidsai/cudf/pull/9963#issuecomment-1004832863
-
 NVBENCH_BENCH_TYPES(BM_parquet_read_options,
                     NVBENCH_TYPE_AXES(col_selections,
                                       nvbench::enum_type_list<row_selection::ALL>,
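Assuming the usual cudf nvbench target for these sources (my assumption; the binary name and path depend on your build tree), the new suite can be run by name through nvbench's standard CLI, e.g. ./PARQUET_READER_NVBENCH -b parquet_read_row_selection.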