Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Cast only time of day to nanos to avoid an overflow in Parquet INT96 write #13776

Merged
merged 29 commits into from
Aug 2, 2023
Merged
Show file tree
Hide file tree
Changes from 22 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
2656de1
fprintf
gerashegalov Jul 20, 2023
0e39811
printf debugging
gerashegalov Jul 24, 2023
9e54fd5
printf again
gerashegalov Jul 25, 2023
0edee2c
redo
gerashegalov Jul 25, 2023
c86d8f7
wip
gerashegalov Jul 26, 2023
f33df0c
Merge remote-tracking branch 'origin/branch-23.08' into int96v2
gerashegalov Jul 26, 2023
a34239c
revert gera
gerashegalov Jul 26, 2023
19555fa
revert CMake
gerashegalov Jul 26, 2023
1cc14c9
Cast last day's time to nano avoiding int64 overflow
gerashegalov Jul 26, 2023
defdc2b
Merge remote-tracking branch 'origin/branch-23.10' into int96v2
gerashegalov Jul 26, 2023
edb0ede
clang-format
gerashegalov Jul 26, 2023
336ac92
reduce bloat
gerashegalov Jul 27, 2023
f00e065
refactor
gerashegalov Jul 27, 2023
7c9abbc
bug fix
gerashegalov Jul 27, 2023
6c59bbe
Merge branch 'branch-23.10' into int96v2
gerashegalov Jul 27, 2023
8571581
gtest
gerashegalov Jul 29, 2023
2c15c0e
clang-format
gerashegalov Jul 29, 2023
debbb1f
Merge remote-tracking branch 'origin/branch-23.10' into int96v2
gerashegalov Jul 29, 2023
b6b65ba
literal dates in test
gerashegalov Aug 1, 2023
6c29db4
Merge remote-tracking branch 'origin/branch-23.10' into int96v2
gerashegalov Aug 1, 2023
6f24279
Merge branch 'bug8070-int96-test' into int96v2
gerashegalov Aug 1, 2023
5c85790
clang-format
gerashegalov Aug 1, 2023
22d00ba
Apply suggestions from code review
gerashegalov Aug 1, 2023
613112a
reviews
gerashegalov Aug 1, 2023
d7c035a
Merge remote-tracking branch 'origin/branch-23.10' into int96v2
gerashegalov Aug 1, 2023
11973d1
Review
gerashegalov Aug 1, 2023
6a04604
Empty commit to retrigger CI.
gerashegalov Aug 1, 2023
618b26a
Apply suggestions from code review
gerashegalov Aug 1, 2023
9c783cd
review
gerashegalov Aug 1, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 18 additions & 16 deletions cpp/src/io/parquet/page_enc.cu
Original file line number Diff line number Diff line change
Expand Up @@ -933,22 +933,23 @@ constexpr auto julian_calendar_epoch_diff()
}

/**
* @brief Converts a timestamp_ns into a pair with nanoseconds since midnight and number of Julian
* days. Does not deal with time zones. Used by INT96 code.
* @brief Converts number of periods Per into a pair with nanoseconds since midnight
gerashegalov marked this conversation as resolved.
Show resolved Hide resolved
* and number of Julian days. Does not deal with time zones. Used by INT96 code.
*
* @param ns number of nanoseconds since epoch
* @param v int64_t count of ticks since epoch
gerashegalov marked this conversation as resolved.
Show resolved Hide resolved
* @return std::pair<nanoseconds,days> where nanoseconds is the number of nanoseconds
gerashegalov marked this conversation as resolved.
Show resolved Hide resolved
* elapsed in the day and days is the number of days from Julian epoch.
*/
static __device__ std::pair<duration_ns, duration_D> convert_nanoseconds(timestamp_ns const ns)
template <typename Per>
mythrocks marked this conversation as resolved.
Show resolved Hide resolved
__device__ auto juilian_days_with_time(int64_t v)
gerashegalov marked this conversation as resolved.
Show resolved Hide resolved
{
using namespace cuda::std::chrono;
auto const nanosecond_ticks = ns.time_since_epoch();
auto const gregorian_days = floor<days>(nanosecond_ticks);
auto const julian_days = gregorian_days + ceil<days>(julian_calendar_epoch_diff());

auto const last_day_ticks = nanosecond_ticks - gregorian_days;
return {last_day_ticks, julian_days};
auto const dur_total = duration<int64_t, Per>{v};
auto const dur_days = floor<days>(dur_total);
auto const dur_time_of_day = dur_total - dur_days;
auto const dur_time_of_day_nanos = duration_cast<nanoseconds>(dur_time_of_day);
auto const julian_days = dur_days + ceil<days>(julian_calendar_epoch_diff());
return std::make_pair(dur_time_of_day_nanos, julian_days);
}

// blockDim(128, 1, 1)
Expand Down Expand Up @@ -1236,22 +1237,23 @@ __global__ void __launch_bounds__(128, 8)
}
}

auto const ret = convert_nanoseconds([&]() {
auto const& [last_day_nanos, julian_days] = ([&]() {
gerashegalov marked this conversation as resolved.
Show resolved Hide resolved
using namespace cuda::std::chrono;
switch (s->col.leaf_column->type().id()) {
case type_id::TIMESTAMP_SECONDS:
case type_id::TIMESTAMP_MILLISECONDS: {
return timestamp_ns{duration_ms{v}};
return juilian_days_with_time<cuda::std::milli>(v);
gerashegalov marked this conversation as resolved.
Show resolved Hide resolved
} break;
case type_id::TIMESTAMP_MICROSECONDS:
case type_id::TIMESTAMP_NANOSECONDS: {
return timestamp_ns{duration_us{v}};
return juilian_days_with_time<cuda::std::micro>(v);
gerashegalov marked this conversation as resolved.
Show resolved Hide resolved
} break;
}
return timestamp_ns{duration_ns{0}};
return juilian_days_with_time<cuda::std::nano>(0);
gerashegalov marked this conversation as resolved.
Show resolved Hide resolved
}());

// the 12 bytes of fixed length data.
v = ret.first.count();
v = last_day_nanos.count();
dst[pos + 0] = v;
dst[pos + 1] = v >> 8;
dst[pos + 2] = v >> 16;
Expand All @@ -1260,7 +1262,7 @@ __global__ void __launch_bounds__(128, 8)
dst[pos + 5] = v >> 40;
dst[pos + 6] = v >> 48;
dst[pos + 7] = v >> 56;
uint32_t w = ret.second.count();
uint32_t w = julian_days.count();
dst[pos + 8] = w;
dst[pos + 9] = w >> 8;
dst[pos + 10] = w >> 16;
Expand Down
25 changes: 25 additions & 0 deletions cpp/tests/io/parquet_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@
#include <cudf/transform.hpp>
#include <cudf/unary.hpp>
#include <cudf/utilities/span.hpp>
#include <cudf/wrappers/timestamps.hpp>

#include <src/io/parquet/compact_protocol_reader.hpp>
#include <src/io/parquet/parquet.hpp>
Expand Down Expand Up @@ -6411,4 +6412,28 @@ TEST_F(ParquetReaderTest, FilterFloatNAN)
CUDF_TEST_EXPECT_TABLES_EQUAL(expected1->view(), result1);
}

// Round-trips microsecond timestamps through a Parquet file written with INT96
// timestamps enabled and expects the table read back to equal the table written.
// The two values (years 3023 and 723) are far enough from the Unix epoch that
// their nanosecond counts since epoch would not fit in an int64_t.
TEST_F(ParquetWriterTest, TimestampMicrosINT96NoOverflow)
{
  using namespace cuda::std::chrono;

  // One timestamp far in the future and one far in the past, both with a
  // non-zero time-of-day component down to sub-second precision.
  column_wrapper<cudf::timestamp_us> ts_column{
    sys_days{year{3023} / month{7} / day{14}} + 7h + 38min + 45s + 418688us,
    sys_days{year{723} / month{3} / day{21}} + 14h + 20min + 13s + microseconds{781ms}};
  table_view const input_table({ts_column});

  auto const parquet_path = temp_env->get_temp_filepath("BigINT96Timestamp.parquet");

  // Write the table with timestamps encoded as INT96.
  auto const writer_opts =
    cudf::io::parquet_writer_options::builder(cudf::io::sink_info{parquet_path}, input_table)
      .int96_timestamps(true)
      .build();
  cudf::io::write_parquet(writer_opts);

  // Read it back, requesting microsecond resolution to match the input column type.
  auto const requested_type = cudf::data_type(cudf::type_id::TIMESTAMP_MICROSECONDS);
  auto const reader_opts =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info(parquet_path))
      .timestamp_type(requested_type)
      .build();
  auto const round_tripped = cudf::io::read_parquet(reader_opts);

  CUDF_TEST_EXPECT_TABLES_EQUAL(input_table, round_tripped.tbl->view());
}

CUDF_TEST_PROGRAM_MAIN()