From 076ad58dba87be07c45a563ec105f28dea647620 Mon Sep 17 00:00:00 2001 From: Lawrence Mitchell Date: Mon, 4 Nov 2024 18:04:22 +0000 Subject: [PATCH 01/12] Expose mixed and conditional joins in pylibcudf (#17235) Expose these join types to pylibcudf, they will be useful for implement inequality joins in cudf polars. Authors: - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/17235 --- cpp/include/cudf/join.hpp | 32 +- cpp/src/join/conditional_join.cu | 7 +- cpp/src/join/mixed_join.cu | 7 +- python/pylibcudf/pylibcudf/join.pxd | 76 ++++ python/pylibcudf/pylibcudf/join.pyx | 405 ++++++++++++++++++ python/pylibcudf/pylibcudf/libcudf/join.pxd | 114 +++++ python/pylibcudf/pylibcudf/tests/test_join.py | 154 ++++++- 7 files changed, 771 insertions(+), 24 deletions(-) diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index a590eb27511..afefd04d4fa 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -573,7 +573,7 @@ class distinct_hash_join { * Result: {{1}, {0}} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -620,7 +620,7 @@ conditional_inner_join(table_view const& left, * Result: {{0, 1, 2}, {None, 0, None}} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -666,7 +666,7 @@ conditional_left_join(table_view const& left, * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -705,7 +705,7 @@ conditional_full_join(table_view const& left, * Result: {1} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -746,7 +746,7 @@ std::unique_ptr> conditional_left_semi_join( * Result: {0, 2} * @endcode * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -793,7 +793,7 @@ std::unique_ptr> conditional_left_anti_join( * Result: {{1}, {0}} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -855,7 +855,7 @@ mixed_inner_join( * Result: {{0, 1, 2}, {None, 0, None}} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. 
* @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -917,7 +917,7 @@ mixed_left_join( * Result: {{0, 1, 2, None, None}, {None, 0, None, 1, 2}} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -972,7 +972,7 @@ mixed_full_join( * Result: {1} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -1022,7 +1022,7 @@ std::unique_ptr> mixed_left_semi_join( * Result: {0, 2} * @endcode * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -1061,7 +1061,7 @@ std::unique_ptr> mixed_left_anti_join( * choose a suitable compare_nulls value AND use appropriate null-safe * operators in the expression. * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -1103,7 +1103,7 @@ std::pair>> mixed_in * choose a suitable compare_nulls value AND use appropriate null-safe * operators in the expression. * - * @throw cudf::logic_error If the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error If the binary predicate outputs a non-boolean result. * @throw cudf::logic_error If the number of rows in left_equality and left_conditional do not * match. * @throw cudf::logic_error If the number of rows in right_equality and right_conditional do not @@ -1142,7 +1142,7 @@ std::pair>> mixed_le * If the provided predicate returns NULL for a pair of rows * (left, right), that pair is not included in the output. * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -1167,7 +1167,7 @@ std::size_t conditional_inner_join_size( * If the provided predicate returns NULL for a pair of rows * (left, right), that pair is not included in the output. * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. 
* * @param left The left table * @param right The right table @@ -1192,7 +1192,7 @@ std::size_t conditional_left_join_size( * If the provided predicate returns NULL for a pair of rows * (left, right), that pair is not included in the output. * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table @@ -1217,7 +1217,7 @@ std::size_t conditional_left_semi_join_size( * If the provided predicate returns NULL for a pair of rows * (left, right), that pair is not included in the output. * - * @throw cudf::logic_error if the binary predicate outputs a non-boolean result. + * @throw cudf::data_type_error if the binary predicate outputs a non-boolean result. * * @param left The left table * @param right The right table diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu index 40d1c925889..781fda215fd 100644 --- a/cpp/src/join/conditional_join.cu +++ b/cpp/src/join/conditional_join.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -178,7 +179,8 @@ conditional_join(table_view const& left, auto const parser = ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr}; CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); + "The expression must produce a boolean output.", + cudf::data_type_error); auto left_table = table_device_view::create(left, stream); auto right_table = table_device_view::create(right, stream); @@ -330,7 +332,8 @@ std::size_t compute_conditional_join_output_size(table_view const& left, auto const parser = ast::detail::expression_parser{binary_predicate, left, right, has_nulls, stream, mr}; CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); + "The expression must produce a boolean output.", + cudf::data_type_error); auto left_table = table_device_view::create(left, stream); auto right_table = table_device_view::create(right, stream); diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index 820b81ee309..90b0d0a45ad 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -28,6 +28,7 @@ #include #include #include +#include #include #include @@ -115,7 +116,8 @@ mixed_join( auto const parser = ast::detail::expression_parser{ binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr}; CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); + "The expression must produce a boolean output.", + cudf::data_type_error); // TODO: The non-conditional join impls start with a dictionary matching, // figure out what that is and what it's needed for (and if conditional joins @@ -381,7 +383,8 @@ compute_mixed_join_output_size(table_view const& left_equality, auto const parser = ast::detail::expression_parser{ binary_predicate, left_conditional, right_conditional, has_nulls, stream, mr}; CUDF_EXPECTS(parser.output_type().id() == type_id::BOOL8, - "The expression must produce a boolean output."); + "The expression must produce a boolean output.", + cudf::data_type_error); // TODO: The non-conditional join impls start with a dictionary matching, // figure out what that is and what it's needed for (and if conditional joins diff --git a/python/pylibcudf/pylibcudf/join.pxd b/python/pylibcudf/pylibcudf/join.pxd index 
06969b4a2db..bb9162b466a 100644 --- a/python/pylibcudf/pylibcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/join.pxd @@ -3,6 +3,7 @@ from pylibcudf.libcudf.types cimport null_equality from .column cimport Column +from .expressions cimport Expression from .table cimport Table @@ -37,3 +38,78 @@ cpdef Column left_anti_join( ) cpdef Table cross_join(Table left, Table right) + +cpdef tuple conditional_inner_join( + Table left, + Table right, + Expression binary_predicate, +) + +cpdef tuple conditional_left_join( + Table left, + Table right, + Expression binary_predicate, +) + +cpdef tuple conditional_full_join( + Table left, + Table right, + Expression binary_predicate, +) + +cpdef Column conditional_left_semi_join( + Table left, + Table right, + Expression binary_predicate, +) + +cpdef Column conditional_left_anti_join( + Table left, + Table right, + Expression binary_predicate, +) + +cpdef tuple mixed_inner_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +) + +cpdef tuple mixed_left_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +) + +cpdef tuple mixed_full_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +) + +cpdef Column mixed_left_semi_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +) + +cpdef Column mixed_left_anti_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +) diff --git a/python/pylibcudf/pylibcudf/join.pyx b/python/pylibcudf/pylibcudf/join.pyx index bc72647ea8e..0d841eee194 100644 --- a/python/pylibcudf/pylibcudf/join.pyx +++ b/python/pylibcudf/pylibcudf/join.pyx @@ -12,6 +12,7 @@ from pylibcudf.libcudf.types cimport null_equality from rmm.librmm.device_buffer cimport device_buffer from .column cimport Column +from .expressions cimport Expression from .table cimport Table @@ -214,3 +215,407 @@ cpdef Table cross_join(Table left, Table right): with nogil: result = cpp_join.cross_join(left.view(), right.view()) return Table.from_libcudf(move(result)) + + +cpdef tuple conditional_inner_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional inner join between two tables. + + For details, see :cpp:func:`conditional_inner_join`. + + Parameters + ---------- + left : Table + The left table to join. + right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.conditional_inner_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef tuple conditional_left_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional left join between two tables. + + For details, see :cpp:func:`conditional_left_join`. + + Parameters + ---------- + left : Table + The left table to join. 
+ right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.conditional_left_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef tuple conditional_full_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional full join between two tables. + + For details, see :cpp:func:`conditional_full_join`. + + Parameters + ---------- + left : Table + The left table to join. + right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.conditional_full_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef Column conditional_left_semi_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional left semi join between two tables. + + For details, see :cpp:func:`conditional_left_semi_join`. + + Parameters + ---------- + left : Table + The left table to join. + right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + with nogil: + c_result = cpp_join.conditional_left_semi_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return _column_from_gather_map(move(c_result)) + + +cpdef Column conditional_left_anti_join( + Table left, + Table right, + Expression binary_predicate, +): + """Perform a conditional left anti join between two tables. + + For details, see :cpp:func:`conditional_left_anti_join`. + + Parameters + ---------- + left : Table + The left table to join. + right : Table + The right table to join. + binary_predicate : Expression + Condition to join on. + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + with nogil: + c_result = cpp_join.conditional_left_anti_join( + left.view(), right.view(), dereference(binary_predicate.c_obj.get()) + ) + return _column_from_gather_map(move(c_result)) + + +cpdef tuple mixed_inner_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed inner join between two tables. + + For details, see :cpp:func:`mixed_inner_join`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. 
+ nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.mixed_inner_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef tuple mixed_left_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed left join between two tables. + + For details, see :cpp:func:`mixed_left_join`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.mixed_left_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef tuple mixed_full_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed full join between two tables. + + For details, see :cpp:func:`mixed_full_join`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Tuple[Column, Column] + A tuple containing the row indices from the left and right tables after the + join. + """ + cdef cpp_join.gather_map_pair_type c_result + with nogil: + c_result = cpp_join.mixed_full_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return ( + _column_from_gather_map(move(c_result.first)), + _column_from_gather_map(move(c_result.second)), + ) + + +cpdef Column mixed_left_semi_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed left semi join between two tables. + + For details, see :cpp:func:`mixed_left_semi_join`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. 
+ right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + with nogil: + c_result = cpp_join.mixed_left_semi_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return _column_from_gather_map(move(c_result)) + + +cpdef Column mixed_left_anti_join( + Table left_keys, + Table right_keys, + Table left_conditional, + Table right_conditional, + Expression binary_predicate, + null_equality nulls_equal +): + """Perform a mixed left anti join between two tables. + + For details, see :cpp:func:`mixed_left_anti_join`. + + Parameters + ---------- + left_keys : Table + The left table to use for the equality join. + right_keys : Table + The right table to use for the equality join. + left_conditional : Table + The left table to use for the conditional join. + right_conditional : Table + The right table to use for the conditional join. + binary_predicate : Expression + Condition to join on. + nulls_equal : NullEquality + Should nulls compare equal in the equality join? + + Returns + ------- + Column + A column containing the row indices from the left table after the join. + """ + cdef cpp_join.gather_map_type c_result + with nogil: + c_result = cpp_join.mixed_left_anti_join( + left_keys.view(), + right_keys.view(), + left_conditional.view(), + right_conditional.view(), + dereference(binary_predicate.c_obj.get()), + nulls_equal, + ) + return _column_from_gather_map(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/libcudf/join.pxd b/python/pylibcudf/pylibcudf/libcudf/join.pxd index 21033a0284e..f8e592c2104 100644 --- a/python/pylibcudf/pylibcudf/libcudf/join.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/join.pxd @@ -1,10 +1,14 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
+from libc.stddef cimport size_t from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.optional cimport optional from libcpp.pair cimport pair from libcpp.vector cimport vector +from pylibcudf.exception_handler cimport libcudf_exception_handler from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.expressions cimport expression from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport null_equality, size_type @@ -74,3 +78,113 @@ cdef extern from "cudf/join.hpp" namespace "cudf" nogil: const table_view left, const table_view right, ) except + + + cdef gather_map_pair_type conditional_inner_join( + const table_view left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_inner_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_left_join( + const table_view left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_left_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_full_join( + const table_view left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_pair_type conditional_full_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_type conditional_left_semi_join( + const table_view left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_type conditional_left_semi_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_type conditional_left_anti_join( + const table_view left, + const table_view right, + const expression binary_predicate, + ) except +libcudf_exception_handler + + cdef gather_map_type conditional_left_anti_join( + const table_view left, + const table_view right, + const expression binary_predicate, + optional[size_t] output_size + ) except +libcudf_exception_handler + + cdef gather_map_pair_type mixed_inner_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler + + cdef gather_map_pair_type mixed_left_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler + + cdef gather_map_pair_type mixed_full_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler + + cdef gather_map_type 
mixed_left_semi_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler + + cdef gather_map_type mixed_left_anti_join( + const table_view left_equality, + const table_view right_equality, + const table_view left_conditional, + const table_view right_conditional, + const expression binary_predicate, + null_equality compare_nulls + ) except +libcudf_exception_handler diff --git a/python/pylibcudf/pylibcudf/tests/test_join.py b/python/pylibcudf/pylibcudf/tests/test_join.py index f43a56046a4..56cf421780b 100644 --- a/python/pylibcudf/pylibcudf/tests/test_join.py +++ b/python/pylibcudf/pylibcudf/tests/test_join.py @@ -2,17 +2,45 @@ import numpy as np import pyarrow as pa +import pytest from utils import assert_table_eq import pylibcudf as plc -def test_cross_join(): - left = pa.Table.from_arrays([[0, 1, 2], [3, 4, 5]], names=["a", "b"]) - right = pa.Table.from_arrays( - [[6, 7, 8, 9], [10, 11, 12, 13]], names=["c", "d"] +@pytest.fixture +def left(): + return pa.Table.from_arrays( + [[0, 1, 2, 100], [3, 4, 5, None]], + schema=pa.schema({"a": pa.int32(), "b": pa.int32()}), ) + +@pytest.fixture +def right(): + return pa.Table.from_arrays( + [[-1, -2, 0, 1, -3], [10, 3, 4, 5, None]], + schema=pa.schema({"c": pa.int32(), "d": pa.int32()}), + ) + + +@pytest.fixture +def expr(): + return plc.expressions.Operation( + plc.expressions.ASTOperator.LESS, + plc.expressions.ColumnReference( + 0, plc.expressions.TableReference.LEFT + ), + plc.expressions.ColumnReference( + 0, plc.expressions.TableReference.RIGHT + ), + ) + + +def test_cross_join(left, right): + # Remove the nulls so the calculation of the expected result works + left = left[:-1] + right = right[:-1] pleft = plc.interop.from_arrow(left) pright = plc.interop.from_arrow(right) @@ -27,3 +55,121 @@ def test_cross_join(): got = plc.join.cross_join(pleft, pright) assert_table_eq(expect, got) + + +sentinel = np.iinfo(np.int32).min + + +@pytest.mark.parametrize( + "join_type,expect_left,expect_right", + [ + (plc.join.conditional_inner_join, {0}, {3}), + (plc.join.conditional_left_join, {0, 1, 2, 3}, {3, sentinel}), + ( + plc.join.conditional_full_join, + {0, 1, 2, 3, sentinel}, + {0, 1, 2, 3, 4, sentinel}, + ), + ], + ids=["inner", "left", "full"], +) +def test_conditional_join( + left, right, expr, join_type, expect_left, expect_right +): + pleft = plc.interop.from_arrow(left) + pright = plc.interop.from_arrow(right) + + g_left, g_right = map(plc.interop.to_arrow, join_type(pleft, pright, expr)) + + assert set(g_left.to_pylist()) == expect_left + assert set(g_right.to_pylist()) == expect_right + + +@pytest.mark.parametrize( + "join_type,expect", + [ + (plc.join.conditional_left_semi_join, {0}), + (plc.join.conditional_left_anti_join, {1, 2, 3}), + ], + ids=["semi", "anti"], +) +def test_conditional_semianti_join(left, right, expr, join_type, expect): + pleft = plc.interop.from_arrow(left) + pright = plc.interop.from_arrow(right) + + g_left = plc.interop.to_arrow(join_type(pleft, pright, expr)) + + assert set(g_left.to_pylist()) == expect + + +@pytest.mark.parametrize( + "join_type,expect_left,expect_right", + [ + (plc.join.mixed_inner_join, set(), set()), + (plc.join.mixed_left_join, {0, 1, 2, 3}, {sentinel}), + ( + plc.join.mixed_full_join, + {0, 1, 2, 3, sentinel}, + {0, 1, 2, 3, 4, sentinel}, + ), + ], + ids=["inner", "left", "full"], +) 
+@pytest.mark.parametrize( + "null_equality", + [plc.types.NullEquality.EQUAL, plc.types.NullEquality.UNEQUAL], + ids=["nulls_equal", "nulls_not_equal"], +) +def test_mixed_join( + left, right, expr, join_type, expect_left, expect_right, null_equality +): + pleft = plc.interop.from_arrow(left) + pright = plc.interop.from_arrow(right) + + g_left, g_right = map( + plc.interop.to_arrow, + join_type( + plc.Table(pleft.columns()[1:]), + plc.Table(pright.columns()[1:]), + pleft, + pright, + expr, + null_equality, + ), + ) + + assert set(g_left.to_pylist()) == expect_left + assert set(g_right.to_pylist()) == expect_right + + +@pytest.mark.parametrize( + "join_type,expect", + [ + (plc.join.mixed_left_semi_join, set()), + (plc.join.mixed_left_anti_join, {0, 1, 2, 3}), + ], + ids=["semi", "anti"], +) +@pytest.mark.parametrize( + "null_equality", + [plc.types.NullEquality.EQUAL, plc.types.NullEquality.UNEQUAL], + ids=["nulls_equal", "nulls_not_equal"], +) +def test_mixed_semianti_join( + left, right, expr, join_type, expect, null_equality +): + pleft = plc.interop.from_arrow(left) + pright = plc.interop.from_arrow(right) + + g_left = plc.interop.to_arrow( + join_type( + plc.Table(pleft.columns()[1:]), + plc.Table(pright.columns()[1:]), + pleft, + pright, + expr, + null_equality, + ) + ) + + assert set(g_left.to_pylist()) == expect From a2001dd5c93177fbebd37e85de5d83f152869eb9 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 4 Nov 2024 10:06:15 -0800 Subject: [PATCH 02/12] Use more pylibcudf.io.types enums in cudf._libs (#17237) If we consider the `pylibcudf.libcudf` namespace to eventually be more "private", this PR replaces that usage, specifically when accessing enums, with their public counterparts Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/17237 --- python/cudf/cudf/_lib/csv.pyx | 12 +-- python/cudf/cudf/_lib/json.pyx | 38 +++---- python/cudf/cudf/_lib/orc.pyx | 43 ++++---- python/cudf/cudf/_lib/parquet.pyx | 138 +++++++++++------------- python/pylibcudf/pylibcudf/io/types.pyx | 8 +- 5 files changed, 111 insertions(+), 128 deletions(-) diff --git a/python/cudf/cudf/_lib/csv.pyx b/python/cudf/cudf/_lib/csv.pyx index 9ad96f610b3..c09e06bfc59 100644 --- a/python/cudf/cudf/_lib/csv.pyx +++ b/python/cudf/cudf/_lib/csv.pyx @@ -28,7 +28,7 @@ from pylibcudf.libcudf.io.csv cimport ( write_csv as cpp_write_csv, ) from pylibcudf.libcudf.io.data_sink cimport data_sink -from pylibcudf.libcudf.io.types cimport compression_type, sink_info +from pylibcudf.libcudf.io.types cimport sink_info from pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.io.utils cimport make_sink_info @@ -148,13 +148,13 @@ def read_csv( byte_range = (0, 0) if compression is None: - c_compression = compression_type.NONE + c_compression = plc.io.types.CompressionType.NONE else: compression_map = { - "infer": compression_type.AUTO, - "gzip": compression_type.GZIP, - "bz2": compression_type.BZIP2, - "zip": compression_type.ZIP, + "infer": plc.io.types.CompressionType.AUTO, + "gzip": plc.io.types.CompressionType.GZIP, + "bz2": plc.io.types.CompressionType.BZIP2, + "zip": plc.io.types.CompressionType.ZIP, } c_compression = compression_map[compression] diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 9bbbcf60dcf..fb149603960 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -9,10 +9,6 @@ 
from cudf.core.buffer import acquire_spill_lock from libcpp cimport bool -cimport pylibcudf.libcudf.io.types as cudf_io_types -from pylibcudf.io.types cimport compression_type -from pylibcudf.libcudf.io.json cimport json_recovery_mode_t -from pylibcudf.libcudf.io.types cimport compression_type from pylibcudf.libcudf.types cimport data_type, type_id from pylibcudf.types cimport DataType @@ -24,15 +20,6 @@ from cudf._lib.utils cimport _data_from_columns, data_from_pylibcudf_io import pylibcudf as plc -cdef json_recovery_mode_t _get_json_recovery_mode(object on_bad_lines): - if on_bad_lines.lower() == "error": - return json_recovery_mode_t.FAIL - elif on_bad_lines.lower() == "recover": - return json_recovery_mode_t.RECOVER_WITH_NULL - else: - raise TypeError(f"Invalid parameter for {on_bad_lines=}") - - cpdef read_json(object filepaths_or_buffers, object dtype, bool lines, @@ -41,7 +28,7 @@ cpdef read_json(object filepaths_or_buffers, bool keep_quotes, bool mixed_types_as_string, bool prune_columns, - object on_bad_lines): + str on_bad_lines): """ Cython function to call into libcudf API, see `read_json`. @@ -64,19 +51,24 @@ cpdef read_json(object filepaths_or_buffers, filepaths_or_buffers[idx] = filepaths_or_buffers[idx].encode() # Setup arguments - cdef cudf_io_types.compression_type c_compression - if compression is not None: if compression == 'gzip': - c_compression = cudf_io_types.compression_type.GZIP + c_compression = plc.io.types.CompressionType.GZIP elif compression == 'bz2': - c_compression = cudf_io_types.compression_type.BZIP2 + c_compression = plc.io.types.CompressionType.BZIP2 elif compression == 'zip': - c_compression = cudf_io_types.compression_type.ZIP + c_compression = plc.io.types.CompressionType.ZIP else: - c_compression = cudf_io_types.compression_type.AUTO + c_compression = plc.io.types.CompressionType.AUTO + else: + c_compression = plc.io.types.CompressionType.NONE + + if on_bad_lines.lower() == "error": + c_on_bad_lines = plc.io.types.JSONRecoveryMode.FAIL + elif on_bad_lines.lower() == "recover": + c_on_bad_lines = plc.io.types.JSONRecoveryMode.RECOVER_WITH_NULL else: - c_compression = cudf_io_types.compression_type.NONE + raise TypeError(f"Invalid parameter for {on_bad_lines=}") processed_dtypes = None @@ -108,7 +100,7 @@ cpdef read_json(object filepaths_or_buffers, keep_quotes = keep_quotes, mixed_types_as_string = mixed_types_as_string, prune_columns = prune_columns, - recovery_mode = _get_json_recovery_mode(on_bad_lines) + recovery_mode = c_on_bad_lines ) df = cudf.DataFrame._from_data( *_data_from_columns( @@ -130,7 +122,7 @@ cpdef read_json(object filepaths_or_buffers, keep_quotes = keep_quotes, mixed_types_as_string = mixed_types_as_string, prune_columns = prune_columns, - recovery_mode = _get_json_recovery_mode(on_bad_lines) + recovery_mode = c_on_bad_lines ) df = cudf.DataFrame._from_data( diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index f88c48ce989..32a5e463916 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -15,7 +15,6 @@ try: except ImportError: import json -cimport pylibcudf.libcudf.io.types as cudf_io_types cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.orc cimport ( @@ -26,7 +25,6 @@ from pylibcudf.libcudf.io.orc cimport ( ) from pylibcudf.libcudf.io.types cimport ( column_in_metadata, - compression_type, sink_info, table_input_metadata, ) @@ -137,22 +135,23 @@ cpdef read_orc(object 
filepaths_or_buffers, return data, index -cdef compression_type _get_comp_type(object compression): +def _get_comp_type(object compression): if compression is None or compression is False: - return compression_type.NONE + return plc.io.types.CompressionType.NONE compression = str(compression).upper() if compression == "SNAPPY": - return compression_type.SNAPPY + return plc.io.types.CompressionType.SNAPPY elif compression == "ZLIB": - return compression_type.ZLIB + return plc.io.types.CompressionType.ZLIB elif compression == "ZSTD": - return compression_type.ZSTD + return plc.io.types.CompressionType.ZSTD elif compression == "LZ4": - return compression_type.LZ4 + return plc.io.types.CompressionType.LZ4 else: raise ValueError(f"Unsupported `compression` type {compression}") + cdef tuple _get_index_from_metadata( vector[map[string, string]] user_data, object names, @@ -210,7 +209,8 @@ cdef tuple _get_index_from_metadata( range_idx ) -cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics): + +def _get_orc_stat_freq(str statistics): """ Convert ORC statistics terms to CUDF convention: - ORC "STRIPE" == CUDF "ROWGROUP" @@ -218,11 +218,11 @@ cdef cudf_io_types.statistics_freq _get_orc_stat_freq(object statistics): """ statistics = str(statistics).upper() if statistics == "NONE": - return cudf_io_types.statistics_freq.STATISTICS_NONE + return plc.io.types.StatisticsFreq.STATISTICS_NONE elif statistics == "STRIPE": - return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP + return plc.io.types.StatisticsFreq.STATISTICS_ROWGROUP elif statistics == "ROWGROUP": - return cudf_io_types.statistics_freq.STATISTICS_PAGE + return plc.io.types.StatisticsFreq.STATISTICS_PAGE else: raise ValueError(f"Unsupported `statistics_freq` type {statistics}") @@ -232,7 +232,7 @@ def write_orc( table, object path_or_buf, object compression="snappy", - object statistics="ROWGROUP", + str statistics="ROWGROUP", object stripe_size_bytes=None, object stripe_size_rows=None, object row_index_stride=None, @@ -246,7 +246,6 @@ def write_orc( -------- cudf.read_orc """ - cdef compression_type compression_ = _get_comp_type(compression) cdef unique_ptr[data_sink] data_sink_c cdef sink_info sink_info_c = make_sink_info(path_or_buf, data_sink_c) cdef table_input_metadata tbl_meta @@ -289,7 +288,7 @@ def write_orc( sink_info_c, tv ).metadata(tbl_meta) .key_value_metadata(move(user_data)) - .compression(compression_) + .compression(_get_comp_type(compression)) .enable_statistics(_get_orc_stat_freq(statistics)) .build() ) @@ -330,8 +329,8 @@ cdef class ORCWriter: cdef unique_ptr[orc_chunked_writer] writer cdef sink_info sink cdef unique_ptr[data_sink] _data_sink - cdef cudf_io_types.statistics_freq stat_freq - cdef compression_type comp_type + cdef str statistics + cdef object compression cdef object index cdef table_input_metadata tbl_meta cdef object cols_as_map_type @@ -343,15 +342,15 @@ cdef class ORCWriter: object path, object index=None, object compression="snappy", - object statistics="ROWGROUP", + str statistics="ROWGROUP", object cols_as_map_type=None, object stripe_size_bytes=None, object stripe_size_rows=None, object row_index_stride=None): self.sink = make_sink_info(path, self._data_sink) - self.stat_freq = _get_orc_stat_freq(statistics) - self.comp_type = _get_comp_type(compression) + self.statistics = statistics + self.compression = compression self.index = index self.cols_as_map_type = cols_as_map_type \ if cols_as_map_type is None else set(cols_as_map_type) @@ -429,8 +428,8 @@ cdef class ORCWriter: 
chunked_orc_writer_options.builder(self.sink) .metadata(self.tbl_meta) .key_value_metadata(move(user_data)) - .compression(self.comp_type) - .enable_statistics(self.stat_freq) + .compression(_get_comp_type(self.compression)) + .enable_statistics(_get_orc_stat_freq(self.statistics)) .build() ) if self.stripe_size_bytes is not None: diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index fa2690c7f21..1212637d330 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -31,10 +31,9 @@ from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move from libcpp.vector cimport vector -cimport pylibcudf.libcudf.io.data_sink as cudf_io_data_sink -cimport pylibcudf.libcudf.io.types as cudf_io_types from pylibcudf.expressions cimport Expression from pylibcudf.io.parquet cimport ChunkedParquetReader +from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.parquet cimport ( chunked_parquet_writer_options, merge_row_group_metadata as parquet_merge_metadata, @@ -47,8 +46,14 @@ from pylibcudf.libcudf.io.parquet_metadata cimport ( read_parquet_metadata as parquet_metadata_reader, ) from pylibcudf.libcudf.io.types cimport ( + source_info, + sink_info, column_in_metadata, table_input_metadata, + partition_info, + statistics_freq, + compression_type, + dictionary_policy, ) from pylibcudf.libcudf.table.table_view cimport table_view from pylibcudf.libcudf.types cimport size_type @@ -377,7 +382,7 @@ cpdef read_parquet_metadata(filepaths_or_buffers): cudf.io.parquet.read_parquet cudf.io.parquet.to_parquet """ - cdef cudf_io_types.source_info source = make_source_info(filepaths_or_buffers) + cdef source_info source = make_source_info(filepaths_or_buffers) args = move(source) @@ -466,8 +471,8 @@ def write_parquet( cdef vector[map[string, string]] user_data cdef table_view tv - cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] _data_sinks - cdef cudf_io_types.sink_info sink = make_sinks_info( + cdef vector[unique_ptr[data_sink]] _data_sinks + cdef sink_info sink = make_sinks_info( filepaths_or_buffers, _data_sinks ) @@ -531,19 +536,19 @@ def write_parquet( "Valid values are '1.0' and '2.0'" ) - cdef cudf_io_types.dictionary_policy dict_policy = ( - cudf_io_types.dictionary_policy.ADAPTIVE + dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE if use_dictionary - else cudf_io_types.dictionary_policy.NEVER + else plc.io.types.DictionaryPolicy.NEVER ) - cdef cudf_io_types.compression_type comp_type = _get_comp_type(compression) - cdef cudf_io_types.statistics_freq stat_freq = _get_stat_freq(statistics) + comp_type = _get_comp_type(compression) + stat_freq = _get_stat_freq(statistics) cdef unique_ptr[vector[uint8_t]] out_metadata_c cdef vector[string] c_column_chunks_file_paths cdef bool _int96_timestamps = int96_timestamps - cdef vector[cudf_io_types.partition_info] partitions + cdef vector[partition_info] partitions # Perform write cdef parquet_writer_options args = move( @@ -563,7 +568,7 @@ def write_parquet( partitions.reserve(len(partitions_info)) for part in partitions_info: partitions.push_back( - cudf_io_types.partition_info(part[0], part[1]) + partition_info(part[0], part[1]) ) args.set_partitions(move(partitions)) if metadata_file_path is not None: @@ -646,17 +651,17 @@ cdef class ParquetWriter: cdef bool initialized cdef unique_ptr[cpp_parquet_chunked_writer] writer cdef table_input_metadata tbl_meta - cdef cudf_io_types.sink_info sink - cdef vector[unique_ptr[cudf_io_data_sink.data_sink]] 
_data_sink - cdef cudf_io_types.statistics_freq stat_freq - cdef cudf_io_types.compression_type comp_type + cdef sink_info sink + cdef vector[unique_ptr[data_sink]] _data_sink + cdef str statistics + cdef object compression cdef object index cdef size_t row_group_size_bytes cdef size_type row_group_size_rows cdef size_t max_page_size_bytes cdef size_type max_page_size_rows cdef size_t max_dictionary_size - cdef cudf_io_types.dictionary_policy dict_policy + cdef bool use_dictionary cdef bool write_arrow_schema def __cinit__(self, object filepath_or_buffer, object index=None, @@ -674,8 +679,8 @@ cdef class ParquetWriter: else [filepath_or_buffer] ) self.sink = make_sinks_info(filepaths_or_buffers, self._data_sink) - self.stat_freq = _get_stat_freq(statistics) - self.comp_type = _get_comp_type(compression) + self.statistics = statistics + self.compression = compression self.index = index self.initialized = False self.row_group_size_bytes = row_group_size_bytes @@ -683,11 +688,7 @@ cdef class ParquetWriter: self.max_page_size_bytes = max_page_size_bytes self.max_page_size_rows = max_page_size_rows self.max_dictionary_size = max_dictionary_size - self.dict_policy = ( - cudf_io_types.dictionary_policy.ADAPTIVE - if use_dictionary - else cudf_io_types.dictionary_policy.NEVER - ) + self.use_dictionary = use_dictionary self.write_arrow_schema = store_schema def write_table(self, table, object partitions_info=None): @@ -706,11 +707,11 @@ cdef class ParquetWriter: else: tv = table_view_from_table(table, ignore_index=True) - cdef vector[cudf_io_types.partition_info] partitions + cdef vector[partition_info] partitions if partitions_info is not None: for part in partitions_info: partitions.push_back( - cudf_io_types.partition_info(part[0], part[1]) + partition_info(part[0], part[1]) ) with nogil: @@ -795,13 +796,20 @@ cdef class ParquetWriter: user_data = vector[map[string, string]](num_partitions, tmp_user_data) cdef chunked_parquet_writer_options args + cdef compression_type comp_type = _get_comp_type(self.compression) + cdef statistics_freq stat_freq = _get_stat_freq(self.statistics) + cdef dictionary_policy dict_policy = ( + plc.io.types.DictionaryPolicy.ADAPTIVE + if self.use_dictionary + else plc.io.types.DictionaryPolicy.NEVER + ) with nogil: args = move( chunked_parquet_writer_options.builder(self.sink) .metadata(self.tbl_meta) .key_value_metadata(move(user_data)) - .compression(self.comp_type) - .stats_level(self.stat_freq) + .compression(comp_type) + .stats_level(stat_freq) .row_group_size_bytes(self.row_group_size_bytes) .row_group_size_rows(self.row_group_size_rows) .max_page_size_bytes(self.max_page_size_bytes) @@ -810,7 +818,7 @@ cdef class ParquetWriter: .write_arrow_schema(self.write_arrow_schema) .build() ) - args.set_dictionary_policy(self.dict_policy) + args.set_dictionary_policy(dict_policy) self.writer.reset(new cpp_parquet_chunked_writer(args)) self.initialized = True @@ -838,56 +846,28 @@ cpdef merge_filemetadata(object filemetadata_list): return np.asarray(out_metadata_py) -cdef cudf_io_types.statistics_freq _get_stat_freq(object statistics): - statistics = str(statistics).upper() - if statistics == "NONE": - return cudf_io_types.statistics_freq.STATISTICS_NONE - elif statistics == "ROWGROUP": - return cudf_io_types.statistics_freq.STATISTICS_ROWGROUP - elif statistics == "PAGE": - return cudf_io_types.statistics_freq.STATISTICS_PAGE - elif statistics == "COLUMN": - return cudf_io_types.statistics_freq.STATISTICS_COLUMN - else: +cdef statistics_freq _get_stat_freq(str 
statistics): + result = getattr( + plc.io.types.StatisticsFreq, + f"STATISTICS_{statistics.upper()}", + None + ) + if result is None: raise ValueError("Unsupported `statistics_freq` type") + return result -cdef cudf_io_types.compression_type _get_comp_type(object compression): +cdef compression_type _get_comp_type(object compression): if compression is None: - return cudf_io_types.compression_type.NONE - - compression = str(compression).upper() - if compression == "SNAPPY": - return cudf_io_types.compression_type.SNAPPY - elif compression == "ZSTD": - return cudf_io_types.compression_type.ZSTD - elif compression == "LZ4": - return cudf_io_types.compression_type.LZ4 - else: + return plc.io.types.CompressionType.NONE + result = getattr( + plc.io.types.CompressionType, + str(compression).upper(), + None + ) + if result is None: raise ValueError("Unsupported `compression` type") - - -cdef cudf_io_types.column_encoding _get_encoding_type(object encoding): - if encoding is None: - return cudf_io_types.column_encoding.USE_DEFAULT - - enc = str(encoding).upper() - if enc == "PLAIN": - return cudf_io_types.column_encoding.PLAIN - elif enc == "DICTIONARY": - return cudf_io_types.column_encoding.DICTIONARY - elif enc == "DELTA_BINARY_PACKED": - return cudf_io_types.column_encoding.DELTA_BINARY_PACKED - elif enc == "DELTA_LENGTH_BYTE_ARRAY": - return cudf_io_types.column_encoding.DELTA_LENGTH_BYTE_ARRAY - elif enc == "DELTA_BYTE_ARRAY": - return cudf_io_types.column_encoding.DELTA_BYTE_ARRAY - elif enc == "BYTE_STREAM_SPLIT": - return cudf_io_types.column_encoding.BYTE_STREAM_SPLIT - elif enc == "USE_DEFAULT": - return cudf_io_types.column_encoding.USE_DEFAULT - else: - raise ValueError("Unsupported `column_encoding` type") + return result cdef _set_col_metadata( @@ -914,7 +894,15 @@ cdef _set_col_metadata( col_meta.set_skip_compression(True) if column_encoding is not None and full_path in column_encoding: - col_meta.set_encoding(_get_encoding_type(column_encoding[full_path])) + encoding = column_encoding[full_path] + if encoding is None: + c_encoding = plc.io.types.ColumnEncoding.USE_DEFAULT + else: + enc = str(encoding).upper() + c_encoding = getattr(plc.io.types.ColumnEncoding, enc, None) + if c_encoding is None: + raise ValueError("Unsupported `column_encoding` type") + col_meta.set_encoding(c_encoding) if column_type_length is not None and full_path in column_type_length: col_meta.set_output_as_binary(True) diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 563a02761da..967d05e7057 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -23,8 +23,12 @@ import os from pylibcudf.libcudf.io.json import \ json_recovery_mode_t as JSONRecoveryMode # no-cython-lint -from pylibcudf.libcudf.io.types import \ - compression_type as CompressionType # no-cython-lint +from pylibcudf.libcudf.io.types import ( + compression_type as CompressionType, # no-cython-lint + column_encoding as ColumnEncoding, # no-cython-lint + dictionary_policy as DictionaryPolicy, # no-cython-lint + statistics_freq as StatisticsFreq, # no-cython-lint +) cdef class TableWithMetadata: From 1d25d14b718541145b45cf25c80b55321a9e2c32 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Mon, 4 Nov 2024 14:16:05 -0600 Subject: [PATCH 03/12] Fix discoverability of submodules inside `pd.util` (#17215) Fixes: #17166 This PR fixes the discoverability of the submodules of attributes and modules inside `pd.util`. 
Somehow `importlib.import_module("pandas.util").__dict__` doesn't display submodules and only root level attributes. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Murray (https://github.com/Matt711) URL: https://github.com/rapidsai/cudf/pull/17215 --- python/cudf/cudf/pandas/_wrappers/pandas.py | 28 ++++++++++++++----- .../cudf_pandas_tests/test_cudf_pandas.py | 18 ++++++++++++ 2 files changed, 39 insertions(+), 7 deletions(-) diff --git a/python/cudf/cudf/pandas/_wrappers/pandas.py b/python/cudf/cudf/pandas/_wrappers/pandas.py index 6d03063fa27..05e7d159c63 100644 --- a/python/cudf/cudf/pandas/_wrappers/pandas.py +++ b/python/cudf/cudf/pandas/_wrappers/pandas.py @@ -75,13 +75,27 @@ def _pandas_util_dir(): # In pandas 2.0, pandas.util contains public APIs under # __getattr__ but no __dir__ to find them # https://github.com/pandas-dev/pandas/blob/2.2.x/pandas/util/__init__.py - return list(importlib.import_module("pandas.util").__dict__.keys()) + [ - "hash_array", - "hash_pandas_object", - "Appender", - "Substitution", - "cache_readonly", - ] + res = list( + set( + list(importlib.import_module("pandas.util").__dict__.keys()) + + [ + "Appender", + "Substitution", + "_exceptions", + "_print_versions", + "cache_readonly", + "hash_array", + "hash_pandas_object", + "version", + "_tester", + "_validators", + "_decorators", + ] + ) + ) + if cudf.core._compat.PANDAS_GE_220: + res.append("capitalize_first_letter") + return res pd.util.__dir__ = _pandas_util_dir diff --git a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py index 7aefdc386bb..3e7d1cf3c4c 100644 --- a/python/cudf/cudf_pandas_tests/test_cudf_pandas.py +++ b/python/cudf/cudf_pandas_tests/test_cudf_pandas.py @@ -1759,3 +1759,21 @@ def test_fallback_raises_error(monkeypatch): monkeycontext.setenv("CUDF_PANDAS_FAIL_ON_FALLBACK", "True") with pytest.raises(ProxyFallbackError): pd.Series(range(2)).astype(object) + + +@pytest.mark.parametrize( + "attrs", + [ + "_exceptions", + "version", + "_print_versions", + "capitalize_first_letter", + "_validators", + "_decorators", + ], +) +def test_cudf_pandas_util_version(attrs): + if not PANDAS_GE_220 and attrs == "capitalize_first_letter": + assert not hasattr(pd.util, attrs) + else: + assert hasattr(pd.util, attrs) From 45563b363d62b0f27f3d371e880142748a62eec5 Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Mon, 4 Nov 2024 15:06:35 -0600 Subject: [PATCH 04/12] Refactor Dask cuDF legacy code (#17205) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The "legacy" DataFrame API is now deprecated (https://github.com/dask/dask/pull/11437). The main purpose of this PR is to start isolating legacy code in Dask cuDF. **Old layout**: ``` dask_cudf/ ├── expr/ │ ├── _collection.py │ ├── _expr.py │ ├── _groupby.py ├── io/ │ ├── tests/ │ ├── ... │ ├── parquet.py │ ├── ... ├── tests/ ├── accessors.py ├── backends.py ├── core.py ├── groupby.py ├── sorting.py ``` **New layout**: ``` dask_cudf/ ├── _expr/ │ ├── accessors.py │ ├── collection.py │ ├── expr.py │ ├── groupby.py ├── _legacy/ │ ├── io/ │ ├── core.py │ ├── groupby.py │ ├── sorting.py ├── io/ │ ├── tests/ │ ├── ... │ ├── parquet.py │ ├── ... ├── tests/ ├── backends.py ├── core.py ``` **Notes** - This PR adds some backward compatibility to the expr-based API that was previously missing: The user can now import collection classes from `dask_cudf.core` (previously led to a "silent" bug when query-planning was enabled). 
- The user can also import various IO functions from `dask_cudf.io` (and sub-modules like `dask_cudf.io.parquet`), but they will typically get a deprecation warning. - This PR is still technically "breaking" in the sense that the user can no longer import *some* functions/classes from `dask_cudf.io.*`. Also, the `groupby`, `sorting`, and `accessors` modules have simply moved. It *should* be uncommon for down-stream code to import from these modules. It's also worth noting that query-planning was already causing problems for these users if they *were* doing this. Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) Approvers: - Mads R. B. Kristensen (https://github.com/madsbk) URL: https://github.com/rapidsai/cudf/pull/17205 --- python/dask_cudf/dask_cudf/__init__.py | 51 +- python/dask_cudf/dask_cudf/_expr/__init__.py | 1 + .../dask_cudf/{ => _expr}/accessors.py | 0 .../_collection.py => _expr/collection.py} | 33 +- python/dask_cudf/dask_cudf/_expr/expr.py | 210 +++++ python/dask_cudf/dask_cudf/_expr/groupby.py | 335 ++++++++ .../dask_cudf/dask_cudf/_legacy/__init__.py | 1 + python/dask_cudf/dask_cudf/_legacy/core.py | 711 ++++++++++++++++ .../dask_cudf/{ => _legacy}/groupby.py | 2 +- .../dask_cudf/_legacy/io/__init__.py | 11 + python/dask_cudf/dask_cudf/_legacy/io/csv.py | 222 +++++ python/dask_cudf/dask_cudf/_legacy/io/json.py | 209 +++++ python/dask_cudf/dask_cudf/_legacy/io/orc.py | 199 +++++ .../dask_cudf/dask_cudf/_legacy/io/parquet.py | 513 ++++++++++++ python/dask_cudf/dask_cudf/_legacy/io/text.py | 54 ++ .../dask_cudf/{ => _legacy}/sorting.py | 0 python/dask_cudf/dask_cudf/backends.py | 29 +- python/dask_cudf/dask_cudf/core.py | 760 +----------------- python/dask_cudf/dask_cudf/expr/__init__.py | 25 - python/dask_cudf/dask_cudf/expr/_expr.py | 511 ------------ python/dask_cudf/dask_cudf/expr/_groupby.py | 123 --- python/dask_cudf/dask_cudf/io/__init__.py | 39 +- python/dask_cudf/dask_cudf/io/csv.py | 226 +----- python/dask_cudf/dask_cudf/io/json.py | 213 +---- python/dask_cudf/dask_cudf/io/orc.py | 212 +---- python/dask_cudf/dask_cudf/io/parquet.py | 594 +++----------- .../dask_cudf/dask_cudf/io/tests/test_csv.py | 15 + .../dask_cudf/dask_cudf/io/tests/test_json.py | 15 + .../dask_cudf/dask_cudf/io/tests/test_orc.py | 18 + .../dask_cudf/io/tests/test_parquet.py | 39 +- .../dask_cudf/dask_cudf/io/tests/test_text.py | 12 + python/dask_cudf/dask_cudf/io/text.py | 58 +- python/dask_cudf/dask_cudf/tests/test_core.py | 24 - .../dask_cudf/dask_cudf/tests/test_groupby.py | 2 +- python/dask_cudf/dask_cudf/tests/utils.py | 2 +- 35 files changed, 2795 insertions(+), 2674 deletions(-) create mode 100644 python/dask_cudf/dask_cudf/_expr/__init__.py rename python/dask_cudf/dask_cudf/{ => _expr}/accessors.py (100%) rename python/dask_cudf/dask_cudf/{expr/_collection.py => _expr/collection.py} (88%) create mode 100644 python/dask_cudf/dask_cudf/_expr/expr.py create mode 100644 python/dask_cudf/dask_cudf/_expr/groupby.py create mode 100644 python/dask_cudf/dask_cudf/_legacy/__init__.py create mode 100644 python/dask_cudf/dask_cudf/_legacy/core.py rename python/dask_cudf/dask_cudf/{ => _legacy}/groupby.py (99%) create mode 100644 python/dask_cudf/dask_cudf/_legacy/io/__init__.py create mode 100644 python/dask_cudf/dask_cudf/_legacy/io/csv.py create mode 100644 python/dask_cudf/dask_cudf/_legacy/io/json.py create mode 100644 python/dask_cudf/dask_cudf/_legacy/io/orc.py create mode 100644 python/dask_cudf/dask_cudf/_legacy/io/parquet.py create mode 100644 
python/dask_cudf/dask_cudf/_legacy/io/text.py rename python/dask_cudf/dask_cudf/{ => _legacy}/sorting.py (100%) delete mode 100644 python/dask_cudf/dask_cudf/expr/__init__.py delete mode 100644 python/dask_cudf/dask_cudf/expr/_expr.py delete mode 100644 python/dask_cudf/dask_cudf/expr/_groupby.py diff --git a/python/dask_cudf/dask_cudf/__init__.py b/python/dask_cudf/dask_cudf/__init__.py index f9df22cc436..cc17e71039a 100644 --- a/python/dask_cudf/dask_cudf/__init__.py +++ b/python/dask_cudf/dask_cudf/__init__.py @@ -1,21 +1,19 @@ # Copyright (c) 2018-2024, NVIDIA CORPORATION. -from dask import config - -# For dask>2024.2.0, we can silence the loud deprecation -# warning before importing `dask.dataframe` (this won't -# do anything for dask==2024.2.0) -config.set({"dataframe.query-planning-warning": False}) +import warnings +from importlib import import_module -import dask.dataframe as dd # noqa: E402 +from dask import config +import dask.dataframe as dd from dask.dataframe import from_delayed # noqa: E402 import cudf # noqa: E402 from . import backends # noqa: E402, F401 from ._version import __git_commit__, __version__ # noqa: E402, F401 -from .core import concat, from_cudf, from_dask_dataframe # noqa: E402 -from .expr import QUERY_PLANNING_ON # noqa: E402 +from .core import concat, from_cudf, DataFrame, Index, Series # noqa: F401 + +QUERY_PLANNING_ON = dd.DASK_EXPR_ENABLED def read_csv(*args, **kwargs): @@ -38,26 +36,44 @@ def read_parquet(*args, **kwargs): return dd.read_parquet(*args, **kwargs) -def raise_not_implemented_error(attr_name): +def _deprecated_api(old_api, new_api=None, rec=None): def inner_func(*args, **kwargs): + if new_api: + # Use alternative + msg = f"{old_api} is now deprecated. " + msg += rec or f"Please use {new_api} instead." + warnings.warn(msg, FutureWarning) + new_attr = new_api.split(".") + module = import_module(".".join(new_attr[:-1])) + return getattr(module, new_attr[-1])(*args, **kwargs) + + # No alternative - raise an error raise NotImplementedError( - f"Top-level {attr_name} API is not available for dask-expr." + f"{old_api} is no longer supported. " + (rec or "") ) return inner_func if QUERY_PLANNING_ON: - from .expr._collection import DataFrame, Index, Series + from ._expr.expr import _patch_dask_expr + from . import io # noqa: F401 - groupby_agg = raise_not_implemented_error("groupby_agg") + groupby_agg = _deprecated_api("dask_cudf.groupby_agg") read_text = DataFrame.read_text - to_orc = raise_not_implemented_error("to_orc") + _patch_dask_expr() else: - from .core import DataFrame, Index, Series # noqa: F401 - from .groupby import groupby_agg # noqa: F401 - from .io import read_text, to_orc # noqa: F401 + from ._legacy.groupby import groupby_agg # noqa: F401 + from ._legacy.io import read_text # noqa: F401 + from . import io # noqa: F401 + + +to_orc = _deprecated_api( + "dask_cudf.to_orc", + new_api="dask_cudf._legacy.io.to_orc", + rec="Please use DataFrame.to_orc instead.", +) __all__ = [ @@ -65,7 +81,6 @@ def inner_func(*args, **kwargs): "Series", "Index", "from_cudf", - "from_dask_dataframe", "concat", "from_delayed", ] diff --git a/python/dask_cudf/dask_cudf/_expr/__init__.py b/python/dask_cudf/dask_cudf/_expr/__init__.py new file mode 100644 index 00000000000..3c827d4ff59 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_expr/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
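The `_deprecated_api` shim in `dask_cudf/__init__.py` above either forwards a deprecated entry point to its replacement while emitting a `FutureWarning`, or raises `NotImplementedError` when no replacement exists (for example `dask_cudf.groupby_agg` when query planning is enabled). A minimal usage sketch of how that surfaces to users, assuming the refactored layout from this patch and a cuDF-capable GPU environment; the frame contents and the `orc_out` path are placeholders, not part of the patch:

```
import warnings

import cudf
import dask_cudf

# Small illustrative collection (one partition).
ddf = dask_cudf.from_cudf(cudf.DataFrame({"a": [1, 2, 3]}), npartitions=1)

# Deprecated top-level writer: emits a FutureWarning, then forwards the call
# to dask_cudf._legacy.io.to_orc.
with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter("always")
    dask_cudf.to_orc(ddf, "orc_out")
assert any(issubclass(w.category, FutureWarning) for w in caught)

# Recommended replacement going forward.
ddf.to_orc("orc_out")
```

Per the PR notes, the legacy IO functions re-exported from `dask_cudf.io` (and submodules such as `dask_cudf.io.parquet`) follow the same pattern when query planning is enabled: they warn and forward rather than fail outright.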
diff --git a/python/dask_cudf/dask_cudf/accessors.py b/python/dask_cudf/dask_cudf/_expr/accessors.py similarity index 100% rename from python/dask_cudf/dask_cudf/accessors.py rename to python/dask_cudf/dask_cudf/_expr/accessors.py diff --git a/python/dask_cudf/dask_cudf/expr/_collection.py b/python/dask_cudf/dask_cudf/_expr/collection.py similarity index 88% rename from python/dask_cudf/dask_cudf/expr/_collection.py rename to python/dask_cudf/dask_cudf/_expr/collection.py index 907abaa2bfc..fdf7d8630e9 100644 --- a/python/dask_cudf/dask_cudf/expr/_collection.py +++ b/python/dask_cudf/dask_cudf/_expr/collection.py @@ -34,22 +34,6 @@ class CudfFrameBase(FrameBase): - def to_dask_dataframe(self, **kwargs): - """Create a dask.dataframe object from a dask_cudf object - - WARNING: This API is deprecated, and may not work properly. - Please use `*.to_backend("pandas")` to convert the - underlying data to pandas. - """ - - warnings.warn( - "The `to_dask_dataframe` API is now deprecated. " - "Please use `*.to_backend('pandas')` instead.", - FutureWarning, - ) - - return self.to_backend("pandas", **kwargs) - def _prepare_cov_corr(self, min_periods, numeric_only): # Upstream version of this method sets min_periods # to 2 by default (which is not supported by cudf) @@ -94,7 +78,7 @@ def var( def rename_axis( self, mapper=no_default, index=no_default, columns=no_default, axis=0 ): - from dask_cudf.expr._expr import RenameAxisCudf + from dask_cudf._expr.expr import RenameAxisCudf return new_collection( RenameAxisCudf( @@ -136,7 +120,7 @@ def groupby( dropna=None, **kwargs, ): - from dask_cudf.expr._groupby import GroupBy + from dask_cudf._expr.groupby import GroupBy if isinstance(by, FrameBase) and not isinstance(by, DXSeries): raise ValueError( @@ -169,13 +153,16 @@ def groupby( ) def to_orc(self, *args, **kwargs): - return self.to_legacy_dataframe().to_orc(*args, **kwargs) + from dask_cudf._legacy.io import to_orc + + return to_orc(self, *args, **kwargs) + # return self.to_legacy_dataframe().to_orc(*args, **kwargs) @staticmethod def read_text(*args, **kwargs): from dask_expr import from_legacy_dataframe - from dask_cudf.io.text import read_text as legacy_read_text + from dask_cudf._legacy.io.text import read_text as legacy_read_text ddf = legacy_read_text(*args, **kwargs) return from_legacy_dataframe(ddf) @@ -183,19 +170,19 @@ def read_text(*args, **kwargs): class Series(DXSeries, CudfFrameBase): def groupby(self, by, **kwargs): - from dask_cudf.expr._groupby import SeriesGroupBy + from dask_cudf._expr.groupby import SeriesGroupBy return SeriesGroupBy(self, by, **kwargs) @cached_property def list(self): - from dask_cudf.accessors import ListMethods + from dask_cudf._expr.accessors import ListMethods return ListMethods(self) @cached_property def struct(self): - from dask_cudf.accessors import StructMethods + from dask_cudf._expr.accessors import StructMethods return StructMethods(self) diff --git a/python/dask_cudf/dask_cudf/_expr/expr.py b/python/dask_cudf/dask_cudf/_expr/expr.py new file mode 100644 index 00000000000..8b91e53604c --- /dev/null +++ b/python/dask_cudf/dask_cudf/_expr/expr.py @@ -0,0 +1,210 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import functools + +import dask_expr._shuffle as _shuffle_module +from dask_expr import new_collection +from dask_expr._cumulative import CumulativeBlockwise +from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns +from dask_expr._reductions import Reduction, Var + +from dask.dataframe.core import ( + is_dataframe_like, + make_meta, + meta_nonempty, +) +from dask.dataframe.dispatch import is_categorical_dtype +from dask.typing import no_default + +import cudf + +## +## Custom expressions +## + + +class RenameAxisCudf(RenameAxis): + # TODO: Remove this after rename_axis is supported in cudf + # (See: https://github.com/rapidsai/cudf/issues/16895) + @staticmethod + def operation(df, index=no_default, **kwargs): + if index != no_default: + df.index.name = index + return df + raise NotImplementedError( + "Only `index` is supported for the cudf backend" + ) + + +class ToCudfBackend(Elemwise): + # TODO: Inherit from ToBackend when rapids-dask-dependency + # is pinned to dask>=2024.8.1 + _parameters = ["frame", "options"] + _projection_passthrough = True + _filter_passthrough = True + _preserves_partitioning_information = True + + @staticmethod + def operation(df, options): + from dask_cudf.backends import to_cudf_dispatch + + return to_cudf_dispatch(df, **options) + + def _simplify_down(self): + if isinstance( + self.frame._meta, (cudf.DataFrame, cudf.Series, cudf.Index) + ): + # We already have cudf data + return self.frame + + +## +## Custom expression patching +## + + +# This can be removed after cudf#15176 is addressed. +# See: https://github.com/rapidsai/cudf/issues/15176 +class PatchCumulativeBlockwise(CumulativeBlockwise): + @property + def _args(self) -> list: + return self.operands[:1] + + @property + def _kwargs(self) -> dict: + # Must pass axis and skipna as kwargs in cudf + return {"axis": self.axis, "skipna": self.skipna} + + +# The upstream Var code uses `Series.values`, and relies on numpy +# for most of the logic. Unfortunately, cudf -> cupy conversion +# is not supported for data containing null values. Therefore, +# we must implement our own version of Var for now. This logic +# is mostly copied from dask-cudf. 
+ + +class VarCudf(Reduction): + # Uses the parallel version of Welford's online algorithm (Chan '79) + # (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf) + _parameters = [ + "frame", + "skipna", + "ddof", + "numeric_only", + "split_every", + ] + _defaults = { + "skipna": True, + "ddof": 1, + "numeric_only": False, + "split_every": False, + } + + @functools.cached_property + def _meta(self): + return make_meta( + meta_nonempty(self.frame._meta).var( + skipna=self.skipna, numeric_only=self.numeric_only + ) + ) + + @property + def chunk_kwargs(self): + return dict(skipna=self.skipna, numeric_only=self.numeric_only) + + @property + def combine_kwargs(self): + return {} + + @property + def aggregate_kwargs(self): + return dict(ddof=self.ddof) + + @classmethod + def reduction_chunk(cls, x, skipna=True, numeric_only=False): + kwargs = {"numeric_only": numeric_only} if is_dataframe_like(x) else {} + if skipna or numeric_only: + n = x.count(**kwargs) + kwargs["skipna"] = skipna + avg = x.mean(**kwargs) + else: + # Not skipping nulls, so might as well + # avoid the full `count` operation + n = len(x) + kwargs["skipna"] = skipna + avg = x.sum(**kwargs) / n + if numeric_only: + # Workaround for cudf bug + # (see: https://github.com/rapidsai/cudf/issues/13731) + x = x[n.index] + m2 = ((x - avg) ** 2).sum(**kwargs) + return n, avg, m2 + + @classmethod + def reduction_combine(cls, parts): + n, avg, m2 = parts[0] + for i in range(1, len(parts)): + n_a, avg_a, m2_a = n, avg, m2 + n_b, avg_b, m2_b = parts[i] + n = n_a + n_b + avg = (n_a * avg_a + n_b * avg_b) / n + delta = avg_b - avg_a + m2 = m2_a + m2_b + delta**2 * n_a * n_b / n + return n, avg, m2 + + @classmethod + def reduction_aggregate(cls, vals, ddof=1): + vals = cls.reduction_combine(vals) + n, _, m2 = vals + return m2 / (n - ddof) + + +def _patched_var( + self, + axis=0, + skipna=True, + ddof=1, + numeric_only=False, + split_every=False, +): + if axis == 0: + if hasattr(self._meta, "to_pandas"): + return VarCudf(self, skipna, ddof, numeric_only, split_every) + else: + return Var(self, skipna, ddof, numeric_only, split_every) + elif axis == 1: + return VarColumns(self, skipna, ddof, numeric_only) + else: + raise ValueError(f"axis={axis} not supported. Please specify 0 or 1") + + +# Temporary work-around for missing cudf + categorical support +# See: https://github.com/rapidsai/cudf/issues/11795 +# TODO: Fix RepartitionQuantiles and remove this in cudf>24.06 + +_original_get_divisions = _shuffle_module._get_divisions + + +def _patched_get_divisions(frame, other, *args, **kwargs): + # NOTE: The following two lines contains the "patch" + # (we simply convert the partitioning column to pandas) + if is_categorical_dtype(other._meta.dtype) and hasattr( + other.frame._meta, "to_pandas" + ): + other = new_collection(other).to_backend("pandas")._expr + + # Call "original" function + return _original_get_divisions(frame, other, *args, **kwargs) + + +_PATCHED = False + + +def _patch_dask_expr(): + global _PATCHED + + if not _PATCHED: + CumulativeBlockwise._args = PatchCumulativeBlockwise._args + CumulativeBlockwise._kwargs = PatchCumulativeBlockwise._kwargs + Expr.var = _patched_var + _shuffle_module._get_divisions = _patched_get_divisions + _PATCHED = True diff --git a/python/dask_cudf/dask_cudf/_expr/groupby.py b/python/dask_cudf/dask_cudf/_expr/groupby.py new file mode 100644 index 00000000000..0242fac6e72 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_expr/groupby.py @@ -0,0 +1,335 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import functools + +import pandas as pd +from dask_expr._collection import new_collection +from dask_expr._groupby import ( + DecomposableGroupbyAggregation, + GroupBy as DXGroupBy, + GroupbyAggregation, + SeriesGroupBy as DXSeriesGroupBy, + SingleAggregation, +) +from dask_expr._util import is_scalar + +from dask.dataframe.core import _concat +from dask.dataframe.groupby import Aggregation + +from cudf.core.groupby.groupby import _deprecate_collect + +## +## Fused groupby aggregations +## + + +def _get_spec_info(gb): + if isinstance(gb.arg, (dict, list)): + aggs = gb.arg.copy() + else: + aggs = gb.arg + + if gb._slice and not isinstance(aggs, dict): + aggs = {gb._slice: aggs} + + gb_cols = gb._by_columns + if isinstance(gb_cols, str): + gb_cols = [gb_cols] + columns = [c for c in gb.frame.columns if c not in gb_cols] + if not isinstance(aggs, dict): + aggs = {col: aggs for col in columns} + + # Assert if our output will have a MultiIndex; this will be the case if + # any value in the `aggs` dict is not a string (i.e. multiple/named + # aggregations per column) + str_cols_out = True + aggs_renames = {} + for col in aggs: + if isinstance(aggs[col], str) or callable(aggs[col]): + aggs[col] = [aggs[col]] + elif isinstance(aggs[col], dict): + str_cols_out = False + col_aggs = [] + for k, v in aggs[col].items(): + aggs_renames[col, v] = k + col_aggs.append(v) + aggs[col] = col_aggs + else: + str_cols_out = False + if col in gb_cols: + columns.append(col) + + return { + "aggs": aggs, + "columns": columns, + "str_cols_out": str_cols_out, + "aggs_renames": aggs_renames, + } + + +def _get_meta(gb): + spec_info = gb.spec_info + gb_cols = gb._by_columns + aggs = spec_info["aggs"].copy() + aggs_renames = spec_info["aggs_renames"] + if spec_info["str_cols_out"]: + # Metadata should use `str` for dict values if that is + # what the user originally specified (column names will + # be str, rather than tuples). 
+ for col in aggs: + aggs[col] = aggs[col][0] + _meta = gb.frame._meta.groupby(gb_cols).agg(aggs) + if aggs_renames: + col_array = [] + agg_array = [] + for col, agg in _meta.columns: + col_array.append(col) + agg_array.append(aggs_renames.get((col, agg), agg)) + _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) + return _meta + + +class DecomposableCudfGroupbyAgg(DecomposableGroupbyAggregation): + sep = "___" + + @functools.cached_property + def spec_info(self): + return _get_spec_info(self) + + @functools.cached_property + def _meta(self): + return _get_meta(self) + + @property + def shuffle_by_index(self): + return False # We always group by column(s) + + @classmethod + def chunk(cls, df, *by, **kwargs): + from dask_cudf._legacy.groupby import _groupby_partition_agg + + return _groupby_partition_agg(df, **kwargs) + + @classmethod + def combine(cls, inputs, **kwargs): + from dask_cudf._legacy.groupby import _tree_node_agg + + return _tree_node_agg(_concat(inputs), **kwargs) + + @classmethod + def aggregate(cls, inputs, **kwargs): + from dask_cudf._legacy.groupby import _finalize_gb_agg + + return _finalize_gb_agg(_concat(inputs), **kwargs) + + @property + def chunk_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property + def combine_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + return { + "gb_cols": self._by_columns, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + } + + @property + def aggregate_kwargs(self) -> dict: + dropna = True if self.dropna is None else self.dropna + final_columns = self._slice or self._meta.columns + return { + "gb_cols": self._by_columns, + "aggs": self.spec_info["aggs"], + "columns": self.spec_info["columns"], + "final_columns": final_columns, + "as_index": True, + "dropna": dropna, + "sort": self.sort, + "sep": self.sep, + "str_cols_out": self.spec_info["str_cols_out"], + "aggs_renames": self.spec_info["aggs_renames"], + } + + +class CudfGroupbyAgg(GroupbyAggregation): + @functools.cached_property + def spec_info(self): + return _get_spec_info(self) + + @functools.cached_property + def _meta(self): + return _get_meta(self) + + def _lower(self): + return DecomposableCudfGroupbyAgg( + self.frame, + self.arg, + self.observed, + self.dropna, + self.split_every, + self.split_out, + self.sort, + self.shuffle_method, + self._slice, + *self.by, + ) + + +def _maybe_get_custom_expr( + gb, + aggs, + split_every=None, + split_out=None, + shuffle_method=None, + **kwargs, +): + from dask_cudf._legacy.groupby import ( + OPTIMIZED_AGGS, + _aggs_optimized, + _redirect_aggs, + ) + + if kwargs: + # Unsupported key-word arguments + return None + + if not hasattr(gb.obj._meta, "to_pandas"): + # Not cuDF-backed data + return None + + _aggs = _redirect_aggs(aggs) + if not _aggs_optimized(_aggs, OPTIMIZED_AGGS): + # One or more aggregations are unsupported + return None + + return CudfGroupbyAgg( + gb.obj.expr, + _aggs, + gb.observed, + gb.dropna, + split_every, + split_out, + gb.sort, + shuffle_method, + gb._slice, + *gb.by, + ) + + +## +## Custom groupby classes +## + + +class ListAgg(SingleAggregation): + @staticmethod + def groupby_chunk(arg): + return arg.agg(list) + + @staticmethod + def groupby_aggregate(arg): + gb = arg.agg(list) + if gb.ndim > 1: + for col in gb.columns: + gb[col] = 
gb[col].list.concat() + return gb + else: + return gb.list.concat() + + +list_aggregation = Aggregation( + name="list", + chunk=ListAgg.groupby_chunk, + agg=ListAgg.groupby_aggregate, +) + + +def _translate_arg(arg): + # Helper function to translate args so that + # they can be processed correctly by upstream + # dask & dask-expr. Right now, the only necessary + # translation is list aggregations. + if isinstance(arg, dict): + return {k: _translate_arg(v) for k, v in arg.items()} + elif isinstance(arg, list): + return [_translate_arg(x) for x in arg] + elif arg in ("collect", "list", list): + return list_aggregation + else: + return arg + + +# We define our own GroupBy classes in Dask cuDF for +# the following reasons: +# (1) We want to use a custom `aggregate` algorithm +# that performs multiple aggregations on the +# same dataframe partition at once. The upstream +# algorithm breaks distinct aggregations into +# separate tasks. +# (2) We need to work around missing `observed=False` +# support: +# https://github.com/rapidsai/cudf/issues/15173 + + +class GroupBy(DXGroupBy): + def __init__(self, *args, observed=None, **kwargs): + observed = observed if observed is not None else True + super().__init__(*args, observed=observed, **kwargs) + + def __getitem__(self, key): + if is_scalar(key): + return SeriesGroupBy( + self.obj, + by=self.by, + slice=key, + sort=self.sort, + dropna=self.dropna, + observed=self.observed, + ) + g = GroupBy( + self.obj, + by=self.by, + slice=key, + sort=self.sort, + dropna=self.dropna, + observed=self.observed, + group_keys=self.group_keys, + ) + return g + + def collect(self, **kwargs): + _deprecate_collect() + return self._single_agg(ListAgg, **kwargs) + + def aggregate(self, arg, fused=True, **kwargs): + if ( + fused + and (expr := _maybe_get_custom_expr(self, arg, **kwargs)) + is not None + ): + return new_collection(expr) + else: + return super().aggregate(_translate_arg(arg), **kwargs) + + +class SeriesGroupBy(DXSeriesGroupBy): + def __init__(self, *args, observed=None, **kwargs): + observed = observed if observed is not None else True + super().__init__(*args, observed=observed, **kwargs) + + def collect(self, **kwargs): + _deprecate_collect() + return self._single_agg(ListAgg, **kwargs) + + def aggregate(self, arg, **kwargs): + return super().aggregate(_translate_arg(arg), **kwargs) diff --git a/python/dask_cudf/dask_cudf/_legacy/__init__.py b/python/dask_cudf/dask_cudf/_legacy/__init__.py new file mode 100644 index 00000000000..3c827d4ff59 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/__init__.py @@ -0,0 +1 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. diff --git a/python/dask_cudf/dask_cudf/_legacy/core.py b/python/dask_cudf/dask_cudf/_legacy/core.py new file mode 100644 index 00000000000..d6beb775a5e --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/core.py @@ -0,0 +1,711 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. 
+ +import math +import warnings + +import numpy as np +import pandas as pd +from tlz import partition_all + +from dask import dataframe as dd +from dask.base import normalize_token, tokenize +from dask.dataframe.core import ( + Scalar, + handle_out, + make_meta as dask_make_meta, + map_partitions, +) +from dask.dataframe.utils import raise_on_meta_error +from dask.highlevelgraph import HighLevelGraph +from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname + +import cudf +from cudf import _lib as libcudf +from cudf.utils.performance_tracking import _dask_cudf_performance_tracking + +from dask_cudf._expr.accessors import ListMethods, StructMethods +from dask_cudf._legacy import sorting +from dask_cudf._legacy.sorting import ( + _deprecate_shuffle_kwarg, + _get_shuffle_method, +) + + +class _Frame(dd.core._Frame, OperatorMethodMixin): + """Superclass for DataFrame and Series + + Parameters + ---------- + dsk : dict + The dask graph to compute this DataFrame + name : str + The key prefix that specifies which keys in the dask comprise this + particular DataFrame / Series + meta : cudf.DataFrame, cudf.Series, or cudf.Index + An empty cudf object with names, dtypes, and indices matching the + expected output. + divisions : tuple of index values + Values along which we partition our blocks on the index + """ + + def _is_partition_type(self, meta): + return isinstance(meta, self._partition_type) + + def __repr__(self): + s = "" + return s % (type(self).__name__, len(self.dask), self.npartitions) + + +normalize_token.register(_Frame, lambda a: a._name) + + +class DataFrame(_Frame, dd.core.DataFrame): + """ + A distributed Dask DataFrame where the backing dataframe is a + :class:`cuDF DataFrame `. + + Typically you would not construct this object directly, but rather + use one of Dask-cuDF's IO routines. + + Most operations on :doc:`Dask DataFrames ` are + supported, with many of the same caveats. 
+ + """ + + _partition_type = cudf.DataFrame + + @_dask_cudf_performance_tracking + def _assign_column(self, k, v): + def assigner(df, k, v): + out = df.copy() + out[k] = v + return out + + meta = assigner(self._meta, k, dask_make_meta(v)) + return self.map_partitions(assigner, k, v, meta=meta) + + @_dask_cudf_performance_tracking + def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): + import uuid + + if kwargs is None: + kwargs = {} + + if cache_key is None: + cache_key = uuid.uuid4() + + def do_apply_rows(df, func, incols, outcols, kwargs): + return df.apply_rows( + func, incols, outcols, kwargs, cache_key=cache_key + ) + + meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) + return self.map_partitions( + do_apply_rows, func, incols, outcols, kwargs, meta=meta + ) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def merge(self, other, shuffle_method=None, **kwargs): + on = kwargs.pop("on", None) + if isinstance(on, tuple): + on = list(on) + return super().merge( + other, + on=on, + shuffle_method=_get_shuffle_method(shuffle_method), + **kwargs, + ) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def join(self, other, shuffle_method=None, **kwargs): + # CuDF doesn't support "right" join yet + how = kwargs.pop("how", "left") + if how == "right": + return other.join(other=self, how="left", **kwargs) + + on = kwargs.pop("on", None) + if isinstance(on, tuple): + on = list(on) + return super().join( + other, + how=how, + on=on, + shuffle_method=_get_shuffle_method(shuffle_method), + **kwargs, + ) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def set_index( + self, + other, + sorted=False, + divisions=None, + shuffle_method=None, + **kwargs, + ): + pre_sorted = sorted + del sorted + + if divisions == "quantile": + warnings.warn( + "Using divisions='quantile' is now deprecated. 
" + "Please raise an issue on github if you believe " + "this feature is necessary.", + FutureWarning, + ) + + if ( + divisions == "quantile" + or isinstance(divisions, (cudf.DataFrame, cudf.Series)) + or ( + isinstance(other, str) + and cudf.api.types.is_string_dtype(self[other].dtype) + ) + ): + # Let upstream-dask handle "pre-sorted" case + if pre_sorted: + return dd.shuffle.set_sorted_index( + self, other, divisions=divisions, **kwargs + ) + + by = other + if not isinstance(other, list): + by = [by] + if len(by) > 1: + raise ValueError("Dask does not support MultiIndex (yet).") + if divisions == "quantile": + divisions = None + + # Use dask_cudf's sort_values + df = self.sort_values( + by, + max_branch=kwargs.get("max_branch", None), + divisions=divisions, + set_divisions=True, + ignore_index=True, + shuffle_method=shuffle_method, + ) + + # Ignore divisions if its a dataframe + if isinstance(divisions, cudf.DataFrame): + divisions = None + + # Set index and repartition + df2 = df.map_partitions( + sorting.set_index_post, + index_name=other, + drop=kwargs.get("drop", True), + column_dtype=df.columns.dtype, + ) + npartitions = kwargs.get("npartitions", self.npartitions) + partition_size = kwargs.get("partition_size", None) + if partition_size: + return df2.repartition(partition_size=partition_size) + if not divisions and df2.npartitions != npartitions: + return df2.repartition(npartitions=npartitions) + if divisions and df2.npartitions != len(divisions) - 1: + return df2.repartition(divisions=divisions) + return df2 + + return super().set_index( + other, + sorted=pre_sorted, + shuffle_method=_get_shuffle_method(shuffle_method), + divisions=divisions, + **kwargs, + ) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def sort_values( + self, + by, + ignore_index=False, + max_branch=None, + divisions=None, + set_divisions=False, + ascending=True, + na_position="last", + sort_function=None, + sort_function_kwargs=None, + shuffle_method=None, + **kwargs, + ): + if kwargs: + raise ValueError( + f"Unsupported input arguments passed : {list(kwargs.keys())}" + ) + + df = sorting.sort_values( + self, + by, + max_branch=max_branch, + divisions=divisions, + set_divisions=set_divisions, + ignore_index=ignore_index, + ascending=ascending, + na_position=na_position, + shuffle_method=shuffle_method, + sort_function=sort_function, + sort_function_kwargs=sort_function_kwargs, + ) + + if ignore_index: + return df.reset_index(drop=True) + return df + + @_dask_cudf_performance_tracking + def to_parquet(self, path, *args, **kwargs): + """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" + from dask_cudf._legacy.io import to_parquet + + return to_parquet(self, path, *args, **kwargs) + + @_dask_cudf_performance_tracking + def to_orc(self, path, **kwargs): + """Calls dask_cudf._legacy.io.to_orc""" + from dask_cudf._legacy.io import to_orc + + return to_orc(self, path, **kwargs) + + @derived_from(pd.DataFrame) + @_dask_cudf_performance_tracking + def var( + self, + axis=None, + skipna=True, + ddof=1, + split_every=False, + dtype=None, + out=None, + naive=False, + numeric_only=False, + ): + axis = self._validate_axis(axis) + meta = self._meta_nonempty.var( + axis=axis, skipna=skipna, numeric_only=numeric_only + ) + if axis == 1: + result = map_partitions( + M.var, + self, + meta=meta, + token=self._token_prefix + "var", + axis=axis, + skipna=skipna, + ddof=ddof, + numeric_only=numeric_only, + ) + return handle_out(out, result) + elif naive: + return _naive_var(self, meta, skipna, ddof, 
split_every, out) + else: + return _parallel_var(self, meta, skipna, split_every, out) + + @_deprecate_shuffle_kwarg + @_dask_cudf_performance_tracking + def shuffle(self, *args, shuffle_method=None, **kwargs): + """Wraps dask.dataframe DataFrame.shuffle method""" + return super().shuffle( + *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs + ) + + @_dask_cudf_performance_tracking + def groupby(self, by=None, **kwargs): + from .groupby import CudfDataFrameGroupBy + + return CudfDataFrameGroupBy(self, by=by, **kwargs) + + +@_dask_cudf_performance_tracking +def sum_of_squares(x): + x = x.astype("f8")._column + outcol = libcudf.reduce.reduce("sum_of_squares", x) + return cudf.Series._from_column(outcol) + + +@_dask_cudf_performance_tracking +def var_aggregate(x2, x, n, ddof): + try: + with warnings.catch_warnings(record=True): + warnings.simplefilter("always") + result = (x2 / n) - (x / n) ** 2 + if ddof != 0: + result = result * n / (n - ddof) + return result + except ZeroDivisionError: + return np.float64(np.nan) + + +@_dask_cudf_performance_tracking +def nlargest_agg(x, **kwargs): + return cudf.concat(x).nlargest(**kwargs) + + +@_dask_cudf_performance_tracking +def nsmallest_agg(x, **kwargs): + return cudf.concat(x).nsmallest(**kwargs) + + +class Series(_Frame, dd.core.Series): + _partition_type = cudf.Series + + @_dask_cudf_performance_tracking + def count(self, split_every=False): + return reduction( + [self], + chunk=M.count, + aggregate=np.sum, + split_every=split_every, + meta="i8", + ) + + @_dask_cudf_performance_tracking + def mean(self, split_every=False): + sum = self.sum(split_every=split_every) + n = self.count(split_every=split_every) + return sum / n + + @derived_from(pd.DataFrame) + @_dask_cudf_performance_tracking + def var( + self, + axis=None, + skipna=True, + ddof=1, + split_every=False, + dtype=None, + out=None, + naive=False, + ): + axis = self._validate_axis(axis) + meta = self._meta_nonempty.var(axis=axis, skipna=skipna) + if axis == 1: + result = map_partitions( + M.var, + self, + meta=meta, + token=self._token_prefix + "var", + axis=axis, + skipna=skipna, + ddof=ddof, + ) + return handle_out(out, result) + elif naive: + return _naive_var(self, meta, skipna, ddof, split_every, out) + else: + return _parallel_var(self, meta, skipna, split_every, out) + + @_dask_cudf_performance_tracking + def groupby(self, *args, **kwargs): + from .groupby import CudfSeriesGroupBy + + return CudfSeriesGroupBy(self, *args, **kwargs) + + @property # type: ignore + @_dask_cudf_performance_tracking + def list(self): + return ListMethods(self) + + @property # type: ignore + @_dask_cudf_performance_tracking + def struct(self): + return StructMethods(self) + + +class Index(Series, dd.core.Index): + _partition_type = cudf.Index # type: ignore + + +@_dask_cudf_performance_tracking +def _naive_var(ddf, meta, skipna, ddof, split_every, out): + num = ddf._get_numeric_data() + x = 1.0 * num.sum(skipna=skipna, split_every=split_every) + x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) + n = num.count(split_every=split_every) + name = ddf._token_prefix + "var" + result = map_partitions( + var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof + ) + if isinstance(ddf, DataFrame): + result.divisions = (min(ddf.columns), max(ddf.columns)) + return handle_out(out, result) + + +@_dask_cudf_performance_tracking +def _parallel_var(ddf, meta, skipna, split_every, out): + def _local_var(x, skipna): + if skipna: + n = x.count() + avg = x.mean(skipna=skipna) + else: + # Not 
skipping nulls, so might as well + # avoid the full `count` operation + n = len(x) + avg = x.sum(skipna=skipna) / n + m2 = ((x - avg) ** 2).sum(skipna=skipna) + return n, avg, m2 + + def _aggregate_var(parts): + n, avg, m2 = parts[0] + for i in range(1, len(parts)): + n_a, avg_a, m2_a = n, avg, m2 + n_b, avg_b, m2_b = parts[i] + n = n_a + n_b + avg = (n_a * avg_a + n_b * avg_b) / n + delta = avg_b - avg_a + m2 = m2_a + m2_b + delta**2 * n_a * n_b / n + return n, avg, m2 + + def _finalize_var(vals): + n, _, m2 = vals + return m2 / (n - 1) + + # Build graph + nparts = ddf.npartitions + if not split_every: + split_every = nparts + name = "var-" + tokenize(skipna, split_every, out) + local_name = "local-" + name + num = ddf._get_numeric_data() + dsk = { + (local_name, n, 0): (_local_var, (num._name, n), skipna) + for n in range(nparts) + } + + # Use reduction tree + widths = [nparts] + while nparts > 1: + nparts = math.ceil(nparts / split_every) + widths.append(nparts) + height = len(widths) + for depth in range(1, height): + for group in range(widths[depth]): + p_max = widths[depth - 1] + lstart = split_every * group + lstop = min(lstart + split_every, p_max) + node_list = [ + (local_name, p, depth - 1) for p in range(lstart, lstop) + ] + dsk[(local_name, group, depth)] = (_aggregate_var, node_list) + if height == 1: + group = depth = 0 + dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) + + graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) + result = dd.core.new_dd_object(graph, name, meta, (None, None)) + if isinstance(ddf, DataFrame): + result.divisions = (min(ddf.columns), max(ddf.columns)) + return handle_out(out, result) + + +@_dask_cudf_performance_tracking +def _extract_meta(x): + """ + Extract internal cache data (``_meta``) from dask_cudf objects + """ + if isinstance(x, (Scalar, _Frame)): + return x._meta + elif isinstance(x, list): + return [_extract_meta(_x) for _x in x] + elif isinstance(x, tuple): + return tuple(_extract_meta(_x) for _x in x) + elif isinstance(x, dict): + return {k: _extract_meta(v) for k, v in x.items()} + return x + + +@_dask_cudf_performance_tracking +def _emulate(func, *args, **kwargs): + """ + Apply a function using args / kwargs. If arguments contain dd.DataFrame / + dd.Series, using internal cache (``_meta``) for calculation + """ + with raise_on_meta_error(funcname(func)): + return func(*_extract_meta(args), **_extract_meta(kwargs)) + + +@_dask_cudf_performance_tracking +def align_partitions(args): + """Align partitions between dask_cudf objects. + + Note that if all divisions are unknown, but have equal npartitions, then + they will be passed through unchanged. + """ + dfs = [df for df in args if isinstance(df, _Frame)] + if not dfs: + return args + + divisions = dfs[0].divisions + if not all(df.divisions == divisions for df in dfs): + raise NotImplementedError("Aligning mismatched partitions") + return args + + +@_dask_cudf_performance_tracking +def reduction( + args, + chunk=None, + aggregate=None, + combine=None, + meta=None, + token=None, + chunk_kwargs=None, + aggregate_kwargs=None, + combine_kwargs=None, + split_every=None, + **kwargs, +): + """Generic tree reduction operation. + + Parameters + ---------- + args : + Positional arguments for the `chunk` function. All `dask.dataframe` + objects should be partitioned and indexed equivalently. 
+ chunk : function [block-per-arg] -> block + Function to operate on each block of data + aggregate : function list-of-blocks -> block + Function to operate on the list of results of chunk + combine : function list-of-blocks -> block, optional + Function to operate on intermediate lists of results of chunk + in a tree-reduction. If not provided, defaults to aggregate. + $META + token : str, optional + The name to use for the output keys. + chunk_kwargs : dict, optional + Keywords for the chunk function only. + aggregate_kwargs : dict, optional + Keywords for the aggregate function only. + combine_kwargs : dict, optional + Keywords for the combine function only. + split_every : int, optional + Group partitions into groups of this size while performing a + tree-reduction. If set to False, no tree-reduction will be used, + and all intermediates will be concatenated and passed to ``aggregate``. + Default is 8. + kwargs : + All remaining keywords will be passed to ``chunk``, ``aggregate``, and + ``combine``. + """ + if chunk_kwargs is None: + chunk_kwargs = dict() + if aggregate_kwargs is None: + aggregate_kwargs = dict() + chunk_kwargs.update(kwargs) + aggregate_kwargs.update(kwargs) + + if combine is None: + if combine_kwargs: + raise ValueError("`combine_kwargs` provided with no `combine`") + combine = aggregate + combine_kwargs = aggregate_kwargs + else: + if combine_kwargs is None: + combine_kwargs = dict() + combine_kwargs.update(kwargs) + + if not isinstance(args, (tuple, list)): + args = [args] + + npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} + if len(npartitions) > 1: + raise ValueError("All arguments must have same number of partitions") + npartitions = npartitions.pop() + + if split_every is None: + split_every = 8 + elif split_every is False: + split_every = npartitions + elif split_every < 2 or not isinstance(split_every, int): + raise ValueError("split_every must be an integer >= 2") + + token_key = tokenize( + token or (chunk, aggregate), + meta, + args, + chunk_kwargs, + aggregate_kwargs, + combine_kwargs, + split_every, + ) + + # Chunk + a = f"{token or funcname(chunk)}-chunk-{token_key}" + if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: + dsk = { + (a, 0, i): (chunk, key) + for i, key in enumerate(args[0].__dask_keys__()) + } + else: + dsk = { + (a, 0, i): ( + apply, + chunk, + [(x._name, i) if isinstance(x, _Frame) else x for x in args], + chunk_kwargs, + ) + for i in range(args[0].npartitions) + } + + # Combine + b = f"{token or funcname(combine)}-combine-{token_key}" + k = npartitions + depth = 0 + while k > split_every: + for part_i, inds in enumerate(partition_all(split_every, range(k))): + conc = (list, [(a, depth, i) for i in inds]) + dsk[(b, depth + 1, part_i)] = ( + (apply, combine, [conc], combine_kwargs) + if combine_kwargs + else (combine, conc) + ) + k = part_i + 1 + a = b + depth += 1 + + # Aggregate + b = f"{token or funcname(aggregate)}-agg-{token_key}" + conc = (list, [(a, depth, i) for i in range(k)]) + if aggregate_kwargs: + dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) + else: + dsk[(b, 0)] = (aggregate, conc) + + if meta is None: + meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) + meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) + meta = dask_make_meta(meta) + + graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) + return dd.core.new_dd_object(graph, b, meta, (None, None)) + + +for name in ( + "add", + "sub", + "mul", + "truediv", + "floordiv", + 
"mod", + "pow", + "radd", + "rsub", + "rmul", + "rtruediv", + "rfloordiv", + "rmod", + "rpow", +): + meth = getattr(cudf.DataFrame, name) + DataFrame._bind_operator_method(name, meth, original=cudf.Series) + + meth = getattr(cudf.Series, name) + Series._bind_operator_method(name, meth, original=cudf.Series) + +for name in ("lt", "gt", "le", "ge", "ne", "eq"): + meth = getattr(cudf.Series, name) + Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/groupby.py b/python/dask_cudf/dask_cudf/_legacy/groupby.py similarity index 99% rename from python/dask_cudf/dask_cudf/groupby.py rename to python/dask_cudf/dask_cudf/_legacy/groupby.py index bbbcde17b51..7e01e91476d 100644 --- a/python/dask_cudf/dask_cudf/groupby.py +++ b/python/dask_cudf/dask_cudf/_legacy/groupby.py @@ -18,7 +18,7 @@ from cudf.core.groupby.groupby import _deprecate_collect from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from dask_cudf.sorting import _deprecate_shuffle_kwarg +from dask_cudf._legacy.sorting import _deprecate_shuffle_kwarg # aggregations that are dask-cudf optimized OPTIMIZED_AGGS = ( diff --git a/python/dask_cudf/dask_cudf/_legacy/io/__init__.py b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py new file mode 100644 index 00000000000..0421bd755f4 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/io/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) 2018-2024, NVIDIA CORPORATION. + +from .csv import read_csv # noqa: F401 +from .json import read_json # noqa: F401 +from .orc import read_orc, to_orc # noqa: F401 +from .text import read_text # noqa: F401 + +try: + from .parquet import read_parquet, to_parquet # noqa: F401 +except ImportError: + pass diff --git a/python/dask_cudf/dask_cudf/_legacy/io/csv.py b/python/dask_cudf/dask_cudf/_legacy/io/csv.py new file mode 100644 index 00000000000..fa5400344f9 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/io/csv.py @@ -0,0 +1,222 @@ +# Copyright (c) 2020-2023, NVIDIA CORPORATION. + +import os +from glob import glob +from warnings import warn + +from fsspec.utils import infer_compression + +from dask import dataframe as dd +from dask.base import tokenize +from dask.dataframe.io.csv import make_reader +from dask.utils import apply, parse_bytes + +import cudf + + +def read_csv(path, blocksize="default", **kwargs): + """ + Read CSV files into a :class:`.DataFrame`. + + This API parallelizes the :func:`cudf:cudf.read_csv` function in + the following ways: + + It supports loading many files at once using globstrings: + + >>> import dask_cudf + >>> df = dask_cudf.read_csv("myfiles.*.csv") + + In some cases it can break up large files: + + >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") + + It can read CSV files from external resources (e.g. S3, HTTP, FTP) + + >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") + >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") + + Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and + supports many of the same keyword arguments with the same + performance guarantees. See the docstring for + :func:`cudf:cudf.read_csv` for more information on available + keyword arguments. + + Parameters + ---------- + path : str, path object, or file-like object + Either a path to a file (a str, :py:class:`pathlib.Path`, or + py._path.local.LocalPath), URL (including http, ftp, and S3 + locations), or any object with a read() method (such as + builtin :py:func:`open` file handler function or + :py:class:`~io.StringIO`). 
+ blocksize : int or str, default "256 MiB" + The target task partition size. If ``None``, a single block + is used for each file. + **kwargs : dict + Passthrough key-word arguments that are sent to + :func:`cudf:cudf.read_csv`. + + Notes + ----- + If any of `skipfooter`/`skiprows`/`nrows` are passed, + `blocksize` will default to None. + + Examples + -------- + >>> import dask_cudf + >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) + >>> ddf.compute() + a b + 0 1 hi + 1 2 hello + 2 3 ai + + """ + + # Handle `chunksize` deprecation + if "chunksize" in kwargs: + chunksize = kwargs.pop("chunksize", "default") + warn( + "`chunksize` is deprecated and will be removed in the future. " + "Please use `blocksize` instead.", + FutureWarning, + ) + if blocksize == "default": + blocksize = chunksize + + # Set default `blocksize` + if blocksize == "default": + if ( + kwargs.get("skipfooter", 0) != 0 + or kwargs.get("skiprows", 0) != 0 + or kwargs.get("nrows", None) is not None + ): + # Cannot read in blocks if skipfooter, + # skiprows or nrows is passed. + blocksize = None + else: + blocksize = "256 MiB" + + if "://" in str(path): + func = make_reader(cudf.read_csv, "read_csv", "CSV") + return func(path, blocksize=blocksize, **kwargs) + else: + return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) + + +def _internal_read_csv(path, blocksize="256 MiB", **kwargs): + if isinstance(blocksize, str): + blocksize = parse_bytes(blocksize) + + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood:{type(path)}") + + if not filenames: + msg = f"A file in: {filenames} does not exist." 
+ raise FileNotFoundError(msg) + + name = "read-csv-" + tokenize( + path, tokenize, **kwargs + ) # TODO: get last modified time + + compression = kwargs.get("compression", "infer") + + if compression == "infer": + # Infer compression from first path by default + compression = infer_compression(filenames[0]) + + if compression and blocksize: + # compressed CSVs reading must read the entire file + kwargs.pop("byte_range", None) + warn( + "Warning %s compression does not support breaking apart files\n" + "Please ensure that each individual file can fit in memory and\n" + "use the keyword ``blocksize=None to remove this message``\n" + "Setting ``blocksize=(size of file)``" % compression + ) + blocksize = None + + if blocksize is None: + return read_csv_without_blocksize(path, **kwargs) + + # Let dask.dataframe generate meta + dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") + kwargs1 = kwargs.copy() + usecols = kwargs1.pop("usecols", None) + dtype = kwargs1.pop("dtype", None) + meta = dask_reader(filenames[0], **kwargs1)._meta + names = meta.columns + if usecols or dtype: + # Regenerate meta with original kwargs if + # `usecols` or `dtype` was specified + meta = dask_reader(filenames[0], **kwargs)._meta + + dsk = {} + i = 0 + dtypes = meta.dtypes.values + + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, blocksize): + kwargs2 = kwargs.copy() + kwargs2["byte_range"] = ( + start, + blocksize, + ) # specify which chunk of the file we care about + if start != 0: + kwargs2["names"] = names # no header in the middle of the file + kwargs2["header"] = None + dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) + + i += 1 + + divisions = [None] * (len(dsk) + 1) + return dd.core.new_dd_object(dsk, name, meta, divisions) + + +def _read_csv(fn, dtypes=None, **kwargs): + return cudf.read_csv(fn, **kwargs) + + +def read_csv_without_blocksize(path, **kwargs): + """Read entire CSV with optional compression (gzip/zip) + + Parameters + ---------- + path : str + path to files (support for glob) + """ + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood:{type(path)}") + + name = "read-csv-" + tokenize(path, **kwargs) + + meta_kwargs = kwargs.copy() + if "skipfooter" in meta_kwargs: + meta_kwargs.pop("skipfooter") + if "nrows" in meta_kwargs: + meta_kwargs.pop("nrows") + # Read "head" of first file (first 5 rows). + # Convert to empty df for metadata. + meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0] + + graph = { + (name, i): (apply, cudf.read_csv, [fn], kwargs) + for i, fn in enumerate(filenames) + } + + divisions = [None] * (len(filenames) + 1) + + return dd.core.new_dd_object(graph, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/json.py b/python/dask_cudf/dask_cudf/_legacy/io/json.py new file mode 100644 index 00000000000..98c5ceedb76 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/io/json.py @@ -0,0 +1,209 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. 
+ +from functools import partial + +import numpy as np +from fsspec.core import get_compression, get_fs_token_paths + +import dask +from dask.utils import parse_bytes + +import cudf +from cudf.core.column import as_column +from cudf.utils.ioutils import _is_local_filesystem + +from dask_cudf.backends import _default_backend + + +def _read_json_partition( + paths, + fs=None, + include_path_column=False, + path_converter=None, + **kwargs, +): + # Transfer all data up front for remote storage + sources = ( + paths + if fs is None + else fs.cat_ranges( + paths, + [0] * len(paths), + fs.sizes(paths), + ) + ) + + if include_path_column: + # Add "path" column. + # Must iterate over sources sequentially + if not isinstance(include_path_column, str): + include_path_column = "path" + converted_paths = ( + paths + if path_converter is None + else [path_converter(path) for path in paths] + ) + dfs = [] + for i, source in enumerate(sources): + df = cudf.read_json(source, **kwargs) + df[include_path_column] = as_column( + converted_paths[i], length=len(df) + ) + dfs.append(df) + return cudf.concat(dfs) + else: + # Pass sources directly to cudf + return cudf.read_json(sources, **kwargs) + + +def read_json( + url_path, + engine="auto", + blocksize=None, + orient="records", + lines=None, + compression="infer", + aggregate_files=True, + **kwargs, +): + """Read JSON data into a :class:`.DataFrame`. + + This function wraps :func:`dask.dataframe.read_json`, and passes + ``engine=partial(cudf.read_json, engine="auto")`` by default. + + Parameters + ---------- + url_path : str, list of str + Location to read from. If a string, can include a glob character to + find a set of file names. + Supports protocol specifications such as ``"s3://"``. + engine : str or Callable, default "auto" + + If str, this value will be used as the ``engine`` argument + when :func:`cudf.read_json` is used to create each partition. + If a :obj:`~collections.abc.Callable`, this value will be used as the + underlying function used to create each partition from JSON + data. The default value is "auto", so that + ``engine=partial(cudf.read_json, engine="auto")`` will be + passed to :func:`dask.dataframe.read_json` by default. + aggregate_files : bool or int + Whether to map multiple files to each output partition. If True, + the `blocksize` argument will be used to determine the number of + files in each partition. If any one file is larger than `blocksize`, + the `aggregate_files` argument will be ignored. If an integer value + is specified, the `blocksize` argument will be ignored, and that + number of files will be mapped to each partition. Default is True. + **kwargs : + Key-word arguments to pass through to :func:`dask.dataframe.read_json`. + + Returns + ------- + :class:`.DataFrame` + + Examples + -------- + Load single file + + >>> from dask_cudf import read_json + >>> read_json('myfile.json') # doctest: +SKIP + + Load large line-delimited JSON files using partitions of approx + 256MB size + + >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP + + Load nested JSON data + + >>> read_json('myfile.json') # doctest: +SKIP + + See Also + -------- + dask.dataframe.read_json + + """ + + if lines is None: + lines = orient == "records" + if orient != "records" and lines: + raise ValueError( + 'Line-delimited JSON is only available with orient="records".' + ) + if blocksize and (orient != "records" or not lines): + raise ValueError( + "JSON file chunking only allowed for JSON-lines" + "input (orient='records', lines=True)." 
+ ) + + inputs = [] + if aggregate_files and blocksize or int(aggregate_files) > 1: + # Attempt custom read if we are mapping multiple files + # to each output partition. Otherwise, upstream logic + # is sufficient. + + storage_options = kwargs.get("storage_options", {}) + fs, _, paths = get_fs_token_paths( + url_path, mode="rb", storage_options=storage_options + ) + if isinstance(aggregate_files, int) and aggregate_files > 1: + # Map a static file count to each partition + inputs = [ + paths[offset : offset + aggregate_files] + for offset in range(0, len(paths), aggregate_files) + ] + elif aggregate_files is True and blocksize: + # Map files dynamically (using blocksize) + file_sizes = fs.sizes(paths) # NOTE: This can be slow + blocksize = parse_bytes(blocksize) + if all([file_size <= blocksize for file_size in file_sizes]): + counts = np.unique( + np.floor(np.cumsum(file_sizes) / blocksize), + return_counts=True, + )[1] + offsets = np.concatenate([[0], counts.cumsum()]) + inputs = [ + paths[offsets[i] : offsets[i + 1]] + for i in range(len(offsets) - 1) + ] + + if inputs: + # Inputs were successfully populated. + # Use custom _read_json_partition function + # to generate each partition. + + compression = get_compression( + url_path[0] if isinstance(url_path, list) else url_path, + compression, + ) + _kwargs = dict( + orient=orient, + lines=lines, + compression=compression, + include_path_column=kwargs.get("include_path_column", False), + path_converter=kwargs.get("path_converter"), + ) + if not _is_local_filesystem(fs): + _kwargs["fs"] = fs + # TODO: Generate meta more efficiently + meta = _read_json_partition(inputs[0][:1], **_kwargs) + return dask.dataframe.from_map( + _read_json_partition, + inputs, + meta=meta, + **_kwargs, + ) + + # Fall back to dask.dataframe.read_json + return _default_backend( + dask.dataframe.read_json, + url_path, + engine=( + partial(cudf.read_json, engine=engine) + if isinstance(engine, str) + else engine + ), + blocksize=blocksize, + orient=orient, + lines=lines, + compression=compression, + **kwargs, + ) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/orc.py b/python/dask_cudf/dask_cudf/_legacy/io/orc.py new file mode 100644 index 00000000000..bed69f038b0 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/io/orc.py @@ -0,0 +1,199 @@ +# Copyright (c) 2020-2024, NVIDIA CORPORATION. + +from io import BufferedWriter, IOBase + +from fsspec.core import get_fs_token_paths +from fsspec.utils import stringify_path +from pyarrow import orc as orc + +from dask import dataframe as dd +from dask.base import tokenize +from dask.dataframe.io.utils import _get_pyarrow_dtypes + +import cudf + + +def _read_orc_stripe(fs, path, stripe, columns, kwargs=None): + """Pull out specific columns from specific stripe""" + if kwargs is None: + kwargs = {} + with fs.open(path, "rb") as f: + df_stripe = cudf.read_orc( + f, stripes=[stripe], columns=columns, **kwargs + ) + return df_stripe + + +def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): + """Read ORC files into a :class:`.DataFrame`. + + Note that this function is mostly borrowed from upstream Dask. + + Parameters + ---------- + path : str or list[str] + Location of file(s), which can be a full URL with protocol specifier, + and may include glob character if a single string. + columns : None or list[str] + Columns to load. If None, loads all. 
+ filters : None or list of tuple or list of lists of tuples + If not None, specifies a filter predicate used to filter out + row groups using statistics stored for each row group as + Parquet metadata. Row groups that do not match the given + filter predicate are not read. The predicate is expressed in + `disjunctive normal form (DNF) + `__ + like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary + boolean logical combinations of single column predicates. The + innermost tuples each describe a single column predicate. The + list of inner predicates is interpreted as a conjunction + (AND), forming a more selective and multiple column predicate. + Finally, the outermost list combines these filters as a + disjunction (OR). Predicates may also be passed as a list of + tuples. This form is interpreted as a single conjunction. To + express OR in predicates, one must use the (preferred) + notation of list of lists of tuples. + storage_options : None or dict + Further parameters to pass to the bytes backend. + + See Also + -------- + dask.dataframe.read_orc + + Returns + ------- + dask_cudf.DataFrame + + """ + + storage_options = storage_options or {} + fs, fs_token, paths = get_fs_token_paths( + path, mode="rb", storage_options=storage_options + ) + schema = None + nstripes_per_file = [] + for path in paths: + with fs.open(path, "rb") as f: + o = orc.ORCFile(f) + if schema is None: + schema = o.schema + elif schema != o.schema: + raise ValueError( + "Incompatible schemas while parsing ORC files" + ) + nstripes_per_file.append(o.nstripes) + schema = _get_pyarrow_dtypes(schema, categories=None) + if columns is not None: + ex = set(columns) - set(schema) + if ex: + raise ValueError( + f"Requested columns ({ex}) not in schema ({set(schema)})" + ) + else: + columns = list(schema) + + with fs.open(paths[0], "rb") as f: + meta = cudf.read_orc( + f, + stripes=[0] if nstripes_per_file[0] else None, + columns=columns, + **kwargs, + ) + + name = "read-orc-" + tokenize(fs_token, path, columns, filters, **kwargs) + dsk = {} + N = 0 + for path, n in zip(paths, nstripes_per_file): + for stripe in ( + range(n) + if filters is None + else cudf.io.orc._filter_stripes(filters, path) + ): + dsk[(name, N)] = ( + _read_orc_stripe, + fs, + path, + stripe, + columns, + kwargs, + ) + N += 1 + + divisions = [None] * (len(dsk) + 1) + return dd.core.new_dd_object(dsk, name, meta, divisions) + + +def write_orc_partition(df, path, fs, filename, compression="snappy"): + full_path = fs.sep.join([path, filename]) + with fs.open(full_path, mode="wb") as out_file: + if not isinstance(out_file, IOBase): + out_file = BufferedWriter(out_file) + cudf.io.to_orc(df, out_file, compression=compression) + return full_path + + +def to_orc( + df, + path, + write_index=True, + storage_options=None, + compression="snappy", + compute=True, + **kwargs, +): + """ + Write a :class:`.DataFrame` to ORC file(s) (one file per partition). + + Parameters + ---------- + df : DataFrame + path : str or pathlib.Path + Destination directory for data. Prepend with protocol like ``s3://`` + or ``hdfs://`` for remote data. + write_index : boolean, optional + Whether or not to write the index. Defaults to True. + storage_options : None or dict + Further parameters to pass to the bytes backend. + compression : string or dict, optional + compute : bool, optional + If True (default) then the result is computed immediately. If + False then a :class:`~dask.delayed.Delayed` object is returned + for future computation. 
+ + """ + + from dask import compute as dask_compute, delayed + + # TODO: Use upstream dask implementation once available + # (see: Dask Issue#5596) + + if hasattr(path, "name"): + path = stringify_path(path) + fs, _, _ = get_fs_token_paths( + path, mode="wb", storage_options=storage_options + ) + # Trim any protocol information from the path before forwarding + path = fs._strip_protocol(path) + + if write_index: + df = df.reset_index() + else: + # Not writing index - might as well drop it + df = df.reset_index(drop=True) + + fs.mkdirs(path, exist_ok=True) + + # Use i_offset and df.npartitions to define file-name list + filenames = ["part.%i.orc" % i for i in range(df.npartitions)] + + # write parts + dwrite = delayed(write_orc_partition) + parts = [ + dwrite(d, path, fs, filename, compression=compression) + for d, filename in zip(df.to_delayed(), filenames) + ] + + if compute: + return dask_compute(*parts) + + return delayed(list)(parts) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/parquet.py b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py new file mode 100644 index 00000000000..39ac6474958 --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/io/parquet.py @@ -0,0 +1,513 @@ +# Copyright (c) 2019-2024, NVIDIA CORPORATION. +import itertools +import warnings +from functools import partial +from io import BufferedWriter, BytesIO, IOBase + +import numpy as np +import pandas as pd +from pyarrow import dataset as pa_ds, parquet as pq + +from dask import dataframe as dd +from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine + +try: + from dask.dataframe.io.parquet import ( + create_metadata_file as create_metadata_file_dd, + ) +except ImportError: + create_metadata_file_dd = None + +import cudf +from cudf.core.column import CategoricalColumn, as_column +from cudf.io import write_to_dataset +from cudf.io.parquet import _apply_post_filters, _normalize_filters +from cudf.utils.dtypes import cudf_dtype_from_pa_type + + +class CudfEngine(ArrowDatasetEngine): + @classmethod + def _create_dd_meta(cls, dataset_info, **kwargs): + # Start with pandas-version of meta + meta_pd = super()._create_dd_meta(dataset_info, **kwargs) + + # Convert to cudf + # (drop unsupported timezone information) + for k, v in meta_pd.dtypes.items(): + if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None: + meta_pd[k] = meta_pd[k].dt.tz_localize(None) + meta_cudf = cudf.from_pandas(meta_pd) + + # Re-set "object" dtypes to align with pa schema + kwargs = dataset_info.get("kwargs", {}) + set_object_dtypes_from_pa_schema( + meta_cudf, + kwargs.get("schema", None), + ) + + return meta_cudf + + @classmethod + def multi_support(cls): + # Assert that this class is CudfEngine + # and that multi-part reading is supported + return cls == CudfEngine + + @classmethod + def _read_paths( + cls, + paths, + fs, + columns=None, + row_groups=None, + filters=None, + partitions=None, + partitioning=None, + partition_keys=None, + open_file_options=None, + dataset_kwargs=None, + **kwargs, + ): + # Simplify row_groups if all None + if row_groups == [None for path in paths]: + row_groups = None + + # Make sure we read in the columns needed for row-wise + # filtering after IO. This means that one or more columns + # will be dropped almost immediately after IO. However, + # we do NEED these columns for accurate filtering. 
+ filters = _normalize_filters(filters) + projected_columns = None + if columns and filters: + projected_columns = [c for c in columns if c is not None] + columns = sorted( + set(v[0] for v in itertools.chain.from_iterable(filters)) + | set(projected_columns) + ) + + dataset_kwargs = dataset_kwargs or {} + dataset_kwargs["partitioning"] = partitioning or "hive" + + # Use cudf to read in data + try: + df = cudf.read_parquet( + paths, + engine="cudf", + columns=columns, + row_groups=row_groups if row_groups else None, + dataset_kwargs=dataset_kwargs, + categorical_partitions=False, + filesystem=fs, + **kwargs, + ) + except RuntimeError as err: + # TODO: Remove try/except after null-schema issue is resolved + # (See: https://github.com/rapidsai/cudf/issues/12702) + if len(paths) > 1: + df = cudf.concat( + [ + cudf.read_parquet( + path, + engine="cudf", + columns=columns, + row_groups=row_groups[i] if row_groups else None, + dataset_kwargs=dataset_kwargs, + categorical_partitions=False, + filesystem=fs, + **kwargs, + ) + for i, path in enumerate(paths) + ] + ) + else: + raise err + + # Apply filters (if any are defined) + df = _apply_post_filters(df, filters) + + if projected_columns: + # Elements of `projected_columns` may now be in the index. + # We must filter these names from our projection + projected_columns = [ + col for col in projected_columns if col in df._column_names + ] + df = df[projected_columns] + + if partitions and partition_keys is None: + # Use `HivePartitioning` by default + ds = pa_ds.dataset( + paths, + filesystem=fs, + **dataset_kwargs, + ) + frag = next(ds.get_fragments()) + if frag: + # Extract hive-partition keys, and make sure they + # are ordered the same as they are in `partitions` + raw_keys = pa_ds._get_partition_keys(frag.partition_expression) + partition_keys = [ + (hive_part.name, raw_keys[hive_part.name]) + for hive_part in partitions + ] + + if partition_keys: + if partitions is None: + raise ValueError("Must pass partition sets") + + for i, (name, index2) in enumerate(partition_keys): + if len(partitions[i].keys): + # Build a categorical column from `codes` directly + # (since the category is often a larger dtype) + codes = as_column( + partitions[i].keys.get_loc(index2), + length=len(df), + ) + df[name] = CategoricalColumn( + data=None, + size=codes.size, + dtype=cudf.CategoricalDtype( + categories=partitions[i].keys, ordered=False + ), + offset=codes.offset, + children=(codes,), + ) + elif name not in df.columns: + # Add non-categorical partition column + df[name] = as_column(index2, length=len(df)) + + return df + + @classmethod + def read_partition( + cls, + fs, + pieces, + columns, + index, + categories=(), + partitions=(), + filters=None, + partitioning=None, + schema=None, + open_file_options=None, + **kwargs, + ): + if columns is not None: + columns = [c for c in columns] + if isinstance(index, list): + columns += index + + dataset_kwargs = kwargs.get("dataset", {}) + partitioning = partitioning or dataset_kwargs.get("partitioning", None) + if isinstance(partitioning, dict): + partitioning = pa_ds.partitioning(**partitioning) + + # Check if we are actually selecting any columns + read_columns = columns + if schema and columns: + ignored = set(schema.names) - set(columns) + if not ignored: + read_columns = None + + if not isinstance(pieces, list): + pieces = [pieces] + + # Extract supported kwargs from `kwargs` + read_kwargs = kwargs.get("read", {}) + read_kwargs.update(open_file_options or {}) + check_file_size = read_kwargs.pop("check_file_size", 
None) + + # Wrap reading logic in a `try` block so that we can + # inform the user that the `read_parquet` partition + # size is too large for the available memory + try: + # Assume multi-piece read + paths = [] + rgs = [] + last_partition_keys = None + dfs = [] + + for i, piece in enumerate(pieces): + (path, row_group, partition_keys) = piece + row_group = None if row_group == [None] else row_group + + # File-size check to help "protect" users from change + # to up-stream `split_row_groups` default. We only + # check the file size if this partition corresponds + # to a full file, and `check_file_size` is defined + if check_file_size and len(pieces) == 1 and row_group is None: + file_size = fs.size(path) + if file_size > check_file_size: + warnings.warn( + f"A large parquet file ({file_size}B) is being " + f"used to create a DataFrame partition in " + f"read_parquet. This may cause out of memory " + f"exceptions in operations downstream. See the " + f"notes on split_row_groups in the read_parquet " + f"documentation. Setting split_row_groups " + f"explicitly will silence this warning." + ) + + if i > 0 and partition_keys != last_partition_keys: + dfs.append( + cls._read_paths( + paths, + fs, + columns=read_columns, + row_groups=rgs if rgs else None, + filters=filters, + partitions=partitions, + partitioning=partitioning, + partition_keys=last_partition_keys, + dataset_kwargs=dataset_kwargs, + **read_kwargs, + ) + ) + paths = [] + rgs = [] + last_partition_keys = None + paths.append(path) + rgs.append( + [row_group] + if not isinstance(row_group, list) + and row_group is not None + else row_group + ) + last_partition_keys = partition_keys + + dfs.append( + cls._read_paths( + paths, + fs, + columns=read_columns, + row_groups=rgs if rgs else None, + filters=filters, + partitions=partitions, + partitioning=partitioning, + partition_keys=last_partition_keys, + dataset_kwargs=dataset_kwargs, + **read_kwargs, + ) + ) + df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] + + # Re-set "object" dtypes align with pa schema + set_object_dtypes_from_pa_schema(df, schema) + + if index and (index[0] in df.columns): + df = df.set_index(index[0]) + elif index is False and df.index.names != [None]: + # If index=False, we shouldn't have a named index + df.reset_index(inplace=True) + + except MemoryError as err: + raise MemoryError( + "Parquet data was larger than the available GPU memory!\n\n" + "See the notes on split_row_groups in the read_parquet " + "documentation.\n\n" + "Original Error: " + str(err) + ) + raise err + + return df + + @staticmethod + def write_partition( + df, + path, + fs, + filename, + partition_on, + return_metadata, + fmd=None, + compression="snappy", + index_cols=None, + **kwargs, + ): + preserve_index = False + if len(index_cols) and set(index_cols).issubset(set(df.columns)): + df.set_index(index_cols, drop=True, inplace=True) + preserve_index = True + if partition_on: + md = write_to_dataset( + df=df, + root_path=path, + compression=compression, + filename=filename, + partition_cols=partition_on, + fs=fs, + preserve_index=preserve_index, + return_metadata=return_metadata, + statistics=kwargs.get("statistics", "ROWGROUP"), + int96_timestamps=kwargs.get("int96_timestamps", False), + row_group_size_bytes=kwargs.get("row_group_size_bytes", None), + row_group_size_rows=kwargs.get("row_group_size_rows", None), + max_page_size_bytes=kwargs.get("max_page_size_bytes", None), + max_page_size_rows=kwargs.get("max_page_size_rows", None), + storage_options=kwargs.get("storage_options", None), + 
) + else: + with fs.open(fs.sep.join([path, filename]), mode="wb") as out_file: + if not isinstance(out_file, IOBase): + out_file = BufferedWriter(out_file) + md = df.to_parquet( + path=out_file, + engine=kwargs.get("engine", "cudf"), + index=kwargs.get("index", None), + partition_cols=kwargs.get("partition_cols", None), + partition_file_name=kwargs.get( + "partition_file_name", None + ), + partition_offsets=kwargs.get("partition_offsets", None), + statistics=kwargs.get("statistics", "ROWGROUP"), + int96_timestamps=kwargs.get("int96_timestamps", False), + row_group_size_bytes=kwargs.get( + "row_group_size_bytes", None + ), + row_group_size_rows=kwargs.get( + "row_group_size_rows", None + ), + storage_options=kwargs.get("storage_options", None), + metadata_file_path=filename if return_metadata else None, + ) + # Return the schema needed to write the metadata + if return_metadata: + return [{"meta": md}] + else: + return [] + + @staticmethod + def write_metadata(parts, fmd, fs, path, append=False, **kwargs): + if parts: + # Aggregate metadata and write to _metadata file + metadata_path = fs.sep.join([path, "_metadata"]) + _meta = [] + if append and fmd is not None: + # Convert to bytes: + if isinstance(fmd, pq.FileMetaData): + with BytesIO() as myio: + fmd.write_metadata_file(myio) + myio.seek(0) + fmd = np.frombuffer(myio.read(), dtype="uint8") + _meta = [fmd] + _meta.extend([parts[i][0]["meta"] for i in range(len(parts))]) + _meta = ( + cudf.io.merge_parquet_filemetadata(_meta) + if len(_meta) > 1 + else _meta[0] + ) + with fs.open(metadata_path, "wb") as fil: + fil.write(memoryview(_meta)) + + @classmethod + def collect_file_metadata(cls, path, fs, file_path): + with fs.open(path, "rb") as f: + meta = pq.ParquetFile(f).metadata + if file_path: + meta.set_file_path(file_path) + with BytesIO() as myio: + meta.write_metadata_file(myio) + myio.seek(0) + meta = np.frombuffer(myio.read(), dtype="uint8") + return meta + + @classmethod + def aggregate_metadata(cls, meta_list, fs, out_path): + meta = ( + cudf.io.merge_parquet_filemetadata(meta_list) + if len(meta_list) > 1 + else meta_list[0] + ) + if out_path: + metadata_path = fs.sep.join([out_path, "_metadata"]) + with fs.open(metadata_path, "wb") as fil: + fil.write(memoryview(meta)) + return None + else: + return meta + + +def set_object_dtypes_from_pa_schema(df, schema): + # Simple utility to modify cudf DataFrame + # "object" dtypes to agree with a specific + # pyarrow schema. + if schema: + for col_name, col in df._data.items(): + if col_name is None: + # Pyarrow cannot handle `None` as a field name. + # However, this should be a simple range index that + # we can ignore anyway + continue + typ = cudf_dtype_from_pa_type(schema.field(col_name).type) + if ( + col_name in schema.names + and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) + and isinstance(col, cudf.core.column.StringColumn) + ): + df._data[col_name] = col.astype(typ) + + +def read_parquet(path, columns=None, **kwargs): + """ + Read parquet files into a :class:`.DataFrame`. + + Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` + to coordinate the execution of :func:`cudf.read_parquet`, and to + ultimately create a :class:`.DataFrame` collection. + + See the :func:`dask.dataframe.read_parquet` documentation for + all available options. 
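Before the examples below, a hedged sketch of a typical call; the dataset path, column names, and filter value are made up. Projected columns and DNF filters are forwarded to ``CudfEngine``, which reads with ``cudf.read_parquet`` and applies the row-wise filters after IO:

    # Sketch only: path, columns, and filter are illustrative.
    import dask_cudf

    ddf = dask_cudf.read_parquet(
        "/path/to/dataset/",
        columns=["x", "y"],
        filters=[("year", "=", 2024)],
    )
    print(ddf.npartitions)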
+ + Examples + -------- + >>> from dask_cudf import read_parquet + >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP + + When dealing with one or more large parquet files having an + in-memory footprint >15% device memory, the ``split_row_groups`` + argument should be used to map Parquet **row-groups** to DataFrame + partitions (instead of **files** to partitions). For example, the + following code will map each row-group to a distinct partition: + + >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP + + To map **multiple** row-groups to each partition, an integer can be + passed to ``split_row_groups`` to specify the **maximum** number of + row-groups allowed in each output partition: + + >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP + + See Also + -------- + cudf.read_parquet + dask.dataframe.read_parquet + """ + if isinstance(columns, str): + columns = [columns] + + # Set "check_file_size" option to determine whether we + # should check the parquet-file size. This check is meant + # to "protect" users from `split_row_groups` default changes + check_file_size = kwargs.pop("check_file_size", 500_000_000) + if ( + check_file_size + and ("split_row_groups" not in kwargs) + and ("chunksize" not in kwargs) + ): + # User is not specifying `split_row_groups` or `chunksize`, + # so we should warn them if/when a file is ~>0.5GB on disk. + # They can set `split_row_groups` explicitly to silence/skip + # this check + if "read" not in kwargs: + kwargs["read"] = {} + kwargs["read"]["check_file_size"] = check_file_size + + return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) + + +to_parquet = partial(dd.to_parquet, engine=CudfEngine) + +if create_metadata_file_dd is None: + create_metadata_file = create_metadata_file_dd +else: + create_metadata_file = partial(create_metadata_file_dd, engine=CudfEngine) diff --git a/python/dask_cudf/dask_cudf/_legacy/io/text.py b/python/dask_cudf/dask_cudf/_legacy/io/text.py new file mode 100644 index 00000000000..9cdb7c5220b --- /dev/null +++ b/python/dask_cudf/dask_cudf/_legacy/io/text.py @@ -0,0 +1,54 @@ +# Copyright (c) 2022-2024, NVIDIA CORPORATION. + +import os +from glob import glob + +import dask.dataframe as dd +from dask.base import tokenize +from dask.utils import apply, parse_bytes + +import cudf + + +def read_text(path, chunksize="256 MiB", **kwargs): + if isinstance(chunksize, str): + chunksize = parse_bytes(chunksize) + + if isinstance(path, list): + filenames = path + elif isinstance(path, str): + filenames = sorted(glob(path)) + elif hasattr(path, "__fspath__"): + filenames = sorted(glob(path.__fspath__())) + else: + raise TypeError(f"Path type not understood:{type(path)}") + + if not filenames: + msg = f"A file in: {filenames} does not exist." 
+ raise FileNotFoundError(msg) + + name = "read-text-" + tokenize(path, tokenize, **kwargs) + + if chunksize: + dsk = {} + i = 0 + for fn in filenames: + size = os.path.getsize(fn) + for start in range(0, size, chunksize): + kwargs1 = kwargs.copy() + kwargs1["byte_range"] = ( + start, + chunksize, + ) # specify which chunk of the file we care about + + dsk[(name, i)] = (apply, cudf.read_text, [fn], kwargs1) + i += 1 + else: + dsk = { + (name, i): (apply, cudf.read_text, [fn], kwargs) + for i, fn in enumerate(filenames) + } + + meta = cudf.Series([], dtype="O") + divisions = [None] * (len(dsk) + 1) + return dd.core.new_dd_object(dsk, name, meta, divisions) diff --git a/python/dask_cudf/dask_cudf/sorting.py b/python/dask_cudf/dask_cudf/_legacy/sorting.py similarity index 100% rename from python/dask_cudf/dask_cudf/sorting.py rename to python/dask_cudf/dask_cudf/_legacy/sorting.py diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index bead964a0ef..fb02e0ac772 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -46,7 +46,7 @@ from cudf.api.types import is_string_dtype from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from .core import DataFrame, Index, Series +from ._legacy.core import DataFrame, Index, Series get_parallel_type.register(cudf.DataFrame, lambda _: DataFrame) get_parallel_type.register(cudf.Series, lambda _: Series) @@ -574,7 +574,7 @@ class CudfBackendEntrypoint(DataFrameBackendEntrypoint): >>> with dask.config.set({"dataframe.backend": "cudf"}): ... ddf = dd.from_dict({"a": range(10)}) >>> type(ddf) - + """ @classmethod @@ -610,7 +610,7 @@ def from_dict( @staticmethod def read_parquet(*args, engine=None, **kwargs): - from dask_cudf.io.parquet import CudfEngine + from dask_cudf._legacy.io.parquet import CudfEngine _raise_unsupported_parquet_kwargs(**kwargs) return _default_backend( @@ -622,19 +622,19 @@ def read_parquet(*args, engine=None, **kwargs): @staticmethod def read_json(*args, **kwargs): - from dask_cudf.io.json import read_json + from dask_cudf._legacy.io.json import read_json return read_json(*args, **kwargs) @staticmethod def read_orc(*args, **kwargs): - from dask_cudf.io import read_orc + from dask_cudf._legacy.io import read_orc return read_orc(*args, **kwargs) @staticmethod def read_csv(*args, **kwargs): - from dask_cudf.io import read_csv + from dask_cudf._legacy.io import read_csv return read_csv(*args, **kwargs) @@ -674,7 +674,7 @@ class CudfDXBackendEntrypoint(DataFrameBackendEntrypoint): def to_backend(data, **kwargs): import dask_expr as dx - from dask_cudf.expr._expr import ToCudfBackend + from dask_cudf._expr.expr import ToCudfBackend return dx.new_collection(ToCudfBackend(data, kwargs)) @@ -710,7 +710,7 @@ def read_parquet(path, *args, filesystem="fsspec", engine=None, **kwargs): and filesystem.lower() == "fsspec" ): # Default "fsspec" filesystem - from dask_cudf.io.parquet import CudfEngine + from dask_cudf._legacy.io.parquet import CudfEngine _raise_unsupported_parquet_kwargs(**kwargs) return _default_backend( @@ -736,7 +736,7 @@ def read_parquet(path, *args, filesystem="fsspec", engine=None, **kwargs): from dask.core import flatten from dask.dataframe.utils import pyarrow_strings_enabled - from dask_cudf.expr._expr import CudfReadParquetPyarrowFS + from dask_cudf.io.parquet import CudfReadParquetPyarrowFS if args: raise ValueError(f"Unexpected positional arguments: {args}") @@ -862,7 +862,7 @@ def read_csv( @staticmethod def 
read_json(*args, **kwargs): - from dask_cudf.io.json import read_json as read_json_impl + from dask_cudf._legacy.io.json import read_json as read_json_impl return read_json_impl(*args, **kwargs) @@ -870,14 +870,7 @@ def read_json(*args, **kwargs): def read_orc(*args, **kwargs): from dask_expr import from_legacy_dataframe - from dask_cudf.io.orc import read_orc as legacy_read_orc + from dask_cudf._legacy.io.orc import read_orc as legacy_read_orc ddf = legacy_read_orc(*args, **kwargs) return from_legacy_dataframe(ddf) - - -# Import/register cudf-specific classes for dask-expr -try: - import dask_cudf.expr # noqa: F401 -except ImportError: - pass diff --git a/python/dask_cudf/dask_cudf/core.py b/python/dask_cudf/dask_cudf/core.py index 3181c8d69ec..7d6d5c05cbe 100644 --- a/python/dask_cudf/dask_cudf/core.py +++ b/python/dask_cudf/dask_cudf/core.py @@ -1,705 +1,25 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2020-2024, NVIDIA CORPORATION. -import math import textwrap -import warnings -import numpy as np -import pandas as pd -from tlz import partition_all - -from dask import dataframe as dd -from dask.base import normalize_token, tokenize -from dask.dataframe.core import ( - Scalar, - handle_out, - make_meta as dask_make_meta, - map_partitions, -) -from dask.dataframe.utils import raise_on_meta_error -from dask.highlevelgraph import HighLevelGraph -from dask.utils import M, OperatorMethodMixin, apply, derived_from, funcname +import dask.dataframe as dd +from dask.tokenize import tokenize import cudf -from cudf import _lib as libcudf from cudf.utils.performance_tracking import _dask_cudf_performance_tracking -from dask_cudf import sorting -from dask_cudf.accessors import ListMethods, StructMethods -from dask_cudf.sorting import _deprecate_shuffle_kwarg, _get_shuffle_method - - -class _Frame(dd.core._Frame, OperatorMethodMixin): - """Superclass for DataFrame and Series - - Parameters - ---------- - dsk : dict - The dask graph to compute this DataFrame - name : str - The key prefix that specifies which keys in the dask comprise this - particular DataFrame / Series - meta : cudf.DataFrame, cudf.Series, or cudf.Index - An empty cudf object with names, dtypes, and indices matching the - expected output. - divisions : tuple of index values - Values along which we partition our blocks on the index - """ - - def _is_partition_type(self, meta): - return isinstance(meta, self._partition_type) - - def __repr__(self): - s = "" - return s % (type(self).__name__, len(self.dask), self.npartitions) - - @_dask_cudf_performance_tracking - def to_dask_dataframe(self, **kwargs): - """Create a dask.dataframe object from a dask_cudf object - - WARNING: This API is deprecated, and may not work properly - when query-planning is active. Please use `*.to_backend("pandas")` - to convert the underlying data to pandas. - """ - - warnings.warn( - "The `to_dask_dataframe` API is now deprecated. " - "Please use `*.to_backend('pandas')` instead.", - FutureWarning, - ) - - return self.to_backend("pandas", **kwargs) - - -concat = dd.concat - - -normalize_token.register(_Frame, lambda a: a._name) - - -class DataFrame(_Frame, dd.core.DataFrame): - """ - A distributed Dask DataFrame where the backing dataframe is a - :class:`cuDF DataFrame `. - - Typically you would not construct this object directly, but rather - use one of Dask-cuDF's IO routines. - - Most operations on :doc:`Dask DataFrames ` are - supported, with many of the same caveats. 
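A minimal sketch of how such a collection is usually obtained and used; the data below is invented for illustration, and ``from_cudf`` is the non-IO entry point:

    # Sketch only: the frame contents are illustrative.
    import cudf
    import dask_cudf

    gdf = cudf.DataFrame({"a": [1, 2, 3, 4], "b": ["x", "y", "x", "y"]})
    ddf = dask_cudf.from_cudf(gdf, npartitions=2)

    # Operations compose lazily across partitions; compute() materializes
    # a single cudf object on the client.
    print(ddf.groupby("b").a.sum().compute())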
- - """ - - _partition_type = cudf.DataFrame - - @_dask_cudf_performance_tracking - def _assign_column(self, k, v): - def assigner(df, k, v): - out = df.copy() - out[k] = v - return out - - meta = assigner(self._meta, k, dask_make_meta(v)) - return self.map_partitions(assigner, k, v, meta=meta) - - @_dask_cudf_performance_tracking - def apply_rows(self, func, incols, outcols, kwargs=None, cache_key=None): - import uuid - - if kwargs is None: - kwargs = {} - - if cache_key is None: - cache_key = uuid.uuid4() - - def do_apply_rows(df, func, incols, outcols, kwargs): - return df.apply_rows( - func, incols, outcols, kwargs, cache_key=cache_key - ) - - meta = do_apply_rows(self._meta, func, incols, outcols, kwargs) - return self.map_partitions( - do_apply_rows, func, incols, outcols, kwargs, meta=meta - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def merge(self, other, shuffle_method=None, **kwargs): - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().merge( - other, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def join(self, other, shuffle_method=None, **kwargs): - # CuDF doesn't support "right" join yet - how = kwargs.pop("how", "left") - if how == "right": - return other.join(other=self, how="left", **kwargs) - - on = kwargs.pop("on", None) - if isinstance(on, tuple): - on = list(on) - return super().join( - other, - how=how, - on=on, - shuffle_method=_get_shuffle_method(shuffle_method), - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def set_index( - self, - other, - sorted=False, - divisions=None, - shuffle_method=None, - **kwargs, - ): - pre_sorted = sorted - del sorted - - if divisions == "quantile": - warnings.warn( - "Using divisions='quantile' is now deprecated. 
" - "Please raise an issue on github if you believe " - "this feature is necessary.", - FutureWarning, - ) - - if ( - divisions == "quantile" - or isinstance(divisions, (cudf.DataFrame, cudf.Series)) - or ( - isinstance(other, str) - and cudf.api.types.is_string_dtype(self[other].dtype) - ) - ): - # Let upstream-dask handle "pre-sorted" case - if pre_sorted: - return dd.shuffle.set_sorted_index( - self, other, divisions=divisions, **kwargs - ) - - by = other - if not isinstance(other, list): - by = [by] - if len(by) > 1: - raise ValueError("Dask does not support MultiIndex (yet).") - if divisions == "quantile": - divisions = None - - # Use dask_cudf's sort_values - df = self.sort_values( - by, - max_branch=kwargs.get("max_branch", None), - divisions=divisions, - set_divisions=True, - ignore_index=True, - shuffle_method=shuffle_method, - ) - - # Ignore divisions if its a dataframe - if isinstance(divisions, cudf.DataFrame): - divisions = None - - # Set index and repartition - df2 = df.map_partitions( - sorting.set_index_post, - index_name=other, - drop=kwargs.get("drop", True), - column_dtype=df.columns.dtype, - ) - npartitions = kwargs.get("npartitions", self.npartitions) - partition_size = kwargs.get("partition_size", None) - if partition_size: - return df2.repartition(partition_size=partition_size) - if not divisions and df2.npartitions != npartitions: - return df2.repartition(npartitions=npartitions) - if divisions and df2.npartitions != len(divisions) - 1: - return df2.repartition(divisions=divisions) - return df2 - - return super().set_index( - other, - sorted=pre_sorted, - shuffle_method=_get_shuffle_method(shuffle_method), - divisions=divisions, - **kwargs, - ) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def sort_values( - self, - by, - ignore_index=False, - max_branch=None, - divisions=None, - set_divisions=False, - ascending=True, - na_position="last", - sort_function=None, - sort_function_kwargs=None, - shuffle_method=None, - **kwargs, - ): - if kwargs: - raise ValueError( - f"Unsupported input arguments passed : {list(kwargs.keys())}" - ) - - df = sorting.sort_values( - self, - by, - max_branch=max_branch, - divisions=divisions, - set_divisions=set_divisions, - ignore_index=ignore_index, - ascending=ascending, - na_position=na_position, - shuffle_method=shuffle_method, - sort_function=sort_function, - sort_function_kwargs=sort_function_kwargs, - ) - - if ignore_index: - return df.reset_index(drop=True) - return df - - @_dask_cudf_performance_tracking - def to_parquet(self, path, *args, **kwargs): - """Calls dask.dataframe.io.to_parquet with CudfEngine backend""" - from dask_cudf.io import to_parquet - - return to_parquet(self, path, *args, **kwargs) - - @_dask_cudf_performance_tracking - def to_orc(self, path, **kwargs): - """Calls dask_cudf.io.to_orc""" - from dask_cudf.io import to_orc - - return to_orc(self, path, **kwargs) - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - numeric_only=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var( - axis=axis, skipna=skipna, numeric_only=numeric_only - ) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - numeric_only=numeric_only, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - 
else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_deprecate_shuffle_kwarg - @_dask_cudf_performance_tracking - def shuffle(self, *args, shuffle_method=None, **kwargs): - """Wraps dask.dataframe DataFrame.shuffle method""" - return super().shuffle( - *args, shuffle_method=_get_shuffle_method(shuffle_method), **kwargs - ) - - @_dask_cudf_performance_tracking - def groupby(self, by=None, **kwargs): - from .groupby import CudfDataFrameGroupBy - - return CudfDataFrameGroupBy(self, by=by, **kwargs) - - -@_dask_cudf_performance_tracking -def sum_of_squares(x): - x = x.astype("f8")._column - outcol = libcudf.reduce.reduce("sum_of_squares", x) - return cudf.Series._from_column(outcol) - - -@_dask_cudf_performance_tracking -def var_aggregate(x2, x, n, ddof): - try: - with warnings.catch_warnings(record=True): - warnings.simplefilter("always") - result = (x2 / n) - (x / n) ** 2 - if ddof != 0: - result = result * n / (n - ddof) - return result - except ZeroDivisionError: - return np.float64(np.nan) - - -@_dask_cudf_performance_tracking -def nlargest_agg(x, **kwargs): - return cudf.concat(x).nlargest(**kwargs) - - -@_dask_cudf_performance_tracking -def nsmallest_agg(x, **kwargs): - return cudf.concat(x).nsmallest(**kwargs) - - -class Series(_Frame, dd.core.Series): - _partition_type = cudf.Series - - @_dask_cudf_performance_tracking - def count(self, split_every=False): - return reduction( - [self], - chunk=M.count, - aggregate=np.sum, - split_every=split_every, - meta="i8", - ) - - @_dask_cudf_performance_tracking - def mean(self, split_every=False): - sum = self.sum(split_every=split_every) - n = self.count(split_every=split_every) - return sum / n - - @derived_from(pd.DataFrame) - @_dask_cudf_performance_tracking - def var( - self, - axis=None, - skipna=True, - ddof=1, - split_every=False, - dtype=None, - out=None, - naive=False, - ): - axis = self._validate_axis(axis) - meta = self._meta_nonempty.var(axis=axis, skipna=skipna) - if axis == 1: - result = map_partitions( - M.var, - self, - meta=meta, - token=self._token_prefix + "var", - axis=axis, - skipna=skipna, - ddof=ddof, - ) - return handle_out(out, result) - elif naive: - return _naive_var(self, meta, skipna, ddof, split_every, out) - else: - return _parallel_var(self, meta, skipna, split_every, out) - - @_dask_cudf_performance_tracking - def groupby(self, *args, **kwargs): - from .groupby import CudfSeriesGroupBy - - return CudfSeriesGroupBy(self, *args, **kwargs) - - @property # type: ignore - @_dask_cudf_performance_tracking - def list(self): - return ListMethods(self) - - @property # type: ignore - @_dask_cudf_performance_tracking - def struct(self): - return StructMethods(self) - - -class Index(Series, dd.core.Index): - _partition_type = cudf.Index # type: ignore - - -@_dask_cudf_performance_tracking -def _naive_var(ddf, meta, skipna, ddof, split_every, out): - num = ddf._get_numeric_data() - x = 1.0 * num.sum(skipna=skipna, split_every=split_every) - x2 = 1.0 * (num**2).sum(skipna=skipna, split_every=split_every) - n = num.count(split_every=split_every) - name = ddf._token_prefix + "var" - result = map_partitions( - var_aggregate, x2, x, n, token=name, meta=meta, ddof=ddof - ) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _parallel_var(ddf, meta, skipna, split_every, out): - def _local_var(x, skipna): - if skipna: - n = x.count() - avg = x.mean(skipna=skipna) - else: - # Not skipping nulls, so 
might as well - # avoid the full `count` operation - n = len(x) - avg = x.sum(skipna=skipna) / n - m2 = ((x - avg) ** 2).sum(skipna=skipna) - return n, avg, m2 - - def _aggregate_var(parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta**2 * n_a * n_b / n - return n, avg, m2 - - def _finalize_var(vals): - n, _, m2 = vals - return m2 / (n - 1) - - # Build graph - nparts = ddf.npartitions - if not split_every: - split_every = nparts - name = "var-" + tokenize(skipna, split_every, out) - local_name = "local-" + name - num = ddf._get_numeric_data() - dsk = { - (local_name, n, 0): (_local_var, (num._name, n), skipna) - for n in range(nparts) - } - - # Use reduction tree - widths = [nparts] - while nparts > 1: - nparts = math.ceil(nparts / split_every) - widths.append(nparts) - height = len(widths) - for depth in range(1, height): - for group in range(widths[depth]): - p_max = widths[depth - 1] - lstart = split_every * group - lstop = min(lstart + split_every, p_max) - node_list = [ - (local_name, p, depth - 1) for p in range(lstart, lstop) - ] - dsk[(local_name, group, depth)] = (_aggregate_var, node_list) - if height == 1: - group = depth = 0 - dsk[(name, 0)] = (_finalize_var, (local_name, group, depth)) - - graph = HighLevelGraph.from_collections(name, dsk, dependencies=[num, ddf]) - result = dd.core.new_dd_object(graph, name, meta, (None, None)) - if isinstance(ddf, DataFrame): - result.divisions = (min(ddf.columns), max(ddf.columns)) - return handle_out(out, result) - - -@_dask_cudf_performance_tracking -def _extract_meta(x): - """ - Extract internal cache data (``_meta``) from dask_cudf objects - """ - if isinstance(x, (Scalar, _Frame)): - return x._meta - elif isinstance(x, list): - return [_extract_meta(_x) for _x in x] - elif isinstance(x, tuple): - return tuple(_extract_meta(_x) for _x in x) - elif isinstance(x, dict): - return {k: _extract_meta(v) for k, v in x.items()} - return x - - -@_dask_cudf_performance_tracking -def _emulate(func, *args, **kwargs): - """ - Apply a function using args / kwargs. If arguments contain dd.DataFrame / - dd.Series, using internal cache (``_meta``) for calculation - """ - with raise_on_meta_error(funcname(func)): - return func(*_extract_meta(args), **_extract_meta(kwargs)) - - -@_dask_cudf_performance_tracking -def align_partitions(args): - """Align partitions between dask_cudf objects. - - Note that if all divisions are unknown, but have equal npartitions, then - they will be passed through unchanged. - """ - dfs = [df for df in args if isinstance(df, _Frame)] - if not dfs: - return args - - divisions = dfs[0].divisions - if not all(df.divisions == divisions for df in dfs): - raise NotImplementedError("Aligning mismatched partitions") - return args - - -@_dask_cudf_performance_tracking -def reduction( - args, - chunk=None, - aggregate=None, - combine=None, - meta=None, - token=None, - chunk_kwargs=None, - aggregate_kwargs=None, - combine_kwargs=None, - split_every=None, - **kwargs, -): - """Generic tree reduction operation. - - Parameters - ---------- - args : - Positional arguments for the `chunk` function. All `dask.dataframe` - objects should be partitioned and indexed equivalently. 
- chunk : function [block-per-arg] -> block - Function to operate on each block of data - aggregate : function list-of-blocks -> block - Function to operate on the list of results of chunk - combine : function list-of-blocks -> block, optional - Function to operate on intermediate lists of results of chunk - in a tree-reduction. If not provided, defaults to aggregate. - $META - token : str, optional - The name to use for the output keys. - chunk_kwargs : dict, optional - Keywords for the chunk function only. - aggregate_kwargs : dict, optional - Keywords for the aggregate function only. - combine_kwargs : dict, optional - Keywords for the combine function only. - split_every : int, optional - Group partitions into groups of this size while performing a - tree-reduction. If set to False, no tree-reduction will be used, - and all intermediates will be concatenated and passed to ``aggregate``. - Default is 8. - kwargs : - All remaining keywords will be passed to ``chunk``, ``aggregate``, and - ``combine``. - """ - if chunk_kwargs is None: - chunk_kwargs = dict() - if aggregate_kwargs is None: - aggregate_kwargs = dict() - chunk_kwargs.update(kwargs) - aggregate_kwargs.update(kwargs) - - if combine is None: - if combine_kwargs: - raise ValueError("`combine_kwargs` provided with no `combine`") - combine = aggregate - combine_kwargs = aggregate_kwargs - else: - if combine_kwargs is None: - combine_kwargs = dict() - combine_kwargs.update(kwargs) - - if not isinstance(args, (tuple, list)): - args = [args] - - npartitions = {arg.npartitions for arg in args if isinstance(arg, _Frame)} - if len(npartitions) > 1: - raise ValueError("All arguments must have same number of partitions") - npartitions = npartitions.pop() - - if split_every is None: - split_every = 8 - elif split_every is False: - split_every = npartitions - elif split_every < 2 or not isinstance(split_every, int): - raise ValueError("split_every must be an integer >= 2") - - token_key = tokenize( - token or (chunk, aggregate), - meta, - args, - chunk_kwargs, - aggregate_kwargs, - combine_kwargs, - split_every, +# This module provides backward compatibility for legacy import patterns. 
+if dd.DASK_EXPR_ENABLED: + from dask_cudf._expr.collection import ( # noqa: E402 + DataFrame, + Index, + Series, ) +else: + from dask_cudf._legacy.core import DataFrame, Index, Series # noqa: F401 - # Chunk - a = f"{token or funcname(chunk)}-chunk-{token_key}" - if len(args) == 1 and isinstance(args[0], _Frame) and not chunk_kwargs: - dsk = { - (a, 0, i): (chunk, key) - for i, key in enumerate(args[0].__dask_keys__()) - } - else: - dsk = { - (a, 0, i): ( - apply, - chunk, - [(x._name, i) if isinstance(x, _Frame) else x for x in args], - chunk_kwargs, - ) - for i in range(args[0].npartitions) - } - # Combine - b = f"{token or funcname(combine)}-combine-{token_key}" - k = npartitions - depth = 0 - while k > split_every: - for part_i, inds in enumerate(partition_all(split_every, range(k))): - conc = (list, [(a, depth, i) for i in inds]) - dsk[(b, depth + 1, part_i)] = ( - (apply, combine, [conc], combine_kwargs) - if combine_kwargs - else (combine, conc) - ) - k = part_i + 1 - a = b - depth += 1 - - # Aggregate - b = f"{token or funcname(aggregate)}-agg-{token_key}" - conc = (list, [(a, depth, i) for i in range(k)]) - if aggregate_kwargs: - dsk[(b, 0)] = (apply, aggregate, [conc], aggregate_kwargs) - else: - dsk[(b, 0)] = (aggregate, conc) - - if meta is None: - meta_chunk = _emulate(apply, chunk, args, chunk_kwargs) - meta = _emulate(apply, aggregate, [[meta_chunk]], aggregate_kwargs) - meta = dask_make_meta(meta) - - graph = HighLevelGraph.from_collections(b, dsk, dependencies=args) - return dd.core.new_dd_object(graph, b, meta, (None, None)) +concat = dd.concat # noqa: F401 @_dask_cudf_performance_tracking @@ -744,59 +64,3 @@ def from_cudf(data, npartitions=None, chunksize=None, sort=True, name=None): # since dask-expr does not provide a docstring for from_pandas. + textwrap.dedent(dd.from_pandas.__doc__ or "") ) - - -@_dask_cudf_performance_tracking -def from_dask_dataframe(df): - """ - Convert a Dask :class:`dask.dataframe.DataFrame` to a Dask-cuDF - one. - - WARNING: This API is deprecated, and may not work properly - when query-planning is active. Please use `*.to_backend("cudf")` - to convert the underlying data to cudf. - - Parameters - ---------- - df : dask.dataframe.DataFrame - The Dask dataframe to convert - - Returns - ------- - dask_cudf.DataFrame : A new Dask collection backed by cuDF objects - """ - - warnings.warn( - "The `from_dask_dataframe` API is now deprecated. " - "Please use `*.to_backend('cudf')` instead.", - FutureWarning, - ) - - return df.to_backend("cudf") - - -for name in ( - "add", - "sub", - "mul", - "truediv", - "floordiv", - "mod", - "pow", - "radd", - "rsub", - "rmul", - "rtruediv", - "rfloordiv", - "rmod", - "rpow", -): - meth = getattr(cudf.DataFrame, name) - DataFrame._bind_operator_method(name, meth, original=cudf.Series) - - meth = getattr(cudf.Series, name) - Series._bind_operator_method(name, meth, original=cudf.Series) - -for name in ("lt", "gt", "le", "ge", "ne", "eq"): - meth = getattr(cudf.Series, name) - Series._bind_comparison_method(name, meth, original=cudf.Series) diff --git a/python/dask_cudf/dask_cudf/expr/__init__.py b/python/dask_cudf/dask_cudf/expr/__init__.py deleted file mode 100644 index 6dadadd5263..00000000000 --- a/python/dask_cudf/dask_cudf/expr/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from dask import config - -# Check if dask-dataframe is using dask-expr. 
-# For dask>=2024.3.0, a null value will default to True -QUERY_PLANNING_ON = config.get("dataframe.query-planning", None) is not False - -# Register custom expressions and collections -if QUERY_PLANNING_ON: - # Broadly avoid "p2p" and "disk" defaults for now - config.set({"dataframe.shuffle.method": "tasks"}) - - try: - import dask_cudf.expr._collection # noqa: F401 - import dask_cudf.expr._expr # noqa: F401 - - except ImportError as err: - # Dask *should* raise an error before this. - # However, we can still raise here to be certain. - raise RuntimeError( - "Failed to register the 'cudf' backend for dask-expr." - " Please make sure you have dask-expr installed.\n" - f"Error Message: {err}" - ) diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py deleted file mode 100644 index c7cf66fbffd..00000000000 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ /dev/null @@ -1,511 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. -import functools - -import dask_expr._shuffle as _shuffle_module -import pandas as pd -from dask_expr import new_collection -from dask_expr._cumulative import CumulativeBlockwise -from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns -from dask_expr._groupby import ( - DecomposableGroupbyAggregation, - GroupbyAggregation, -) -from dask_expr._reductions import Reduction, Var -from dask_expr.io.io import FusedParquetIO -from dask_expr.io.parquet import FragmentWrapper, ReadParquetPyarrowFS - -from dask.dataframe.core import ( - _concat, - is_dataframe_like, - make_meta, - meta_nonempty, -) -from dask.dataframe.dispatch import is_categorical_dtype -from dask.typing import no_default - -import cudf - -## -## Custom expressions -## - - -def _get_spec_info(gb): - if isinstance(gb.arg, (dict, list)): - aggs = gb.arg.copy() - else: - aggs = gb.arg - - if gb._slice and not isinstance(aggs, dict): - aggs = {gb._slice: aggs} - - gb_cols = gb._by_columns - if isinstance(gb_cols, str): - gb_cols = [gb_cols] - columns = [c for c in gb.frame.columns if c not in gb_cols] - if not isinstance(aggs, dict): - aggs = {col: aggs for col in columns} - - # Assert if our output will have a MultiIndex; this will be the case if - # any value in the `aggs` dict is not a string (i.e. multiple/named - # aggregations per column) - str_cols_out = True - aggs_renames = {} - for col in aggs: - if isinstance(aggs[col], str) or callable(aggs[col]): - aggs[col] = [aggs[col]] - elif isinstance(aggs[col], dict): - str_cols_out = False - col_aggs = [] - for k, v in aggs[col].items(): - aggs_renames[col, v] = k - col_aggs.append(v) - aggs[col] = col_aggs - else: - str_cols_out = False - if col in gb_cols: - columns.append(col) - - return { - "aggs": aggs, - "columns": columns, - "str_cols_out": str_cols_out, - "aggs_renames": aggs_renames, - } - - -def _get_meta(gb): - spec_info = gb.spec_info - gb_cols = gb._by_columns - aggs = spec_info["aggs"].copy() - aggs_renames = spec_info["aggs_renames"] - if spec_info["str_cols_out"]: - # Metadata should use `str` for dict values if that is - # what the user originally specified (column names will - # be str, rather than tuples). 
- for col in aggs: - aggs[col] = aggs[col][0] - _meta = gb.frame._meta.groupby(gb_cols).agg(aggs) - if aggs_renames: - col_array = [] - agg_array = [] - for col, agg in _meta.columns: - col_array.append(col) - agg_array.append(aggs_renames.get((col, agg), agg)) - _meta.columns = pd.MultiIndex.from_arrays([col_array, agg_array]) - return _meta - - -class DecomposableCudfGroupbyAgg(DecomposableGroupbyAggregation): - sep = "___" - - @functools.cached_property - def spec_info(self): - return _get_spec_info(self) - - @functools.cached_property - def _meta(self): - return _get_meta(self) - - @property - def shuffle_by_index(self): - return False # We always group by column(s) - - @classmethod - def chunk(cls, df, *by, **kwargs): - from dask_cudf.groupby import _groupby_partition_agg - - return _groupby_partition_agg(df, **kwargs) - - @classmethod - def combine(cls, inputs, **kwargs): - from dask_cudf.groupby import _tree_node_agg - - return _tree_node_agg(_concat(inputs), **kwargs) - - @classmethod - def aggregate(cls, inputs, **kwargs): - from dask_cudf.groupby import _finalize_gb_agg - - return _finalize_gb_agg(_concat(inputs), **kwargs) - - @property - def chunk_kwargs(self) -> dict: - dropna = True if self.dropna is None else self.dropna - return { - "gb_cols": self._by_columns, - "aggs": self.spec_info["aggs"], - "columns": self.spec_info["columns"], - "dropna": dropna, - "sort": self.sort, - "sep": self.sep, - } - - @property - def combine_kwargs(self) -> dict: - dropna = True if self.dropna is None else self.dropna - return { - "gb_cols": self._by_columns, - "dropna": dropna, - "sort": self.sort, - "sep": self.sep, - } - - @property - def aggregate_kwargs(self) -> dict: - dropna = True if self.dropna is None else self.dropna - final_columns = self._slice or self._meta.columns - return { - "gb_cols": self._by_columns, - "aggs": self.spec_info["aggs"], - "columns": self.spec_info["columns"], - "final_columns": final_columns, - "as_index": True, - "dropna": dropna, - "sort": self.sort, - "sep": self.sep, - "str_cols_out": self.spec_info["str_cols_out"], - "aggs_renames": self.spec_info["aggs_renames"], - } - - -class CudfGroupbyAgg(GroupbyAggregation): - @functools.cached_property - def spec_info(self): - return _get_spec_info(self) - - @functools.cached_property - def _meta(self): - return _get_meta(self) - - def _lower(self): - return DecomposableCudfGroupbyAgg( - self.frame, - self.arg, - self.observed, - self.dropna, - self.split_every, - self.split_out, - self.sort, - self.shuffle_method, - self._slice, - *self.by, - ) - - -def _maybe_get_custom_expr( - gb, - aggs, - split_every=None, - split_out=None, - shuffle_method=None, - **kwargs, -): - from dask_cudf.groupby import ( - OPTIMIZED_AGGS, - _aggs_optimized, - _redirect_aggs, - ) - - if kwargs: - # Unsupported key-word arguments - return None - - if not hasattr(gb.obj._meta, "to_pandas"): - # Not cuDF-backed data - return None - - _aggs = _redirect_aggs(aggs) - if not _aggs_optimized(_aggs, OPTIMIZED_AGGS): - # One or more aggregations are unsupported - return None - - return CudfGroupbyAgg( - gb.obj.expr, - _aggs, - gb.observed, - gb.dropna, - split_every, - split_out, - gb.sort, - shuffle_method, - gb._slice, - *gb.by, - ) - - -class CudfFusedParquetIO(FusedParquetIO): - @staticmethod - def _load_multiple_files( - frag_filters, - columns, - schema, - *to_pandas_args, - ): - import pyarrow as pa - - from dask.base import apply, tokenize - from dask.threaded import get - - token = tokenize(frag_filters, columns, schema) - name = 
f"pq-file-{token}" - dsk = { - (name, i): ( - CudfReadParquetPyarrowFS._fragment_to_table, - frag, - filter, - columns, - schema, - ) - for i, (frag, filter) in enumerate(frag_filters) - } - dsk[name] = ( - apply, - pa.concat_tables, - [list(dsk.keys())], - {"promote_options": "permissive"}, - ) - return CudfReadParquetPyarrowFS._table_to_pandas( - get(dsk, name), - *to_pandas_args, - ) - - -class CudfReadParquetPyarrowFS(ReadParquetPyarrowFS): - @functools.cached_property - def _dataset_info(self): - from dask_cudf.io.parquet import set_object_dtypes_from_pa_schema - - dataset_info = super()._dataset_info - meta_pd = dataset_info["base_meta"] - if isinstance(meta_pd, cudf.DataFrame): - return dataset_info - - # Convert to cudf - # (drop unsupported timezone information) - for k, v in meta_pd.dtypes.items(): - if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None: - meta_pd[k] = meta_pd[k].dt.tz_localize(None) - meta_cudf = cudf.from_pandas(meta_pd) - - # Re-set "object" dtypes to align with pa schema - kwargs = dataset_info.get("kwargs", {}) - set_object_dtypes_from_pa_schema( - meta_cudf, - kwargs.get("schema", None), - ) - - dataset_info["base_meta"] = meta_cudf - self.operands[type(self)._parameters.index("_dataset_info_cache")] = ( - dataset_info - ) - return dataset_info - - @staticmethod - def _table_to_pandas(table, index_name): - df = cudf.DataFrame.from_arrow(table) - if index_name is not None: - df = df.set_index(index_name) - return df - - def _filtered_task(self, index: int): - columns = self.columns.copy() - index_name = self.index.name - if self.index is not None: - index_name = self.index.name - schema = self._dataset_info["schema"].remove_metadata() - if index_name: - if columns is None: - columns = list(schema.names) - columns.append(index_name) - return ( - self._table_to_pandas, - ( - self._fragment_to_table, - FragmentWrapper(self.fragments[index], filesystem=self.fs), - self.filters, - columns, - schema, - ), - index_name, - ) - - def _tune_up(self, parent): - if self._fusion_compression_factor >= 1: - return - if isinstance(parent, CudfFusedParquetIO): - return - return parent.substitute(self, CudfFusedParquetIO(self)) - - -class RenameAxisCudf(RenameAxis): - # TODO: Remove this after rename_axis is supported in cudf - # (See: https://github.com/rapidsai/cudf/issues/16895) - @staticmethod - def operation(df, index=no_default, **kwargs): - if index != no_default: - df.index.name = index - return df - raise NotImplementedError( - "Only `index` is supported for the cudf backend" - ) - - -class ToCudfBackend(Elemwise): - # TODO: Inherit from ToBackend when rapids-dask-dependency - # is pinned to dask>=2024.8.1 - _parameters = ["frame", "options"] - _projection_passthrough = True - _filter_passthrough = True - _preserves_partitioning_information = True - - @staticmethod - def operation(df, options): - from dask_cudf.backends import to_cudf_dispatch - - return to_cudf_dispatch(df, **options) - - def _simplify_down(self): - if isinstance( - self.frame._meta, (cudf.DataFrame, cudf.Series, cudf.Index) - ): - # We already have cudf data - return self.frame - - -## -## Custom expression patching -## - - -# This can be removed after cudf#15176 is addressed. 
-# See: https://github.com/rapidsai/cudf/issues/15176 -class PatchCumulativeBlockwise(CumulativeBlockwise): - @property - def _args(self) -> list: - return self.operands[:1] - - @property - def _kwargs(self) -> dict: - # Must pass axis and skipna as kwargs in cudf - return {"axis": self.axis, "skipna": self.skipna} - - -CumulativeBlockwise._args = PatchCumulativeBlockwise._args -CumulativeBlockwise._kwargs = PatchCumulativeBlockwise._kwargs - - -# The upstream Var code uses `Series.values`, and relies on numpy -# for most of the logic. Unfortunately, cudf -> cupy conversion -# is not supported for data containing null values. Therefore, -# we must implement our own version of Var for now. This logic -# is mostly copied from dask-cudf. - - -class VarCudf(Reduction): - # Uses the parallel version of Welford's online algorithm (Chan '79) - # (http://i.stanford.edu/pub/cstr/reports/cs/tr/79/773/CS-TR-79-773.pdf) - _parameters = ["frame", "skipna", "ddof", "numeric_only", "split_every"] - _defaults = { - "skipna": True, - "ddof": 1, - "numeric_only": False, - "split_every": False, - } - - @functools.cached_property - def _meta(self): - return make_meta( - meta_nonempty(self.frame._meta).var( - skipna=self.skipna, numeric_only=self.numeric_only - ) - ) - - @property - def chunk_kwargs(self): - return dict(skipna=self.skipna, numeric_only=self.numeric_only) - - @property - def combine_kwargs(self): - return {} - - @property - def aggregate_kwargs(self): - return dict(ddof=self.ddof) - - @classmethod - def reduction_chunk(cls, x, skipna=True, numeric_only=False): - kwargs = {"numeric_only": numeric_only} if is_dataframe_like(x) else {} - if skipna or numeric_only: - n = x.count(**kwargs) - kwargs["skipna"] = skipna - avg = x.mean(**kwargs) - else: - # Not skipping nulls, so might as well - # avoid the full `count` operation - n = len(x) - kwargs["skipna"] = skipna - avg = x.sum(**kwargs) / n - if numeric_only: - # Workaround for cudf bug - # (see: https://github.com/rapidsai/cudf/issues/13731) - x = x[n.index] - m2 = ((x - avg) ** 2).sum(**kwargs) - return n, avg, m2 - - @classmethod - def reduction_combine(cls, parts): - n, avg, m2 = parts[0] - for i in range(1, len(parts)): - n_a, avg_a, m2_a = n, avg, m2 - n_b, avg_b, m2_b = parts[i] - n = n_a + n_b - avg = (n_a * avg_a + n_b * avg_b) / n - delta = avg_b - avg_a - m2 = m2_a + m2_b + delta**2 * n_a * n_b / n - return n, avg, m2 - - @classmethod - def reduction_aggregate(cls, vals, ddof=1): - vals = cls.reduction_combine(vals) - n, _, m2 = vals - return m2 / (n - ddof) - - -def _patched_var( - self, axis=0, skipna=True, ddof=1, numeric_only=False, split_every=False -): - if axis == 0: - if hasattr(self._meta, "to_pandas"): - return VarCudf(self, skipna, ddof, numeric_only, split_every) - else: - return Var(self, skipna, ddof, numeric_only, split_every) - elif axis == 1: - return VarColumns(self, skipna, ddof, numeric_only) - else: - raise ValueError(f"axis={axis} not supported. 
Please specify 0 or 1") - - -Expr.var = _patched_var - - -# Temporary work-around for missing cudf + categorical support -# See: https://github.com/rapidsai/cudf/issues/11795 -# TODO: Fix RepartitionQuantiles and remove this in cudf>24.06 - -_original_get_divisions = _shuffle_module._get_divisions - - -def _patched_get_divisions(frame, other, *args, **kwargs): - # NOTE: The following two lines contains the "patch" - # (we simply convert the partitioning column to pandas) - if is_categorical_dtype(other._meta.dtype) and hasattr( - other.frame._meta, "to_pandas" - ): - other = new_collection(other).to_backend("pandas")._expr - - # Call "original" function - return _original_get_divisions(frame, other, *args, **kwargs) - - -_shuffle_module._get_divisions = _patched_get_divisions diff --git a/python/dask_cudf/dask_cudf/expr/_groupby.py b/python/dask_cudf/dask_cudf/expr/_groupby.py deleted file mode 100644 index 8a16fe7615d..00000000000 --- a/python/dask_cudf/dask_cudf/expr/_groupby.py +++ /dev/null @@ -1,123 +0,0 @@ -# Copyright (c) 2024, NVIDIA CORPORATION. - -from dask_expr._collection import new_collection -from dask_expr._groupby import ( - GroupBy as DXGroupBy, - SeriesGroupBy as DXSeriesGroupBy, - SingleAggregation, -) -from dask_expr._util import is_scalar - -from dask.dataframe.groupby import Aggregation - -from cudf.core.groupby.groupby import _deprecate_collect - -from dask_cudf.expr._expr import _maybe_get_custom_expr - -## -## Custom groupby classes -## - - -class ListAgg(SingleAggregation): - @staticmethod - def groupby_chunk(arg): - return arg.agg(list) - - @staticmethod - def groupby_aggregate(arg): - gb = arg.agg(list) - if gb.ndim > 1: - for col in gb.columns: - gb[col] = gb[col].list.concat() - return gb - else: - return gb.list.concat() - - -list_aggregation = Aggregation( - name="list", - chunk=ListAgg.groupby_chunk, - agg=ListAgg.groupby_aggregate, -) - - -def _translate_arg(arg): - # Helper function to translate args so that - # they can be processed correctly by upstream - # dask & dask-expr. Right now, the only necessary - # translation is list aggregations. - if isinstance(arg, dict): - return {k: _translate_arg(v) for k, v in arg.items()} - elif isinstance(arg, list): - return [_translate_arg(x) for x in arg] - elif arg in ("collect", "list", list): - return list_aggregation - else: - return arg - - -# We define our own GroupBy classes in Dask cuDF for -# the following reasons: -# (1) We want to use a custom `aggregate` algorithm -# that performs multiple aggregations on the -# same dataframe partition at once. The upstream -# algorithm breaks distinct aggregations into -# separate tasks. 
-# (2) We need to work around missing `observed=False` -# support: -# https://github.com/rapidsai/cudf/issues/15173 - - -class GroupBy(DXGroupBy): - def __init__(self, *args, observed=None, **kwargs): - observed = observed if observed is not None else True - super().__init__(*args, observed=observed, **kwargs) - - def __getitem__(self, key): - if is_scalar(key): - return SeriesGroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - dropna=self.dropna, - observed=self.observed, - ) - g = GroupBy( - self.obj, - by=self.by, - slice=key, - sort=self.sort, - dropna=self.dropna, - observed=self.observed, - group_keys=self.group_keys, - ) - return g - - def collect(self, **kwargs): - _deprecate_collect() - return self._single_agg(ListAgg, **kwargs) - - def aggregate(self, arg, fused=True, **kwargs): - if ( - fused - and (expr := _maybe_get_custom_expr(self, arg, **kwargs)) - is not None - ): - return new_collection(expr) - else: - return super().aggregate(_translate_arg(arg), **kwargs) - - -class SeriesGroupBy(DXSeriesGroupBy): - def __init__(self, *args, observed=None, **kwargs): - observed = observed if observed is not None else True - super().__init__(*args, observed=observed, **kwargs) - - def collect(self, **kwargs): - _deprecate_collect() - return self._single_agg(ListAgg, **kwargs) - - def aggregate(self, arg, **kwargs): - return super().aggregate(_translate_arg(arg), **kwargs) diff --git a/python/dask_cudf/dask_cudf/io/__init__.py b/python/dask_cudf/dask_cudf/io/__init__.py index 0421bd755f4..1e0f24d78ce 100644 --- a/python/dask_cudf/dask_cudf/io/__init__.py +++ b/python/dask_cudf/dask_cudf/io/__init__.py @@ -1,11 +1,32 @@ -# Copyright (c) 2018-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. -from .csv import read_csv # noqa: F401 -from .json import read_json # noqa: F401 -from .orc import read_orc, to_orc # noqa: F401 -from .text import read_text # noqa: F401 +from dask_cudf import _deprecated_api -try: - from .parquet import read_parquet, to_parquet # noqa: F401 -except ImportError: - pass +from . import csv, orc, json, parquet, text # noqa: F401 + + +read_csv = _deprecated_api( + "dask_cudf.io.read_csv", new_api="dask_cudf.read_csv" +) +read_json = _deprecated_api( + "dask_cudf.io.read_json", new_api="dask_cudf.read_json" +) +read_orc = _deprecated_api( + "dask_cudf.io.read_orc", new_api="dask_cudf.read_orc" +) +to_orc = _deprecated_api( + "dask_cudf.io.to_orc", + new_api="dask_cudf._legacy.io.to_orc", + rec="Please use the DataFrame.to_orc method instead.", +) +read_text = _deprecated_api( + "dask_cudf.io.read_text", new_api="dask_cudf.read_text" +) +read_parquet = _deprecated_api( + "dask_cudf.io.read_parquet", new_api="dask_cudf.read_parquet" +) +to_parquet = _deprecated_api( + "dask_cudf.io.to_parquet", + new_api="dask_cudf._legacy.io.parquet.to_parquet", + rec="Please use the DataFrame.to_parquet method instead.", +) diff --git a/python/dask_cudf/dask_cudf/io/csv.py b/python/dask_cudf/dask_cudf/io/csv.py index fa5400344f9..b22b31a591f 100644 --- a/python/dask_cudf/dask_cudf/io/csv.py +++ b/python/dask_cudf/dask_cudf/io/csv.py @@ -1,222 +1,8 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. 
-import os -from glob import glob -from warnings import warn +from dask_cudf import _deprecated_api -from fsspec.utils import infer_compression - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.csv import make_reader -from dask.utils import apply, parse_bytes - -import cudf - - -def read_csv(path, blocksize="default", **kwargs): - """ - Read CSV files into a :class:`.DataFrame`. - - This API parallelizes the :func:`cudf:cudf.read_csv` function in - the following ways: - - It supports loading many files at once using globstrings: - - >>> import dask_cudf - >>> df = dask_cudf.read_csv("myfiles.*.csv") - - In some cases it can break up large files: - - >>> df = dask_cudf.read_csv("largefile.csv", blocksize="256 MiB") - - It can read CSV files from external resources (e.g. S3, HTTP, FTP) - - >>> df = dask_cudf.read_csv("s3://bucket/myfiles.*.csv") - >>> df = dask_cudf.read_csv("https://www.mycloud.com/sample.csv") - - Internally ``read_csv`` uses :func:`cudf:cudf.read_csv` and - supports many of the same keyword arguments with the same - performance guarantees. See the docstring for - :func:`cudf:cudf.read_csv` for more information on available - keyword arguments. - - Parameters - ---------- - path : str, path object, or file-like object - Either a path to a file (a str, :py:class:`pathlib.Path`, or - py._path.local.LocalPath), URL (including http, ftp, and S3 - locations), or any object with a read() method (such as - builtin :py:func:`open` file handler function or - :py:class:`~io.StringIO`). - blocksize : int or str, default "256 MiB" - The target task partition size. If ``None``, a single block - is used for each file. - **kwargs : dict - Passthrough key-word arguments that are sent to - :func:`cudf:cudf.read_csv`. - - Notes - ----- - If any of `skipfooter`/`skiprows`/`nrows` are passed, - `blocksize` will default to None. - - Examples - -------- - >>> import dask_cudf - >>> ddf = dask_cudf.read_csv("sample.csv", usecols=["a", "b"]) - >>> ddf.compute() - a b - 0 1 hi - 1 2 hello - 2 3 ai - - """ - - # Handle `chunksize` deprecation - if "chunksize" in kwargs: - chunksize = kwargs.pop("chunksize", "default") - warn( - "`chunksize` is deprecated and will be removed in the future. " - "Please use `blocksize` instead.", - FutureWarning, - ) - if blocksize == "default": - blocksize = chunksize - - # Set default `blocksize` - if blocksize == "default": - if ( - kwargs.get("skipfooter", 0) != 0 - or kwargs.get("skiprows", 0) != 0 - or kwargs.get("nrows", None) is not None - ): - # Cannot read in blocks if skipfooter, - # skiprows or nrows is passed. - blocksize = None - else: - blocksize = "256 MiB" - - if "://" in str(path): - func = make_reader(cudf.read_csv, "read_csv", "CSV") - return func(path, blocksize=blocksize, **kwargs) - else: - return _internal_read_csv(path=path, blocksize=blocksize, **kwargs) - - -def _internal_read_csv(path, blocksize="256 MiB", **kwargs): - if isinstance(blocksize, str): - blocksize = parse_bytes(blocksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." 
- raise FileNotFoundError(msg) - - name = "read-csv-" + tokenize( - path, tokenize, **kwargs - ) # TODO: get last modified time - - compression = kwargs.get("compression", "infer") - - if compression == "infer": - # Infer compression from first path by default - compression = infer_compression(filenames[0]) - - if compression and blocksize: - # compressed CSVs reading must read the entire file - kwargs.pop("byte_range", None) - warn( - "Warning %s compression does not support breaking apart files\n" - "Please ensure that each individual file can fit in memory and\n" - "use the keyword ``blocksize=None to remove this message``\n" - "Setting ``blocksize=(size of file)``" % compression - ) - blocksize = None - - if blocksize is None: - return read_csv_without_blocksize(path, **kwargs) - - # Let dask.dataframe generate meta - dask_reader = make_reader(cudf.read_csv, "read_csv", "CSV") - kwargs1 = kwargs.copy() - usecols = kwargs1.pop("usecols", None) - dtype = kwargs1.pop("dtype", None) - meta = dask_reader(filenames[0], **kwargs1)._meta - names = meta.columns - if usecols or dtype: - # Regenerate meta with original kwargs if - # `usecols` or `dtype` was specified - meta = dask_reader(filenames[0], **kwargs)._meta - - dsk = {} - i = 0 - dtypes = meta.dtypes.values - - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, blocksize): - kwargs2 = kwargs.copy() - kwargs2["byte_range"] = ( - start, - blocksize, - ) # specify which chunk of the file we care about - if start != 0: - kwargs2["names"] = names # no header in the middle of the file - kwargs2["header"] = None - dsk[(name, i)] = (apply, _read_csv, [fn, dtypes], kwargs2) - - i += 1 - - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) - - -def _read_csv(fn, dtypes=None, **kwargs): - return cudf.read_csv(fn, **kwargs) - - -def read_csv_without_blocksize(path, **kwargs): - """Read entire CSV with optional compression (gzip/zip) - - Parameters - ---------- - path : str - path to files (support for glob) - """ - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - name = "read-csv-" + tokenize(path, **kwargs) - - meta_kwargs = kwargs.copy() - if "skipfooter" in meta_kwargs: - meta_kwargs.pop("skipfooter") - if "nrows" in meta_kwargs: - meta_kwargs.pop("nrows") - # Read "head" of first file (first 5 rows). - # Convert to empty df for metadata. - meta = cudf.read_csv(filenames[0], nrows=5, **meta_kwargs).iloc[:0] - - graph = { - (name, i): (apply, cudf.read_csv, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - divisions = [None] * (len(filenames) + 1) - - return dd.core.new_dd_object(graph, name, meta, divisions) +read_csv = _deprecated_api( + "dask_cudf.io.csv.read_csv", + new_api="dask_cudf.read_csv", +) diff --git a/python/dask_cudf/dask_cudf/io/json.py b/python/dask_cudf/dask_cudf/io/json.py index 98c5ceedb76..8f85ea54c0a 100644 --- a/python/dask_cudf/dask_cudf/io/json.py +++ b/python/dask_cudf/dask_cudf/io/json.py @@ -1,209 +1,8 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. 
-from functools import partial +from dask_cudf import _deprecated_api -import numpy as np -from fsspec.core import get_compression, get_fs_token_paths - -import dask -from dask.utils import parse_bytes - -import cudf -from cudf.core.column import as_column -from cudf.utils.ioutils import _is_local_filesystem - -from dask_cudf.backends import _default_backend - - -def _read_json_partition( - paths, - fs=None, - include_path_column=False, - path_converter=None, - **kwargs, -): - # Transfer all data up front for remote storage - sources = ( - paths - if fs is None - else fs.cat_ranges( - paths, - [0] * len(paths), - fs.sizes(paths), - ) - ) - - if include_path_column: - # Add "path" column. - # Must iterate over sources sequentially - if not isinstance(include_path_column, str): - include_path_column = "path" - converted_paths = ( - paths - if path_converter is None - else [path_converter(path) for path in paths] - ) - dfs = [] - for i, source in enumerate(sources): - df = cudf.read_json(source, **kwargs) - df[include_path_column] = as_column( - converted_paths[i], length=len(df) - ) - dfs.append(df) - return cudf.concat(dfs) - else: - # Pass sources directly to cudf - return cudf.read_json(sources, **kwargs) - - -def read_json( - url_path, - engine="auto", - blocksize=None, - orient="records", - lines=None, - compression="infer", - aggregate_files=True, - **kwargs, -): - """Read JSON data into a :class:`.DataFrame`. - - This function wraps :func:`dask.dataframe.read_json`, and passes - ``engine=partial(cudf.read_json, engine="auto")`` by default. - - Parameters - ---------- - url_path : str, list of str - Location to read from. If a string, can include a glob character to - find a set of file names. - Supports protocol specifications such as ``"s3://"``. - engine : str or Callable, default "auto" - - If str, this value will be used as the ``engine`` argument - when :func:`cudf.read_json` is used to create each partition. - If a :obj:`~collections.abc.Callable`, this value will be used as the - underlying function used to create each partition from JSON - data. The default value is "auto", so that - ``engine=partial(cudf.read_json, engine="auto")`` will be - passed to :func:`dask.dataframe.read_json` by default. - aggregate_files : bool or int - Whether to map multiple files to each output partition. If True, - the `blocksize` argument will be used to determine the number of - files in each partition. If any one file is larger than `blocksize`, - the `aggregate_files` argument will be ignored. If an integer value - is specified, the `blocksize` argument will be ignored, and that - number of files will be mapped to each partition. Default is True. - **kwargs : - Key-word arguments to pass through to :func:`dask.dataframe.read_json`. - - Returns - ------- - :class:`.DataFrame` - - Examples - -------- - Load single file - - >>> from dask_cudf import read_json - >>> read_json('myfile.json') # doctest: +SKIP - - Load large line-delimited JSON files using partitions of approx - 256MB size - - >>> read_json('data/file*.csv', blocksize=2**28) # doctest: +SKIP - - Load nested JSON data - - >>> read_json('myfile.json') # doctest: +SKIP - - See Also - -------- - dask.dataframe.read_json - - """ - - if lines is None: - lines = orient == "records" - if orient != "records" and lines: - raise ValueError( - 'Line-delimited JSON is only available with orient="records".' 
- ) - if blocksize and (orient != "records" or not lines): - raise ValueError( - "JSON file chunking only allowed for JSON-lines" - "input (orient='records', lines=True)." - ) - - inputs = [] - if aggregate_files and blocksize or int(aggregate_files) > 1: - # Attempt custom read if we are mapping multiple files - # to each output partition. Otherwise, upstream logic - # is sufficient. - - storage_options = kwargs.get("storage_options", {}) - fs, _, paths = get_fs_token_paths( - url_path, mode="rb", storage_options=storage_options - ) - if isinstance(aggregate_files, int) and aggregate_files > 1: - # Map a static file count to each partition - inputs = [ - paths[offset : offset + aggregate_files] - for offset in range(0, len(paths), aggregate_files) - ] - elif aggregate_files is True and blocksize: - # Map files dynamically (using blocksize) - file_sizes = fs.sizes(paths) # NOTE: This can be slow - blocksize = parse_bytes(blocksize) - if all([file_size <= blocksize for file_size in file_sizes]): - counts = np.unique( - np.floor(np.cumsum(file_sizes) / blocksize), - return_counts=True, - )[1] - offsets = np.concatenate([[0], counts.cumsum()]) - inputs = [ - paths[offsets[i] : offsets[i + 1]] - for i in range(len(offsets) - 1) - ] - - if inputs: - # Inputs were successfully populated. - # Use custom _read_json_partition function - # to generate each partition. - - compression = get_compression( - url_path[0] if isinstance(url_path, list) else url_path, - compression, - ) - _kwargs = dict( - orient=orient, - lines=lines, - compression=compression, - include_path_column=kwargs.get("include_path_column", False), - path_converter=kwargs.get("path_converter"), - ) - if not _is_local_filesystem(fs): - _kwargs["fs"] = fs - # TODO: Generate meta more efficiently - meta = _read_json_partition(inputs[0][:1], **_kwargs) - return dask.dataframe.from_map( - _read_json_partition, - inputs, - meta=meta, - **_kwargs, - ) - - # Fall back to dask.dataframe.read_json - return _default_backend( - dask.dataframe.read_json, - url_path, - engine=( - partial(cudf.read_json, engine=engine) - if isinstance(engine, str) - else engine - ), - blocksize=blocksize, - orient=orient, - lines=lines, - compression=compression, - **kwargs, - ) +read_json = _deprecated_api( + "dask_cudf.io.json.read_json", + new_api="dask_cudf.read_json", +) diff --git a/python/dask_cudf/dask_cudf/io/orc.py b/python/dask_cudf/dask_cudf/io/orc.py index bed69f038b0..5219cdacc31 100644 --- a/python/dask_cudf/dask_cudf/io/orc.py +++ b/python/dask_cudf/dask_cudf/io/orc.py @@ -1,199 +1,13 @@ -# Copyright (c) 2020-2024, NVIDIA CORPORATION. - -from io import BufferedWriter, IOBase - -from fsspec.core import get_fs_token_paths -from fsspec.utils import stringify_path -from pyarrow import orc as orc - -from dask import dataframe as dd -from dask.base import tokenize -from dask.dataframe.io.utils import _get_pyarrow_dtypes - -import cudf - - -def _read_orc_stripe(fs, path, stripe, columns, kwargs=None): - """Pull out specific columns from specific stripe""" - if kwargs is None: - kwargs = {} - with fs.open(path, "rb") as f: - df_stripe = cudf.read_orc( - f, stripes=[stripe], columns=columns, **kwargs - ) - return df_stripe - - -def read_orc(path, columns=None, filters=None, storage_options=None, **kwargs): - """Read ORC files into a :class:`.DataFrame`. - - Note that this function is mostly borrowed from upstream Dask. 
- - Parameters - ---------- - path : str or list[str] - Location of file(s), which can be a full URL with protocol specifier, - and may include glob character if a single string. - columns : None or list[str] - Columns to load. If None, loads all. - filters : None or list of tuple or list of lists of tuples - If not None, specifies a filter predicate used to filter out - row groups using statistics stored for each row group as - Parquet metadata. Row groups that do not match the given - filter predicate are not read. The predicate is expressed in - `disjunctive normal form (DNF) - `__ - like ``[[('x', '=', 0), ...], ...]``. DNF allows arbitrary - boolean logical combinations of single column predicates. The - innermost tuples each describe a single column predicate. The - list of inner predicates is interpreted as a conjunction - (AND), forming a more selective and multiple column predicate. - Finally, the outermost list combines these filters as a - disjunction (OR). Predicates may also be passed as a list of - tuples. This form is interpreted as a single conjunction. To - express OR in predicates, one must use the (preferred) - notation of list of lists of tuples. - storage_options : None or dict - Further parameters to pass to the bytes backend. - - See Also - -------- - dask.dataframe.read_orc - - Returns - ------- - dask_cudf.DataFrame - - """ - - storage_options = storage_options or {} - fs, fs_token, paths = get_fs_token_paths( - path, mode="rb", storage_options=storage_options - ) - schema = None - nstripes_per_file = [] - for path in paths: - with fs.open(path, "rb") as f: - o = orc.ORCFile(f) - if schema is None: - schema = o.schema - elif schema != o.schema: - raise ValueError( - "Incompatible schemas while parsing ORC files" - ) - nstripes_per_file.append(o.nstripes) - schema = _get_pyarrow_dtypes(schema, categories=None) - if columns is not None: - ex = set(columns) - set(schema) - if ex: - raise ValueError( - f"Requested columns ({ex}) not in schema ({set(schema)})" - ) - else: - columns = list(schema) - - with fs.open(paths[0], "rb") as f: - meta = cudf.read_orc( - f, - stripes=[0] if nstripes_per_file[0] else None, - columns=columns, - **kwargs, - ) - - name = "read-orc-" + tokenize(fs_token, path, columns, filters, **kwargs) - dsk = {} - N = 0 - for path, n in zip(paths, nstripes_per_file): - for stripe in ( - range(n) - if filters is None - else cudf.io.orc._filter_stripes(filters, path) - ): - dsk[(name, N)] = ( - _read_orc_stripe, - fs, - path, - stripe, - columns, - kwargs, - ) - N += 1 - - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) - - -def write_orc_partition(df, path, fs, filename, compression="snappy"): - full_path = fs.sep.join([path, filename]) - with fs.open(full_path, mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - cudf.io.to_orc(df, out_file, compression=compression) - return full_path - - -def to_orc( - df, - path, - write_index=True, - storage_options=None, - compression="snappy", - compute=True, - **kwargs, -): - """ - Write a :class:`.DataFrame` to ORC file(s) (one file per partition). - - Parameters - ---------- - df : DataFrame - path : str or pathlib.Path - Destination directory for data. Prepend with protocol like ``s3://`` - or ``hdfs://`` for remote data. - write_index : boolean, optional - Whether or not to write the index. Defaults to True. - storage_options : None or dict - Further parameters to pass to the bytes backend. 
- compression : string or dict, optional - compute : bool, optional - If True (default) then the result is computed immediately. If - False then a :class:`~dask.delayed.Delayed` object is returned - for future computation. - - """ - - from dask import compute as dask_compute, delayed - - # TODO: Use upstream dask implementation once available - # (see: Dask Issue#5596) - - if hasattr(path, "name"): - path = stringify_path(path) - fs, _, _ = get_fs_token_paths( - path, mode="wb", storage_options=storage_options - ) - # Trim any protocol information from the path before forwarding - path = fs._strip_protocol(path) - - if write_index: - df = df.reset_index() - else: - # Not writing index - might as well drop it - df = df.reset_index(drop=True) - - fs.mkdirs(path, exist_ok=True) - - # Use i_offset and df.npartitions to define file-name list - filenames = ["part.%i.orc" % i for i in range(df.npartitions)] - - # write parts - dwrite = delayed(write_orc_partition) - parts = [ - dwrite(d, path, fs, filename, compression=compression) - for d, filename in zip(df.to_delayed(), filenames) - ] - - if compute: - return dask_compute(*parts) - - return delayed(list)(parts) +# Copyright (c) 2024, NVIDIA CORPORATION. + +from dask_cudf import _deprecated_api + +read_orc = _deprecated_api( + "dask_cudf.io.orc.read_orc", + new_api="dask_cudf.read_orc", +) +to_orc = _deprecated_api( + "dask_cudf.io.orc.to_orc", + new_api="dask_cudf._legacy.io.orc.to_orc", + rec="Please use the DataFrame.to_orc method instead.", +) diff --git a/python/dask_cudf/dask_cudf/io/parquet.py b/python/dask_cudf/dask_cudf/io/parquet.py index 39ac6474958..48cea7266af 100644 --- a/python/dask_cudf/dask_cudf/io/parquet.py +++ b/python/dask_cudf/dask_cudf/io/parquet.py @@ -1,35 +1,66 @@ -# Copyright (c) 2019-2024, NVIDIA CORPORATION. -import itertools -import warnings -from functools import partial -from io import BufferedWriter, BytesIO, IOBase +# Copyright (c) 2024, NVIDIA CORPORATION. 
+import functools -import numpy as np import pandas as pd -from pyarrow import dataset as pa_ds, parquet as pq +from dask_expr.io.io import FusedParquetIO +from dask_expr.io.parquet import FragmentWrapper, ReadParquetPyarrowFS -from dask import dataframe as dd -from dask.dataframe.io.parquet.arrow import ArrowDatasetEngine +import cudf -try: - from dask.dataframe.io.parquet import ( - create_metadata_file as create_metadata_file_dd, - ) -except ImportError: - create_metadata_file_dd = None +from dask_cudf import _deprecated_api + +# Dask-expr imports CudfEngine from this module +from dask_cudf._legacy.io.parquet import CudfEngine # noqa: F401 + + +class CudfFusedParquetIO(FusedParquetIO): + @staticmethod + def _load_multiple_files( + frag_filters, + columns, + schema, + *to_pandas_args, + ): + import pyarrow as pa + + from dask.base import apply, tokenize + from dask.threaded import get + + token = tokenize(frag_filters, columns, schema) + name = f"pq-file-{token}" + dsk = { + (name, i): ( + CudfReadParquetPyarrowFS._fragment_to_table, + frag, + filter, + columns, + schema, + ) + for i, (frag, filter) in enumerate(frag_filters) + } + dsk[name] = ( + apply, + pa.concat_tables, + [list(dsk.keys())], + {"promote_options": "permissive"}, + ) + return CudfReadParquetPyarrowFS._table_to_pandas( + get(dsk, name), + *to_pandas_args, + ) -import cudf -from cudf.core.column import CategoricalColumn, as_column -from cudf.io import write_to_dataset -from cudf.io.parquet import _apply_post_filters, _normalize_filters -from cudf.utils.dtypes import cudf_dtype_from_pa_type +class CudfReadParquetPyarrowFS(ReadParquetPyarrowFS): + @functools.cached_property + def _dataset_info(self): + from dask_cudf._legacy.io.parquet import ( + set_object_dtypes_from_pa_schema, + ) -class CudfEngine(ArrowDatasetEngine): - @classmethod - def _create_dd_meta(cls, dataset_info, **kwargs): - # Start with pandas-version of meta - meta_pd = super()._create_dd_meta(dataset_info, **kwargs) + dataset_info = super()._dataset_info + meta_pd = dataset_info["base_meta"] + if isinstance(meta_pd, cudf.DataFrame): + return dataset_info # Convert to cudf # (drop unsupported timezone information) @@ -45,469 +76,60 @@ def _create_dd_meta(cls, dataset_info, **kwargs): kwargs.get("schema", None), ) - return meta_cudf - - @classmethod - def multi_support(cls): - # Assert that this class is CudfEngine - # and that multi-part reading is supported - return cls == CudfEngine - - @classmethod - def _read_paths( - cls, - paths, - fs, - columns=None, - row_groups=None, - filters=None, - partitions=None, - partitioning=None, - partition_keys=None, - open_file_options=None, - dataset_kwargs=None, - **kwargs, - ): - # Simplify row_groups if all None - if row_groups == [None for path in paths]: - row_groups = None - - # Make sure we read in the columns needed for row-wise - # filtering after IO. This means that one or more columns - # will be dropped almost immediately after IO. However, - # we do NEED these columns for accurate filtering. 
- filters = _normalize_filters(filters) - projected_columns = None - if columns and filters: - projected_columns = [c for c in columns if c is not None] - columns = sorted( - set(v[0] for v in itertools.chain.from_iterable(filters)) - | set(projected_columns) - ) - - dataset_kwargs = dataset_kwargs or {} - dataset_kwargs["partitioning"] = partitioning or "hive" - - # Use cudf to read in data - try: - df = cudf.read_parquet( - paths, - engine="cudf", - columns=columns, - row_groups=row_groups if row_groups else None, - dataset_kwargs=dataset_kwargs, - categorical_partitions=False, - filesystem=fs, - **kwargs, - ) - except RuntimeError as err: - # TODO: Remove try/except after null-schema issue is resolved - # (See: https://github.com/rapidsai/cudf/issues/12702) - if len(paths) > 1: - df = cudf.concat( - [ - cudf.read_parquet( - path, - engine="cudf", - columns=columns, - row_groups=row_groups[i] if row_groups else None, - dataset_kwargs=dataset_kwargs, - categorical_partitions=False, - filesystem=fs, - **kwargs, - ) - for i, path in enumerate(paths) - ] - ) - else: - raise err - - # Apply filters (if any are defined) - df = _apply_post_filters(df, filters) - - if projected_columns: - # Elements of `projected_columns` may now be in the index. - # We must filter these names from our projection - projected_columns = [ - col for col in projected_columns if col in df._column_names - ] - df = df[projected_columns] - - if partitions and partition_keys is None: - # Use `HivePartitioning` by default - ds = pa_ds.dataset( - paths, - filesystem=fs, - **dataset_kwargs, - ) - frag = next(ds.get_fragments()) - if frag: - # Extract hive-partition keys, and make sure they - # are ordered the same as they are in `partitions` - raw_keys = pa_ds._get_partition_keys(frag.partition_expression) - partition_keys = [ - (hive_part.name, raw_keys[hive_part.name]) - for hive_part in partitions - ] - - if partition_keys: - if partitions is None: - raise ValueError("Must pass partition sets") - - for i, (name, index2) in enumerate(partition_keys): - if len(partitions[i].keys): - # Build a categorical column from `codes` directly - # (since the category is often a larger dtype) - codes = as_column( - partitions[i].keys.get_loc(index2), - length=len(df), - ) - df[name] = CategoricalColumn( - data=None, - size=codes.size, - dtype=cudf.CategoricalDtype( - categories=partitions[i].keys, ordered=False - ), - offset=codes.offset, - children=(codes,), - ) - elif name not in df.columns: - # Add non-categorical partition column - df[name] = as_column(index2, length=len(df)) - - return df - - @classmethod - def read_partition( - cls, - fs, - pieces, - columns, - index, - categories=(), - partitions=(), - filters=None, - partitioning=None, - schema=None, - open_file_options=None, - **kwargs, - ): - if columns is not None: - columns = [c for c in columns] - if isinstance(index, list): - columns += index - - dataset_kwargs = kwargs.get("dataset", {}) - partitioning = partitioning or dataset_kwargs.get("partitioning", None) - if isinstance(partitioning, dict): - partitioning = pa_ds.partitioning(**partitioning) - - # Check if we are actually selecting any columns - read_columns = columns - if schema and columns: - ignored = set(schema.names) - set(columns) - if not ignored: - read_columns = None - - if not isinstance(pieces, list): - pieces = [pieces] - - # Extract supported kwargs from `kwargs` - read_kwargs = kwargs.get("read", {}) - read_kwargs.update(open_file_options or {}) - check_file_size = read_kwargs.pop("check_file_size", 
None) - - # Wrap reading logic in a `try` block so that we can - # inform the user that the `read_parquet` partition - # size is too large for the available memory - try: - # Assume multi-piece read - paths = [] - rgs = [] - last_partition_keys = None - dfs = [] - - for i, piece in enumerate(pieces): - (path, row_group, partition_keys) = piece - row_group = None if row_group == [None] else row_group - - # File-size check to help "protect" users from change - # to up-stream `split_row_groups` default. We only - # check the file size if this partition corresponds - # to a full file, and `check_file_size` is defined - if check_file_size and len(pieces) == 1 and row_group is None: - file_size = fs.size(path) - if file_size > check_file_size: - warnings.warn( - f"A large parquet file ({file_size}B) is being " - f"used to create a DataFrame partition in " - f"read_parquet. This may cause out of memory " - f"exceptions in operations downstream. See the " - f"notes on split_row_groups in the read_parquet " - f"documentation. Setting split_row_groups " - f"explicitly will silence this warning." - ) - - if i > 0 and partition_keys != last_partition_keys: - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - filters=filters, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - dataset_kwargs=dataset_kwargs, - **read_kwargs, - ) - ) - paths = [] - rgs = [] - last_partition_keys = None - paths.append(path) - rgs.append( - [row_group] - if not isinstance(row_group, list) - and row_group is not None - else row_group - ) - last_partition_keys = partition_keys - - dfs.append( - cls._read_paths( - paths, - fs, - columns=read_columns, - row_groups=rgs if rgs else None, - filters=filters, - partitions=partitions, - partitioning=partitioning, - partition_keys=last_partition_keys, - dataset_kwargs=dataset_kwargs, - **read_kwargs, - ) - ) - df = cudf.concat(dfs) if len(dfs) > 1 else dfs[0] - - # Re-set "object" dtypes align with pa schema - set_object_dtypes_from_pa_schema(df, schema) - - if index and (index[0] in df.columns): - df = df.set_index(index[0]) - elif index is False and df.index.names != [None]: - # If index=False, we shouldn't have a named index - df.reset_index(inplace=True) - - except MemoryError as err: - raise MemoryError( - "Parquet data was larger than the available GPU memory!\n\n" - "See the notes on split_row_groups in the read_parquet " - "documentation.\n\n" - "Original Error: " + str(err) - ) - raise err - - return df - - @staticmethod - def write_partition( - df, - path, - fs, - filename, - partition_on, - return_metadata, - fmd=None, - compression="snappy", - index_cols=None, - **kwargs, - ): - preserve_index = False - if len(index_cols) and set(index_cols).issubset(set(df.columns)): - df.set_index(index_cols, drop=True, inplace=True) - preserve_index = True - if partition_on: - md = write_to_dataset( - df=df, - root_path=path, - compression=compression, - filename=filename, - partition_cols=partition_on, - fs=fs, - preserve_index=preserve_index, - return_metadata=return_metadata, - statistics=kwargs.get("statistics", "ROWGROUP"), - int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get("row_group_size_bytes", None), - row_group_size_rows=kwargs.get("row_group_size_rows", None), - max_page_size_bytes=kwargs.get("max_page_size_bytes", None), - max_page_size_rows=kwargs.get("max_page_size_rows", None), - storage_options=kwargs.get("storage_options", None), - 
) - else: - with fs.open(fs.sep.join([path, filename]), mode="wb") as out_file: - if not isinstance(out_file, IOBase): - out_file = BufferedWriter(out_file) - md = df.to_parquet( - path=out_file, - engine=kwargs.get("engine", "cudf"), - index=kwargs.get("index", None), - partition_cols=kwargs.get("partition_cols", None), - partition_file_name=kwargs.get( - "partition_file_name", None - ), - partition_offsets=kwargs.get("partition_offsets", None), - statistics=kwargs.get("statistics", "ROWGROUP"), - int96_timestamps=kwargs.get("int96_timestamps", False), - row_group_size_bytes=kwargs.get( - "row_group_size_bytes", None - ), - row_group_size_rows=kwargs.get( - "row_group_size_rows", None - ), - storage_options=kwargs.get("storage_options", None), - metadata_file_path=filename if return_metadata else None, - ) - # Return the schema needed to write the metadata - if return_metadata: - return [{"meta": md}] - else: - return [] + dataset_info["base_meta"] = meta_cudf + self.operands[type(self)._parameters.index("_dataset_info_cache")] = ( + dataset_info + ) + return dataset_info @staticmethod - def write_metadata(parts, fmd, fs, path, append=False, **kwargs): - if parts: - # Aggregate metadata and write to _metadata file - metadata_path = fs.sep.join([path, "_metadata"]) - _meta = [] - if append and fmd is not None: - # Convert to bytes: - if isinstance(fmd, pq.FileMetaData): - with BytesIO() as myio: - fmd.write_metadata_file(myio) - myio.seek(0) - fmd = np.frombuffer(myio.read(), dtype="uint8") - _meta = [fmd] - _meta.extend([parts[i][0]["meta"] for i in range(len(parts))]) - _meta = ( - cudf.io.merge_parquet_filemetadata(_meta) - if len(_meta) > 1 - else _meta[0] - ) - with fs.open(metadata_path, "wb") as fil: - fil.write(memoryview(_meta)) - - @classmethod - def collect_file_metadata(cls, path, fs, file_path): - with fs.open(path, "rb") as f: - meta = pq.ParquetFile(f).metadata - if file_path: - meta.set_file_path(file_path) - with BytesIO() as myio: - meta.write_metadata_file(myio) - myio.seek(0) - meta = np.frombuffer(myio.read(), dtype="uint8") - return meta + def _table_to_pandas(table, index_name): + df = cudf.DataFrame.from_arrow(table) + if index_name is not None: + df = df.set_index(index_name) + return df - @classmethod - def aggregate_metadata(cls, meta_list, fs, out_path): - meta = ( - cudf.io.merge_parquet_filemetadata(meta_list) - if len(meta_list) > 1 - else meta_list[0] + def _filtered_task(self, index: int): + columns = self.columns.copy() + index_name = self.index.name + if self.index is not None: + index_name = self.index.name + schema = self._dataset_info["schema"].remove_metadata() + if index_name: + if columns is None: + columns = list(schema.names) + columns.append(index_name) + return ( + self._table_to_pandas, + ( + self._fragment_to_table, + FragmentWrapper(self.fragments[index], filesystem=self.fs), + self.filters, + columns, + schema, + ), + index_name, ) - if out_path: - metadata_path = fs.sep.join([out_path, "_metadata"]) - with fs.open(metadata_path, "wb") as fil: - fil.write(memoryview(meta)) - return None - else: - return meta - - -def set_object_dtypes_from_pa_schema(df, schema): - # Simple utility to modify cudf DataFrame - # "object" dtypes to agree with a specific - # pyarrow schema. - if schema: - for col_name, col in df._data.items(): - if col_name is None: - # Pyarrow cannot handle `None` as a field name. 
- # However, this should be a simple range index that - # we can ignore anyway - continue - typ = cudf_dtype_from_pa_type(schema.field(col_name).type) - if ( - col_name in schema.names - and not isinstance(typ, (cudf.ListDtype, cudf.StructDtype)) - and isinstance(col, cudf.core.column.StringColumn) - ): - df._data[col_name] = col.astype(typ) - - -def read_parquet(path, columns=None, **kwargs): - """ - Read parquet files into a :class:`.DataFrame`. - - Calls :func:`dask.dataframe.read_parquet` with ``engine=CudfEngine`` - to coordinate the execution of :func:`cudf.read_parquet`, and to - ultimately create a :class:`.DataFrame` collection. - - See the :func:`dask.dataframe.read_parquet` documentation for - all available options. - - Examples - -------- - >>> from dask_cudf import read_parquet - >>> df = read_parquet("/path/to/dataset/") # doctest: +SKIP - - When dealing with one or more large parquet files having an - in-memory footprint >15% device memory, the ``split_row_groups`` - argument should be used to map Parquet **row-groups** to DataFrame - partitions (instead of **files** to partitions). For example, the - following code will map each row-group to a distinct partition: - - >>> df = read_parquet(..., split_row_groups=True) # doctest: +SKIP - - To map **multiple** row-groups to each partition, an integer can be - passed to ``split_row_groups`` to specify the **maximum** number of - row-groups allowed in each output partition: - - >>> df = read_parquet(..., split_row_groups=10) # doctest: +SKIP - - See Also - -------- - cudf.read_parquet - dask.dataframe.read_parquet - """ - if isinstance(columns, str): - columns = [columns] - - # Set "check_file_size" option to determine whether we - # should check the parquet-file size. This check is meant - # to "protect" users from `split_row_groups` default changes - check_file_size = kwargs.pop("check_file_size", 500_000_000) - if ( - check_file_size - and ("split_row_groups" not in kwargs) - and ("chunksize" not in kwargs) - ): - # User is not specifying `split_row_groups` or `chunksize`, - # so we should warn them if/when a file is ~>0.5GB on disk. 
- # They can set `split_row_groups` explicitly to silence/skip - # this check - if "read" not in kwargs: - kwargs["read"] = {} - kwargs["read"]["check_file_size"] = check_file_size - - return dd.read_parquet(path, columns=columns, engine=CudfEngine, **kwargs) - - -to_parquet = partial(dd.to_parquet, engine=CudfEngine) -if create_metadata_file_dd is None: - create_metadata_file = create_metadata_file_dd -else: - create_metadata_file = partial(create_metadata_file_dd, engine=CudfEngine) + def _tune_up(self, parent): + if self._fusion_compression_factor >= 1: + return + if isinstance(parent, CudfFusedParquetIO): + return + return parent.substitute(self, CudfFusedParquetIO(self)) + + +read_parquet = _deprecated_api( + "dask_cudf.io.parquet.read_parquet", + new_api="dask_cudf.read_parquet", +) +to_parquet = _deprecated_api( + "dask_cudf.io.parquet.to_parquet", + new_api="dask_cudf._legacy.io.parquet.to_parquet", + rec="Please use the DataFrame.to_parquet method instead.", +) +create_metadata_file = _deprecated_api( + "dask_cudf.io.parquet.create_metadata_file", + new_api="dask_cudf._legacy.io.parquet.create_metadata_file", + rec="Please raise an issue if this feature is needed.", +) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_csv.py b/python/dask_cudf/dask_cudf/io/tests/test_csv.py index a35a9f1be48..a0acb86f5a9 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_csv.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_csv.py @@ -264,3 +264,18 @@ def test_read_csv_nrows_error(csv_end_bad_lines): dask_cudf.read_csv( csv_end_bad_lines, nrows=2, blocksize="100 MiB" ).compute() + + +def test_deprecated_api_paths(tmp_path): + csv_path = str(tmp_path / "data-*.csv") + df = dask_cudf.DataFrame.from_dict({"a": range(100)}, npartitions=1) + df.to_csv(csv_path, index=False) + + # Encourage top-level read_csv import only + with pytest.warns(match="dask_cudf.io.read_csv is now deprecated"): + df2 = dask_cudf.io.read_csv(csv_path) + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns(match="dask_cudf.io.csv.read_csv is now deprecated"): + df2 = dask_cudf.io.csv.read_csv(csv_path) + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_json.py b/python/dask_cudf/dask_cudf/io/tests/test_json.py index abafbffd197..f5509cf91c3 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_json.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_json.py @@ -126,3 +126,18 @@ def test_read_json_aggregate_files(tmp_path): assert name in df2.columns assert len(df2[name].compute().unique()) == df1.npartitions dd.assert_eq(df1, df2.drop(columns=[name]), check_index=False) + + +def test_deprecated_api_paths(tmp_path): + path = str(tmp_path / "data-*.json") + df = dd.from_dict({"a": range(100)}, npartitions=1) + df.to_json(path) + + # Encourage top-level read_json import only + with pytest.warns(match="dask_cudf.io.read_json is now deprecated"): + df2 = dask_cudf.io.read_json(path) + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns(match="dask_cudf.io.json.read_json is now deprecated"): + df2 = dask_cudf.io.json.read_json(path) + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_orc.py b/python/dask_cudf/dask_cudf/io/tests/test_orc.py index 457e5546bd9..b6064d851ca 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_orc.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_orc.py @@ -145,3 +145,21 @@ def test_to_orc(tmpdir, dtypes, compression, compute): # the cudf dataframes (df and df_read) 
dd.assert_eq(df, ddf_read) dd.assert_eq(df_read, ddf_read) + + +def test_deprecated_api_paths(tmpdir): + df = dask_cudf.DataFrame.from_dict({"a": range(100)}, npartitions=1) + path = tmpdir.join("test.orc") + # Top-level to_orc function is deprecated + with pytest.warns(match="dask_cudf.to_orc is now deprecated"): + dask_cudf.to_orc(df, path, write_index=False) + + # Encourage top-level read_orc import only + paths = glob.glob(str(path) + "/*.orc") + with pytest.warns(match="dask_cudf.io.read_orc is now deprecated"): + df2 = dask_cudf.io.read_orc(paths) + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns(match="dask_cudf.io.orc.read_orc is now deprecated"): + df2 = dask_cudf.io.orc.read_orc(paths) + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py index a29cf9a342a..522a21e12a5 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_parquet.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_parquet.py @@ -15,6 +15,7 @@ import cudf import dask_cudf +from dask_cudf._legacy.io.parquet import create_metadata_file from dask_cudf.tests.utils import ( require_dask_expr, skip_dask_expr, @@ -24,7 +25,7 @@ # Check if create_metadata_file is supported by # the current dask.dataframe version need_create_meta = pytest.mark.skipif( - dask_cudf.io.parquet.create_metadata_file is None, + create_metadata_file is None, reason="Need create_metadata_file support in dask.dataframe.", ) @@ -425,10 +426,14 @@ def test_create_metadata_file(tmpdir, partition_on): fns = glob.glob(os.path.join(tmpdir, partition_on + "=*/*.parquet")) else: fns = glob.glob(os.path.join(tmpdir, "*.parquet")) - dask_cudf.io.parquet.create_metadata_file( - fns, - split_every=3, # Force tree reduction - ) + + with pytest.warns( + match="dask_cudf.io.parquet.create_metadata_file is now deprecated" + ): + dask_cudf.io.parquet.create_metadata_file( + fns, + split_every=3, # Force tree reduction + ) # Check that we can now read the ddf # with the _metadata file present @@ -472,7 +477,7 @@ def test_create_metadata_file_inconsistent_schema(tmpdir): # Add global metadata file. # Dask-CuDF can do this without requiring schema # consistency. 
- dask_cudf.io.parquet.create_metadata_file([p0, p1]) + create_metadata_file([p0, p1]) # Check that we can still read the ddf # with the _metadata file present @@ -533,9 +538,9 @@ def test_check_file_size(tmpdir): fn = str(tmpdir.join("test.parquet")) cudf.DataFrame({"a": np.arange(1000)}).to_parquet(fn) with pytest.warns(match="large parquet file"): - # Need to use `dask_cudf.io` path + # Need to use `dask_cudf._legacy.io` path # TODO: Remove outdated `check_file_size` functionality - dask_cudf.io.read_parquet(fn, check_file_size=1).compute() + dask_cudf._legacy.io.read_parquet(fn, check_file_size=1).compute() @xfail_dask_expr("HivePartitioning cannot be hashed", lt_version="2024.3.0") @@ -664,3 +669,21 @@ def test_to_parquet_append(tmpdir, write_metadata_file): ) ddf2 = dask_cudf.read_parquet(tmpdir) dd.assert_eq(cudf.concat([df, df]), ddf2) + + +def test_deprecated_api_paths(tmpdir): + df = dask_cudf.DataFrame.from_dict({"a": range(100)}, npartitions=1) + # io.to_parquet function is deprecated + with pytest.warns(match="dask_cudf.io.to_parquet is now deprecated"): + dask_cudf.io.to_parquet(df, tmpdir) + + # Encourage top-level read_parquet import only + with pytest.warns(match="dask_cudf.io.read_parquet is now deprecated"): + df2 = dask_cudf.io.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns( + match="dask_cudf.io.parquet.read_parquet is now deprecated" + ): + df2 = dask_cudf.io.parquet.read_parquet(tmpdir) + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_text.py b/python/dask_cudf/dask_cudf/io/tests/test_text.py index 8912b7d5da6..e35b6411a9d 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_text.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_text.py @@ -34,3 +34,15 @@ def test_read_text_byte_range(offset, size): text_file, chunksize=None, delimiter=".", byte_range=(offset, size) ) dd.assert_eq(df1, df2, check_index=False) + + +def test_deprecated_api_paths(): + # Encourage top-level read_text import only + df = cudf.read_text(text_file, delimiter=".") + with pytest.warns(match="dask_cudf.io.read_text is now deprecated"): + df2 = dask_cudf.io.read_text(text_file, delimiter=".") + dd.assert_eq(df, df2, check_divisions=False) + + with pytest.warns(match="dask_cudf.io.text.read_text is now deprecated"): + df2 = dask_cudf.io.text.read_text(text_file, delimiter=".") + dd.assert_eq(df, df2, check_divisions=False) diff --git a/python/dask_cudf/dask_cudf/io/text.py b/python/dask_cudf/dask_cudf/io/text.py index 9cdb7c5220b..1caf4e81d8e 100644 --- a/python/dask_cudf/dask_cudf/io/text.py +++ b/python/dask_cudf/dask_cudf/io/text.py @@ -1,54 +1,8 @@ -# Copyright (c) 2022-2024, NVIDIA CORPORATION. +# Copyright (c) 2024, NVIDIA CORPORATION. -import os -from glob import glob +from dask_cudf import _deprecated_api -import dask.dataframe as dd -from dask.base import tokenize -from dask.utils import apply, parse_bytes - -import cudf - - -def read_text(path, chunksize="256 MiB", **kwargs): - if isinstance(chunksize, str): - chunksize = parse_bytes(chunksize) - - if isinstance(path, list): - filenames = path - elif isinstance(path, str): - filenames = sorted(glob(path)) - elif hasattr(path, "__fspath__"): - filenames = sorted(glob(path.__fspath__())) - else: - raise TypeError(f"Path type not understood:{type(path)}") - - if not filenames: - msg = f"A file in: {filenames} does not exist." 
- raise FileNotFoundError(msg) - - name = "read-text-" + tokenize(path, tokenize, **kwargs) - - if chunksize: - dsk = {} - i = 0 - for fn in filenames: - size = os.path.getsize(fn) - for start in range(0, size, chunksize): - kwargs1 = kwargs.copy() - kwargs1["byte_range"] = ( - start, - chunksize, - ) # specify which chunk of the file we care about - - dsk[(name, i)] = (apply, cudf.read_text, [fn], kwargs1) - i += 1 - else: - dsk = { - (name, i): (apply, cudf.read_text, [fn], kwargs) - for i, fn in enumerate(filenames) - } - - meta = cudf.Series([], dtype="O") - divisions = [None] * (len(dsk) + 1) - return dd.core.new_dd_object(dsk, name, meta, divisions) +read_text = _deprecated_api( + "dask_cudf.io.text.read_text", + new_api="dask_cudf.read_text", +) diff --git a/python/dask_cudf/dask_cudf/tests/test_core.py b/python/dask_cudf/dask_cudf/tests/test_core.py index 8e42c847ddf..5130b804179 100644 --- a/python/dask_cudf/dask_cudf/tests/test_core.py +++ b/python/dask_cudf/dask_cudf/tests/test_core.py @@ -39,30 +39,6 @@ def test_from_dict_backend_dispatch(): dd.assert_eq(expect, ddf) -def test_to_dask_dataframe_deprecated(): - gdf = cudf.DataFrame({"a": range(100)}) - ddf = dd.from_pandas(gdf, npartitions=2) - assert isinstance(ddf._meta, cudf.DataFrame) - - with pytest.warns(FutureWarning, match="API is now deprecated"): - assert isinstance( - ddf.to_dask_dataframe()._meta, - pd.DataFrame, - ) - - -def test_from_dask_dataframe_deprecated(): - gdf = pd.DataFrame({"a": range(100)}) - ddf = dd.from_pandas(gdf, npartitions=2) - assert isinstance(ddf._meta, pd.DataFrame) - - with pytest.warns(FutureWarning, match="API is now deprecated"): - assert isinstance( - dask_cudf.from_dask_dataframe(ddf)._meta, - cudf.DataFrame, - ) - - def test_to_backend(): rng = np.random.default_rng(seed=0) data = { diff --git a/python/dask_cudf/dask_cudf/tests/test_groupby.py b/python/dask_cudf/dask_cudf/tests/test_groupby.py index 042e69d86f4..918290aa6fa 100644 --- a/python/dask_cudf/dask_cudf/tests/test_groupby.py +++ b/python/dask_cudf/dask_cudf/tests/test_groupby.py @@ -13,7 +13,7 @@ from cudf.testing._utils import expect_warning_if import dask_cudf -from dask_cudf.groupby import OPTIMIZED_AGGS, _aggs_optimized +from dask_cudf._legacy.groupby import OPTIMIZED_AGGS, _aggs_optimized from dask_cudf.tests.utils import ( QUERY_PLANNING_ON, require_dask_expr, diff --git a/python/dask_cudf/dask_cudf/tests/utils.py b/python/dask_cudf/dask_cudf/tests/utils.py index 9aaf6dc8420..a9f61f75762 100644 --- a/python/dask_cudf/dask_cudf/tests/utils.py +++ b/python/dask_cudf/dask_cudf/tests/utils.py @@ -10,7 +10,7 @@ import cudf -from dask_cudf.expr import QUERY_PLANNING_ON +from dask_cudf import QUERY_PLANNING_ON if QUERY_PLANNING_ON: DASK_VERSION = Version(dask.__version__) From 9d5041c5419cd17c880961559e3a1457cdae9fcc Mon Sep 17 00:00:00 2001 From: "Richard (Rick) Zamora" Date: Tue, 5 Nov 2024 09:56:35 -0600 Subject: [PATCH 05/12] Separate evaluation logic from `IR` objects in cudf-polars (#17175) Closes https://github.com/rapidsai/cudf/issues/17127 - This PR implements the proposal in #17127 - This change technically "breaks" with the existing `IR.evaluate` convention. 
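
For illustration, a minimal sketch of the new convention (toy classes with
illustrative names, not the real cudf-polars API): each node records its
non-child arguments at construction, `do_evaluate` consumes those followed by
the already-evaluated child frames, and the base-class `evaluate` is the only
place that encodes the recursion.

    # Toy sketch only: "Frame", "Literal" and "Filter" are hypothetical
    # stand-ins, not cudf-polars classes.
    class Frame:
        def __init__(self, rows):
            self.rows = rows

    class Node:
        children: tuple = ()
        _non_child_args: tuple = ()

        def evaluate(self, *, cache):
            # Recursion lives only here; subclasses implement do_evaluate
            # against inputs that have already been evaluated.
            return self.do_evaluate(
                *self._non_child_args,
                *(child.evaluate(cache=cache) for child in self.children),
            )

    class Literal(Node):
        def __init__(self, rows):
            self._non_child_args = (rows,)

        @classmethod
        def do_evaluate(cls, rows):
            return Frame(rows)

    class Filter(Node):
        def __init__(self, predicate, child):
            self._non_child_args = (predicate,)
            self.children = (child,)

        @classmethod
        def do_evaluate(cls, predicate, df):
            return Frame([r for r in df.rows if predicate(r)])

    assert Filter(lambda r: r > 1, Literal([1, 2, 3])).evaluate(cache={}).rows == [2, 3]

The intent, as reflected in the new `IR.evaluate` below, is that `do_evaluate`
never recurses and can therefore be invoked directly on already-evaluated
inputs.
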
Authors: - Richard (Rick) Zamora (https://github.com/rjzamora) - Lawrence Mitchell (https://github.com/wence-) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17175 --- python/cudf_polars/cudf_polars/dsl/ir.py | 450 +++++++++++++++-------- python/cudf_polars/docs/overview.md | 6 +- 2 files changed, 298 insertions(+), 158 deletions(-) diff --git a/python/cudf_polars/cudf_polars/dsl/ir.py b/python/cudf_polars/cudf_polars/dsl/ir.py index 04aa74024cd..a242ff9300f 100644 --- a/python/cudf_polars/cudf_polars/dsl/ir.py +++ b/python/cudf_polars/cudf_polars/dsl/ir.py @@ -127,9 +127,12 @@ def broadcast(*columns: Column, target_length: int | None = None) -> list[Column class IR(Node["IR"]): """Abstract plan node, representing an unevaluated dataframe.""" - __slots__ = ("schema",) + __slots__ = ("schema", "_non_child_args") # This annotation is needed because of https://github.com/python/mypy/issues/17981 _non_child: ClassVar[tuple[str, ...]] = ("schema",) + # Concrete classes should set this up with the arguments that will + # be passed to do_evaluate. + _non_child_args: tuple[Any, ...] schema: Schema """Mapping from column names to their data types.""" @@ -146,9 +149,37 @@ def get_hashable(self) -> Hashable: schema_hash = tuple(self.schema.items()) return (type(self), schema_hash, args) + # Hacky to avoid type-checking issues, just advertise the + # signature. Both mypy and pyright complain if we have an abstract + # method that takes arbitrary *args, but the subclasses have + # tighter signatures. This complaint is correct because the + # subclass is not Liskov-substitutable for the superclass. + # However, we know do_evaluate will only be called with the + # correct arguments by "construction". + do_evaluate: Callable[..., DataFrame] + """ + Evaluate the node (given its evaluated children), and return a dataframe. + + Parameters + ---------- + args + Non child arguments followed by any evaluated dataframe inputs. + + Returns + ------- + DataFrame (on device) representing the evaluation of this plan + node. + + Raises + ------ + NotImplementedError + If evaluation fails. Ideally this should not occur, since the + translation phase should fail earlier. + """ + def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """ - Evaluate the node and return a dataframe. + Evaluate the node (recursively) and return a dataframe. Parameters ---------- @@ -156,21 +187,27 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: Mapping from cached node ids to constructed DataFrames. Used to implement evaluation of the `Cache` node. + Notes + ----- + Prefer not to override this method. Instead implement + :meth:`do_evaluate` which doesn't encode a recursion scheme + and just assumes already evaluated inputs. + Returns ------- DataFrame (on device) representing the evaluation of this plan - node. + node (and its children). Raises ------ NotImplementedError - If we couldn't evaluate things. Ideally this should not occur, - since the translation phase should pick up things that we - cannot handle. + If evaluation fails. Ideally this should not occur, since the + translation phase should fail earlier. 
""" - raise NotImplementedError( - f"Evaluation of plan {type(self).__name__}" - ) # pragma: no cover + return self.do_evaluate( + *self._non_child_args, + *(child.evaluate(cache=cache) for child in self.children), + ) class PythonScan(IR): @@ -187,6 +224,7 @@ def __init__(self, schema: Schema, options: Any, predicate: expr.NamedExpr | Non self.schema = schema self.options = options self.predicate = predicate + self._non_child_args = (schema, options, predicate) self.children = () raise NotImplementedError("PythonScan not implemented") @@ -259,6 +297,17 @@ def __init__( self.n_rows = n_rows self.row_index = row_index self.predicate = predicate + self._non_child_args = ( + schema, + typ, + reader_options, + paths, + with_columns, + skip_rows, + n_rows, + row_index, + predicate, + ) self.children = () if self.typ not in ("csv", "parquet", "ndjson"): # pragma: no cover # This line is unhittable ATM since IPC/Anonymous scan raise @@ -341,19 +390,28 @@ def get_hashable(self) -> Hashable: self.predicate, ) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + schema: Schema, + typ: str, + reader_options: dict[str, Any], + paths: list[str], + with_columns: list[str] | None, + skip_rows: int, + n_rows: int, + row_index: tuple[str, int] | None, + predicate: expr.NamedExpr | None, + ): """Evaluate and return a dataframe.""" - with_columns = self.with_columns - row_index = self.row_index - n_rows = self.n_rows - if self.typ == "csv": - parse_options = self.reader_options["parse_options"] + if typ == "csv": + parse_options = reader_options["parse_options"] sep = chr(parse_options["separator"]) quote = chr(parse_options["quote_char"]) eol = chr(parse_options["eol_char"]) - if self.reader_options["schema"] is not None: + if reader_options["schema"] is not None: # Reader schema provides names - column_names = list(self.reader_options["schema"]["fields"].keys()) + column_names = list(reader_options["schema"]["fields"].keys()) else: # file provides column names column_names = None @@ -380,8 +438,8 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # polars skips blank lines at the beginning of the file pieces = [] read_partial = n_rows != -1 - for p in self.paths: - skiprows = self.reader_options["skip_rows"] + for p in paths: + skiprows = reader_options["skip_rows"] path = Path(p) with path.open() as f: while f.readline() == "\n": @@ -400,7 +458,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: skiprows=skiprows, comment=comment, decimal=decimal, - dtypes=self.schema, + dtypes=schema, nrows=n_rows, ) pieces.append(tbl_w_meta) @@ -419,17 +477,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.concatenate.concatenate(list(tables)), colnames[0], ) - elif self.typ == "parquet": + elif typ == "parquet": filters = None - if self.predicate is not None and self.row_index is None: + if predicate is not None and row_index is None: # Can't apply filters during read if we have a row index. 
- filters = to_parquet_filter(self.predicate.value) + filters = to_parquet_filter(predicate.value) tbl_w_meta = plc.io.parquet.read_parquet( - plc.io.SourceInfo(self.paths), + plc.io.SourceInfo(paths), columns=with_columns, filters=filters, nrows=n_rows, - skip_rows=self.skip_rows, + skip_rows=skip_rows, ) df = DataFrame.from_table( tbl_w_meta.tbl, @@ -439,12 +497,12 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: if filters is not None: # Mask must have been applied. return df - elif self.typ == "ndjson": + elif typ == "ndjson": json_schema: list[tuple[str, str, list]] = [ - (name, typ, []) for name, typ in self.schema.items() + (name, typ, []) for name, typ in schema.items() ] plc_tbl_w_meta = plc.io.json.read_json( - plc.io.SourceInfo(self.paths), + plc.io.SourceInfo(paths), lines=True, dtypes=json_schema, prune_columns=True, @@ -454,20 +512,17 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df = DataFrame.from_table( plc_tbl_w_meta.tbl, plc_tbl_w_meta.column_names(include_children=False) ) - col_order = list(self.schema.keys()) - # TODO: remove condition when dropping support for polars 1.0 - # https://github.com/pola-rs/polars/pull/17363 - if row_index is not None and row_index[0] in self.schema: + col_order = list(schema.keys()) + if row_index is not None: col_order.remove(row_index[0]) - if col_order is not None: - df = df.select(col_order) + df = df.select(col_order) else: raise NotImplementedError( - f"Unhandled scan type: {self.typ}" + f"Unhandled scan type: {typ}" ) # pragma: no cover; post init trips first if row_index is not None: name, offset = row_index - dtype = self.schema[name] + dtype = schema[name] step = plc.interop.from_arrow( pa.scalar(1, type=plc.interop.to_arrow(dtype)) ) @@ -482,13 +537,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: name=name, ) df = DataFrame([index, *df.columns]) - assert all( - c.obj.type() == self.schema[name] for name, c in df.column_map.items() - ) - if self.predicate is None: + assert all(c.obj.type() == schema[name] for name, c in df.column_map.items()) + if predicate is None: return df else: - (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows) + (mask,) = broadcast(predicate.evaluate(df), target_length=df.num_rows) return df.filter(mask) @@ -508,9 +561,21 @@ def __init__(self, schema: Schema, key: int, value: IR): self.schema = schema self.key = key self.children = (value,) + self._non_child_args = (key,) + + @classmethod + def do_evaluate( + cls, key: int, df: DataFrame + ) -> DataFrame: # pragma: no cover; basic evaluation never calls this + """Evaluate and return a dataframe.""" + # Our value has already been computed for us, so let's just + # return it. + return df def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: """Evaluate and return a dataframe.""" + # We must override the recursion scheme because we don't want + # to recurse if we're in the cache. 
try: return cache[self.key] except KeyError: @@ -545,6 +610,7 @@ def __init__( self.df = df self.projection = tuple(projection) if projection is not None else None self.predicate = predicate + self._non_child_args = (schema, df, self.projection, predicate) self.children = () def get_hashable(self) -> Hashable: @@ -557,18 +623,25 @@ def get_hashable(self) -> Hashable: schema_hash = tuple(self.schema.items()) return (type(self), schema_hash, id(self.df), self.projection, self.predicate) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + schema: Schema, + df: Any, + projection: tuple[str, ...] | None, + predicate: expr.NamedExpr | None, + ) -> DataFrame: """Evaluate and return a dataframe.""" - pdf = pl.DataFrame._from_pydf(self.df) - if self.projection is not None: - pdf = pdf.select(self.projection) + pdf = pl.DataFrame._from_pydf(df) + if projection is not None: + pdf = pdf.select(projection) df = DataFrame.from_polars(pdf) assert all( c.obj.type() == dtype - for c, dtype in zip(df.columns, self.schema.values(), strict=True) + for c, dtype in zip(df.columns, schema.values(), strict=True) ) - if self.predicate is not None: - (mask,) = broadcast(self.predicate.evaluate(df), target_length=df.num_rows) + if predicate is not None: + (mask,) = broadcast(predicate.evaluate(df), target_length=df.num_rows) return df.filter(mask) else: return df @@ -595,14 +668,19 @@ def __init__( self.exprs = tuple(exprs) self.should_broadcast = should_broadcast self.children = (df,) + self._non_child_args = (self.exprs, should_broadcast) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + exprs: tuple[expr.NamedExpr, ...], + should_broadcast: bool, # noqa: FBT001 + df: DataFrame, + ) -> DataFrame: """Evaluate and return a dataframe.""" - (child,) = self.children - df = child.evaluate(cache=cache) # Handle any broadcasting - columns = [e.evaluate(df) for e in self.exprs] - if self.should_broadcast: + columns = [e.evaluate(df) for e in exprs] + if should_broadcast: columns = broadcast(*columns) return DataFrame(columns) @@ -625,14 +703,14 @@ def __init__( self.schema = schema self.exprs = tuple(exprs) self.children = (df,) + self._non_child_args = (self.exprs,) - def evaluate( - self, *, cache: MutableMapping[int, DataFrame] - ) -> DataFrame: # pragma: no cover; polars doesn't emit this node yet + @classmethod + def do_evaluate( + cls, exprs: tuple[expr.NamedExpr, ...], df: DataFrame + ) -> DataFrame: # pragma: no cover; not exposed by polars yet """Evaluate and return a dataframe.""" - (child,) = self.children - df = child.evaluate(cache=cache) - columns = broadcast(*(e.evaluate(df) for e in self.exprs)) + columns = broadcast(*(e.evaluate(df) for e in exprs)) assert all(column.obj.size() == 1 for column in columns) return DataFrame(columns) @@ -681,6 +759,13 @@ def __init__( if any(GroupBy.check_agg(a.value) > 1 for a in self.agg_requests): raise NotImplementedError("Nested aggregations in groupby") self.agg_infos = [req.collect_agg(depth=0) for req in self.agg_requests] + self._non_child_args = ( + self.keys, + self.agg_requests, + maintain_order, + options, + self.agg_infos, + ) @staticmethod def check_agg(agg: expr.Expr) -> int: @@ -710,13 +795,18 @@ def check_agg(agg: expr.Expr) -> int: else: raise NotImplementedError(f"No handler for {agg=}") - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + keys_in: 
Sequence[expr.NamedExpr], + agg_requests: Sequence[expr.NamedExpr], + maintain_order: bool, # noqa: FBT001 + options: Any, + agg_infos: Sequence[expr.AggInfo], + df: DataFrame, + ): """Evaluate and return a dataframe.""" - (child,) = self.children - df = child.evaluate(cache=cache) - keys = broadcast( - *(k.evaluate(df) for k in self.keys), target_length=df.num_rows - ) + keys = broadcast(*(k.evaluate(df) for k in keys_in), target_length=df.num_rows) sorted = ( plc.types.Sorted.YES if all(k.is_sorted for k in keys) @@ -732,7 +822,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # TODO: uniquify requests = [] replacements: list[expr.Expr] = [] - for info in self.agg_infos: + for info in agg_infos: for pre_eval, req, rep in info.requests: if pre_eval is None: # A count aggregation, doesn't touch the column, @@ -754,12 +844,10 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for key, grouped_key in zip(keys, group_keys.columns(), strict=True) ] result_subs = DataFrame(raw_columns) - results = [ - req.evaluate(result_subs, mapping=mapping) for req in self.agg_requests - ] + results = [req.evaluate(result_subs, mapping=mapping) for req in agg_requests] broadcasted = broadcast(*result_keys, *results) # Handle order preservation of groups - if self.maintain_order and not sorted: + if maintain_order and not sorted: # The order we want want = plc.stream_compaction.stable_distinct( plc.Table([k.obj for k in keys]), @@ -799,7 +887,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ordered_table.columns(), broadcasted, strict=True ) ] - return DataFrame(broadcasted).slice(self.options.slice) + return DataFrame(broadcasted).slice(options.slice) class Join(IR): @@ -841,6 +929,7 @@ def __init__( self.right_on = tuple(right_on) self.options = options self.children = (left, right) + self._non_child_args = (self.left_on, self.right_on, self.options) if any( isinstance(e.value, expr.Literal) for e in itertools.chain(self.left_on, self.right_on) @@ -886,8 +975,8 @@ def _joiners( ) assert_never(how) + @staticmethod def _reorder_maps( - self, left_rows: int, lg: plc.Column, left_policy: plc.copying.OutOfBoundsPolicy, @@ -939,10 +1028,23 @@ def _reorder_maps( [plc.types.NullOrder.AFTER, plc.types.NullOrder.AFTER], ).columns() - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + left_on_exprs: Sequence[expr.NamedExpr], + right_on_exprs: Sequence[expr.NamedExpr], + options: tuple[ + Literal["inner", "left", "right", "full", "semi", "anti", "cross"], + bool, + tuple[int, int] | None, + str, + bool, + ], + left: DataFrame, + right: DataFrame, + ) -> DataFrame: """Evaluate and return a dataframe.""" - left, right = (c.evaluate(cache=cache) for c in self.children) - how, join_nulls, zlice, suffix, coalesce = self.options + how, join_nulls, zlice, suffix, coalesce = options if how == "cross": # Separate implementation, since cross_join returns the # result, not the gather maps @@ -966,14 +1068,14 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: ] return DataFrame([*left_cols, *right_cols]).slice(zlice) # TODO: Waiting on clarity based on https://github.com/pola-rs/polars/issues/17184 - left_on = DataFrame(broadcast(*(e.evaluate(left) for e in self.left_on))) - right_on = DataFrame(broadcast(*(e.evaluate(right) for e in self.right_on))) + left_on = DataFrame(broadcast(*(e.evaluate(left) for e in left_on_exprs))) + right_on = 
DataFrame(broadcast(*(e.evaluate(right) for e in right_on_exprs))) null_equality = ( plc.types.NullEquality.EQUAL if join_nulls else plc.types.NullEquality.UNEQUAL ) - join_fn, left_policy, right_policy = Join._joiners(how) + join_fn, left_policy, right_policy = cls._joiners(how) if right_policy is None: # Semi join lg = join_fn(left_on.table, right_on.table, null_equality) @@ -987,7 +1089,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: lg, rg = join_fn(left_on.table, right_on.table, null_equality) if how == "left" or how == "right": # Order of left table is preserved - lg, rg = self._reorder_maps( + lg, rg = cls._reorder_maps( left.num_rows, lg, left_policy, right.num_rows, rg, right_policy ) if coalesce and how == "inner": @@ -1046,14 +1148,19 @@ def __init__( self.schema = schema self.columns = tuple(columns) self.should_broadcast = should_broadcast + self._non_child_args = (self.columns, self.should_broadcast) self.children = (df,) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + exprs: Sequence[expr.NamedExpr], + should_broadcast: bool, # noqa: FBT001 + df: DataFrame, + ) -> DataFrame: """Evaluate and return a dataframe.""" - (child,) = self.children - df = child.evaluate(cache=cache) - columns = [c.evaluate(df) for c in self.columns] - if self.should_broadcast: + columns = [c.evaluate(df) for c in exprs] + if should_broadcast: columns = broadcast(*columns, target_length=df.num_rows) else: # Polars ensures this is true, but let's make sure nothing @@ -1063,7 +1170,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: # table that might have mismatching column lengths will # never be turned into a pylibcudf Table with all columns # by the Select, which is why this is safe. 
- assert all(e.name.startswith("__POLARS_CSER_0x") for e in self.columns) + assert all(e.name.startswith("__POLARS_CSER_0x") for e in exprs) return df.with_columns(columns) @@ -1096,6 +1203,7 @@ def __init__( self.subset = subset self.zlice = zlice self.stable = stable + self._non_child_args = (keep, subset, zlice, stable) self.children = (df,) _KEEP_MAP: ClassVar[dict[str, plc.stream_compaction.DuplicateKeepOption]] = { @@ -1105,33 +1213,39 @@ def __init__( "any": plc.stream_compaction.DuplicateKeepOption.KEEP_ANY, } - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + keep: plc.stream_compaction.DuplicateKeepOption, + subset: frozenset[str] | None, + zlice: tuple[int, int] | None, + stable: bool, # noqa: FBT001 + df: DataFrame, + ): """Evaluate and return a dataframe.""" - (child,) = self.children - df = child.evaluate(cache=cache) - if self.subset is None: + if subset is None: indices = list(range(df.num_columns)) keys_sorted = all(c.is_sorted for c in df.column_map.values()) else: - indices = [i for i, k in enumerate(df.column_names) if k in self.subset] - keys_sorted = all(df.column_map[name].is_sorted for name in self.subset) + indices = [i for i, k in enumerate(df.column_names) if k in subset] + keys_sorted = all(df.column_map[name].is_sorted for name in subset) if keys_sorted: table = plc.stream_compaction.unique( df.table, indices, - self.keep, + keep, plc.types.NullEquality.EQUAL, ) else: distinct = ( plc.stream_compaction.stable_distinct - if self.stable + if stable else plc.stream_compaction.distinct ) table = distinct( df.table, indices, - self.keep, + keep, plc.types.NullEquality.EQUAL, plc.types.NanEquality.ALL_EQUAL, ) @@ -1142,9 +1256,9 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: for new, old in zip(table.columns(), df.columns, strict=True) ] ) - if keys_sorted or self.stable: + if keys_sorted or stable: result = result.sorted_like(df) - return result.slice(self.zlice) + return result.slice(zlice) class Sort(IR): @@ -1179,29 +1293,39 @@ def __init__( self.null_order = tuple(null_order) self.stable = stable self.zlice = zlice + self._non_child_args = ( + self.by, + self.order, + self.null_order, + self.stable, + self.zlice, + ) self.children = (df,) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate( + cls, + by: Sequence[expr.NamedExpr], + order: Sequence[plc.types.Order], + null_order: Sequence[plc.types.NullOrder], + stable: bool, # noqa: FBT001 + zlice: tuple[int, int] | None, + df: DataFrame, + ) -> DataFrame: """Evaluate and return a dataframe.""" - (child,) = self.children - df = child.evaluate(cache=cache) - sort_keys = broadcast( - *(k.evaluate(df) for k in self.by), target_length=df.num_rows - ) + sort_keys = broadcast(*(k.evaluate(df) for k in by), target_length=df.num_rows) # TODO: More robust identification here. 
keys_in_result = { k.name: i for i, k in enumerate(sort_keys) if k.name in df.column_map and k.obj is df.column_map[k.name].obj } - do_sort = ( - plc.sorting.stable_sort_by_key if self.stable else plc.sorting.sort_by_key - ) + do_sort = plc.sorting.stable_sort_by_key if stable else plc.sorting.sort_by_key table = do_sort( df.table, plc.Table([k.obj for k in sort_keys]), - list(self.order), - list(self.null_order), + list(order), + list(null_order), ) columns: list[Column] = [] for name, c in zip(df.column_map, table.columns(), strict=True): @@ -1211,11 +1335,11 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: i = keys_in_result[name] column = column.set_sorted( is_sorted=plc.types.Sorted.YES, - order=self.order[i], - null_order=self.null_order[i], + order=order[i], + null_order=null_order[i], ) columns.append(column) - return DataFrame(columns).slice(self.zlice) + return DataFrame(columns).slice(zlice) class Slice(IR): @@ -1232,13 +1356,13 @@ def __init__(self, schema: Schema, offset: int, length: int, df: IR): self.schema = schema self.offset = offset self.length = length + self._non_child_args = (offset, length) self.children = (df,) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate(cls, offset: int, length: int, df: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - (child,) = self.children - df = child.evaluate(cache=cache) - return df.slice((self.offset, self.length)) + return df.slice((offset, length)) class Filter(IR): @@ -1252,13 +1376,13 @@ class Filter(IR): def __init__(self, schema: Schema, mask: expr.NamedExpr, df: IR): self.schema = schema self.mask = mask + self._non_child_args = (mask,) self.children = (df,) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate(cls, mask_expr: expr.NamedExpr, df: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - (child,) = self.children - df = child.evaluate(cache=cache) - (mask,) = broadcast(self.mask.evaluate(df), target_length=df.num_rows) + (mask,) = broadcast(mask_expr.evaluate(df), target_length=df.num_rows) return df.filter(mask) @@ -1270,15 +1394,15 @@ class Projection(IR): def __init__(self, schema: Schema, df: IR): self.schema = schema + self._non_child_args = (schema,) self.children = (df,) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate(cls, schema: Schema, df: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - (child,) = self.children - df = child.evaluate(cache=cache) # This can reorder things. 
columns = broadcast( - *(df.column_map[name] for name in self.schema), target_length=df.num_rows + *(df.column_map[name] for name in schema), target_length=df.num_rows ) return DataFrame(columns) @@ -1341,33 +1465,41 @@ def __init__(self, schema: Schema, name: str, options: Any, df: IR): "Unpivot cannot cast all input columns to " f"{self.schema[value_name].id()}" ) - self.options = (tuple(indices), tuple(pivotees), variable_name, value_name) + self.options = ( + tuple(indices), + tuple(pivotees), + (variable_name, schema[variable_name]), + (value_name, schema[value_name]), + ) + self._non_child_args = (name, self.options) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate(cls, name: str, options: Any, df: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - (child,) = self.children - if self.name == "rechunk": + if name == "rechunk": # No-op in our data model # Don't think this appears in a plan tree from python - return child.evaluate(cache=cache) # pragma: no cover - elif self.name == "rename": - df = child.evaluate(cache=cache) + return df # pragma: no cover + elif name == "rename": # final tag is "swapping" which is useful for the # optimiser (it blocks some pushdown operations) - old, new, _ = self.options + old, new, _ = options return df.rename_columns(dict(zip(old, new, strict=True))) - elif self.name == "explode": - df = child.evaluate(cache=cache) - ((to_explode,),) = self.options + elif name == "explode": + ((to_explode,),) = options index = df.column_names.index(to_explode) subset = df.column_names_set - {to_explode} return DataFrame.from_table( plc.lists.explode_outer(df.table, index), df.column_names ).sorted_like(df, subset=subset) - elif self.name == "unpivot": - indices, pivotees, variable_name, value_name = self.options + elif name == "unpivot": + ( + indices, + pivotees, + (variable_name, variable_dtype), + (value_name, value_dtype), + ) = options npiv = len(pivotees) - df = child.evaluate(cache=cache) index_columns = [ Column(col, name=name) for col, name in zip( @@ -1382,7 +1514,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: plc.interop.from_arrow( pa.array( pivotees, - type=plc.interop.to_arrow(self.schema[variable_name]), + type=plc.interop.to_arrow(variable_dtype), ), ) ] @@ -1390,10 +1522,7 @@ def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: df.num_rows, ).columns() value_column = plc.concatenate.concatenate( - [ - df.column_map[pivotee].astype(self.schema[value_name]).obj - for pivotee in pivotees - ] + [df.column_map[pivotee].astype(value_dtype).obj for pivotee in pivotees] ) return DataFrame( [ @@ -1417,18 +1546,20 @@ class Union(IR): def __init__(self, schema: Schema, zlice: tuple[int, int] | None, *children: IR): self.schema = schema self.zlice = zlice + self._non_child_args = (zlice,) self.children = children schema = self.children[0].schema if not all(s.schema == schema for s in self.children[1:]): raise NotImplementedError("Schema mismatch") - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate(cls, zlice: tuple[int, int] | None, *dfs: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - # TODO: only evaluate what we need if we have a slice - dfs = [df.evaluate(cache=cache) for df in self.children] + # TODO: only evaluate what we need if we have a slice? 
return DataFrame.from_table( - plc.concatenate.concatenate([df.table for df in dfs]), dfs[0].column_names - ).slice(self.zlice) + plc.concatenate.concatenate([df.table for df in dfs]), + dfs[0].column_names, + ).slice(zlice) class HConcat(IR): @@ -1439,6 +1570,7 @@ class HConcat(IR): def __init__(self, schema: Schema, *children: IR): self.schema = schema + self._non_child_args = () self.children = children @staticmethod @@ -1469,18 +1601,22 @@ def _extend_with_nulls(table: plc.Table, *, nrows: int) -> plc.Table: ] ) - def evaluate(self, *, cache: MutableMapping[int, DataFrame]) -> DataFrame: + @classmethod + def do_evaluate(cls, *dfs: DataFrame) -> DataFrame: """Evaluate and return a dataframe.""" - dfs = [df.evaluate(cache=cache) for df in self.children] max_rows = max(df.num_rows for df in dfs) # Horizontal concatenation extends shorter tables with nulls - dfs = [ - df - if df.num_rows == max_rows - else DataFrame.from_table( - self._extend_with_nulls(df.table, nrows=max_rows - df.num_rows), - df.column_names, + return DataFrame( + itertools.chain.from_iterable( + df.columns + for df in ( + df + if df.num_rows == max_rows + else DataFrame.from_table( + cls._extend_with_nulls(df.table, nrows=max_rows - df.num_rows), + df.column_names, + ) + for df in dfs + ) ) - for df in dfs - ] - return DataFrame(itertools.chain.from_iterable(df.columns for df in dfs)) + ) diff --git a/python/cudf_polars/docs/overview.md b/python/cudf_polars/docs/overview.md index 74b2cd4e5de..17a94c633f8 100644 --- a/python/cudf_polars/docs/overview.md +++ b/python/cudf_polars/docs/overview.md @@ -212,7 +212,11 @@ methods. Plan node definitions live in `cudf_polars/dsl/ir.py`, these all inherit from the base `IR` node. The evaluation of a plan node is done -by implementing the `evaluate` method. +by implementing the `do_evaluate` method. This method takes in +the non-child arguments specified in `_non_child_args`, followed by +pre-evaluated child nodes (`DataFrame` objects). To perform the +evaluation, one should use the base class (generic) `evaluate` method +which handles the recursive evaluation of child nodes. To translate the plan node, add a case handler in `translate_ir` that lives in `cudf_polars/dsl/translate.py`. From ac5b3ed1fd69abc424255b07bb66cebea5666f08 Mon Sep 17 00:00:00 2001 From: Matthew Murray <41342305+Matt711@users.noreply.github.com> Date: Tue, 5 Nov 2024 18:43:44 -0500 Subject: [PATCH 06/12] Deprecate single component extraction methods in libcudf (#17221) This PR deprecates the single component extraction methods (eg. `cudf::datetime::extract_year`) that are already covered by `cudf::datetime::extract_datetime_component`. xref #17143 Authors: - Matthew Murray (https://github.com/Matt711) Approvers: - David Wendt (https://github.com/davidwendt) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/17221 --- cpp/include/cudf/datetime.hpp | 40 +++-- cpp/tests/datetime/datetime_ops_test.cpp | 209 +++++++++-------------- cpp/tests/streams/datetime_test.cpp | 30 ++-- 3 files changed, 135 insertions(+), 144 deletions(-) diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index 1eaea5b6374..1f6e86d0389 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -58,6 +58,8 @@ enum class datetime_component : uint8_t { * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. 
* + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -65,7 +67,7 @@ enum class datetime_component : uint8_t { * @returns cudf::column of the extracted int16_t years * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_year( +[[deprecated]] std::unique_ptr extract_year( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -74,6 +76,8 @@ std::unique_ptr extract_year( * @brief Extracts month from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -81,7 +85,7 @@ std::unique_ptr extract_year( * @returns cudf::column of the extracted int16_t months * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_month( +[[deprecated]] std::unique_ptr extract_month( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -90,6 +94,8 @@ std::unique_ptr extract_month( * @brief Extracts day from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -97,7 +103,7 @@ std::unique_ptr extract_month( * @returns cudf::column of the extracted int16_t days * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_day( +[[deprecated]] std::unique_ptr extract_day( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -106,6 +112,8 @@ std::unique_ptr extract_day( * @brief Extracts a weekday from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -113,7 +121,7 @@ std::unique_ptr extract_day( * @returns cudf::column of the extracted int16_t days * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_weekday( +[[deprecated]] std::unique_ptr extract_weekday( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -122,6 +130,8 @@ std::unique_ptr extract_weekday( * @brief Extracts hour from any datetime type and returns an int16_t * cudf::column. 
* + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -129,7 +139,7 @@ std::unique_ptr extract_weekday( * @returns cudf::column of the extracted int16_t hours * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_hour( +[[deprecated]] std::unique_ptr extract_hour( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -138,6 +148,8 @@ std::unique_ptr extract_hour( * @brief Extracts minute from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -145,7 +157,7 @@ std::unique_ptr extract_hour( * @returns cudf::column of the extracted int16_t minutes * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_minute( +[[deprecated]] std::unique_ptr extract_minute( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -154,6 +166,8 @@ std::unique_ptr extract_minute( * @brief Extracts second from any datetime type and returns an int16_t * cudf::column. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -161,7 +175,7 @@ std::unique_ptr extract_minute( * @returns cudf::column of the extracted int16_t seconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_second( +[[deprecated]] std::unique_ptr extract_second( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -173,6 +187,8 @@ std::unique_ptr extract_second( * A millisecond fraction is only the 3 digits that make up the millisecond portion of a duration. * For example, the millisecond fraction of 1.234567890 seconds is 234. 
* + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -180,7 +196,7 @@ std::unique_ptr extract_second( * @returns cudf::column of the extracted int16_t milliseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_millisecond_fraction( +[[deprecated]] std::unique_ptr extract_millisecond_fraction( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -192,6 +208,8 @@ std::unique_ptr extract_millisecond_fraction( * A microsecond fraction is only the 3 digits that make up the microsecond portion of a duration. * For example, the microsecond fraction of 1.234567890 seconds is 567. * + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -199,7 +217,7 @@ std::unique_ptr extract_millisecond_fraction( * @returns cudf::column of the extracted int16_t microseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_microsecond_fraction( +[[deprecated]] std::unique_ptr extract_microsecond_fraction( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); @@ -211,6 +229,8 @@ std::unique_ptr extract_microsecond_fraction( * A nanosecond fraction is only the 3 digits that make up the nanosecond portion of a duration. * For example, the nanosecond fraction of 1.234567890 seconds is 890. 
* + * @deprecated Deprecated in 24.12, to be removed in 25.02 + * * @param column cudf::column_view of the input datetime values * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the returned column @@ -218,7 +238,7 @@ std::unique_ptr extract_microsecond_fraction( * @returns cudf::column of the extracted int16_t nanoseconds * @throw cudf::logic_error if input column datatype is not TIMESTAMP */ -std::unique_ptr extract_nanosecond_fraction( +[[deprecated]] std::unique_ptr extract_nanosecond_fraction( cudf::column_view const& column, rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::device_async_resource_ref mr = cudf::get_current_device_resource_ref()); diff --git a/cpp/tests/datetime/datetime_ops_test.cpp b/cpp/tests/datetime/datetime_ops_test.cpp index 44f99adc0e9..1d1deb42a51 100644 --- a/cpp/tests/datetime/datetime_ops_test.cpp +++ b/cpp/tests/datetime/datetime_ops_test.cpp @@ -52,16 +52,26 @@ TYPED_TEST(NonTimestampTest, TestThrowsOnNonTimestamp) cudf::data_type dtype{cudf::type_to_id()}; cudf::column col{dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0}; - EXPECT_THROW(extract_year(col), cudf::logic_error); - EXPECT_THROW(extract_month(col), cudf::logic_error); - EXPECT_THROW(extract_day(col), cudf::logic_error); - EXPECT_THROW(extract_weekday(col), cudf::logic_error); - EXPECT_THROW(extract_hour(col), cudf::logic_error); - EXPECT_THROW(extract_minute(col), cudf::logic_error); - EXPECT_THROW(extract_second(col), cudf::logic_error); - EXPECT_THROW(extract_millisecond_fraction(col), cudf::logic_error); - EXPECT_THROW(extract_microsecond_fraction(col), cudf::logic_error); - EXPECT_THROW(extract_nanosecond_fraction(col), cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::YEAR), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::MONTH), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::DAY), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::WEEKDAY), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::HOUR), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::MINUTE), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::SECOND), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::MILLISECOND), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::MICROSECOND), + cudf::logic_error); + EXPECT_THROW(extract_datetime_component(col, cudf::datetime::datetime_component::NANOSECOND), + cudf::logic_error); EXPECT_THROW(last_day_of_month(col), cudf::logic_error); EXPECT_THROW(day_of_year(col), cudf::logic_error); EXPECT_THROW(add_calendrical_months(col, *cudf::make_empty_column(cudf::type_id::INT16)), @@ -104,96 +114,6 @@ TEST_F(BasicDatetimeOpsTest, TestExtractingDatetimeComponents) 987234623 // 1970-01-01 00:00:00.987234623 GMT }; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_D), - fixed_width_column_wrapper{1965, 2018, 2023}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_s), - fixed_width_column_wrapper{1965, 2018, 2023}); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_ms), - fixed_width_column_wrapper{1965, 2018, 2023}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps_ns), - fixed_width_column_wrapper{1969, 1970, 1970}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_D), - fixed_width_column_wrapper{10, 7, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_s), - fixed_width_column_wrapper{10, 7, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_ms), - fixed_width_column_wrapper{10, 7, 1}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps_ns), - fixed_width_column_wrapper{12, 1, 1}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_D), - fixed_width_column_wrapper{26, 4, 25}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_s), - fixed_width_column_wrapper{26, 4, 25}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ms), - fixed_width_column_wrapper{26, 4, 25}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps_ns), - fixed_width_column_wrapper{31, 1, 1}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_D), - fixed_width_column_wrapper{2, 3, 3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_s), - fixed_width_column_wrapper{2, 3, 3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_ms), - fixed_width_column_wrapper{2, 3, 3}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps_ms), - fixed_width_column_wrapper{2, 3, 3}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_s), - fixed_width_column_wrapper{14, 12, 7}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_ms), - fixed_width_column_wrapper{14, 12, 7}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps_ns), - fixed_width_column_wrapper{23, 0, 0}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_s), - fixed_width_column_wrapper{1, 0, 32}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ms), - fixed_width_column_wrapper{1, 0, 32}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ns), - fixed_width_column_wrapper{59, 0, 0}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_s), - fixed_width_column_wrapper{12, 0, 12}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps_ms), - fixed_width_column_wrapper{12, 0, 12}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps_ns), - fixed_width_column_wrapper{59, 0, 0}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_s), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_ms), - fixed_width_column_wrapper{762, 0, 929}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps_ns), - fixed_width_column_wrapper{976, 23, 987}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_s), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_ms), - 
fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps_ns), - fixed_width_column_wrapper{675, 432, 234}); - - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_D), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_s), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ms), - fixed_width_column_wrapper{0, 0, 0}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps_ns), - fixed_width_column_wrapper{766, 424, 623}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL( *extract_datetime_component(timestamps_D, cudf::datetime::datetime_component::YEAR), fixed_width_column_wrapper{1965, 2018, 2023}); @@ -346,16 +266,29 @@ TYPED_TEST(TypedDatetimeOpsTest, TestEmptyColumns) cudf::column int16s{int16s_dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0}; cudf::column timestamps{timestamps_dtype, 0, rmm::device_buffer{}, rmm::device_buffer{}, 0}; - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_millisecond_fraction(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_microsecond_fraction(timestamps), int16s); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_nanosecond_fraction(timestamps), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::YEAR), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MONTH), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::DAY), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::WEEKDAY), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::HOUR), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MINUTE), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::SECOND), int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MILLISECOND), + int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MICROSECOND), + int16s); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::NANOSECOND), + int16s); } TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) @@ -385,13 +318,27 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedDatetimeComponents) expected_seconds = fixed_width_column_wrapper{0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; } - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps), expected_years); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps), expected_months); - 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps), expected_days); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps), expected_weekdays); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps), expected_hours); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps), expected_minutes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps), expected_seconds); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::YEAR), + expected_years); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MONTH), + expected_months); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::DAY), + expected_days); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::WEEKDAY), + expected_weekdays); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::HOUR), + expected_hours); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MINUTE), + expected_minutes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::SECOND), + expected_seconds); } TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedNullableDatetimeComponents) @@ -441,13 +388,27 @@ TYPED_TEST(TypedDatetimeOpsTest, TestExtractingGeneratedNullableDatetimeComponen {true, false, true, false, true, false, true, false, true, false}}; } - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_year(timestamps), expected_years); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_month(timestamps), expected_months); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_day(timestamps), expected_days); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_weekday(timestamps), expected_weekdays); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_hour(timestamps), expected_hours); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_minute(timestamps), expected_minutes); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(*extract_second(timestamps), expected_seconds); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::YEAR), + expected_years); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MONTH), + expected_months); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::DAY), + expected_days); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::WEEKDAY), + expected_weekdays); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::HOUR), + expected_hours); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::MINUTE), + expected_minutes); + CUDF_TEST_EXPECT_COLUMNS_EQUAL( + *extract_datetime_component(timestamps, cudf::datetime::datetime_component::SECOND), + expected_seconds); } TEST_F(BasicDatetimeOpsTest, TestLastDayOfMonthWithSeconds) diff --git a/cpp/tests/streams/datetime_test.cpp b/cpp/tests/streams/datetime_test.cpp index 82629156fa6..29b302c3637 100644 --- a/cpp/tests/streams/datetime_test.cpp +++ b/cpp/tests/streams/datetime_test.cpp @@ -35,52 +35,62 @@ class DatetimeTest : public cudf::test::BaseFixture { TEST_F(DatetimeTest, ExtractYear) { - 
cudf::datetime::extract_year(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::YEAR, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractMonth) { - cudf::datetime::extract_month(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::MONTH, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractDay) { - cudf::datetime::extract_day(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::DAY, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractWeekday) { - cudf::datetime::extract_weekday(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::WEEKDAY, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractHour) { - cudf::datetime::extract_hour(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::HOUR, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractMinute) { - cudf::datetime::extract_minute(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::MINUTE, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractSecond) { - cudf::datetime::extract_second(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::SECOND, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractMillisecondFraction) { - cudf::datetime::extract_millisecond_fraction(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::MILLISECOND, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractMicrosecondFraction) { - cudf::datetime::extract_microsecond_fraction(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::MICROSECOND, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, ExtractNanosecondFraction) { - cudf::datetime::extract_nanosecond_fraction(timestamps, cudf::test::get_default_stream()); + cudf::datetime::extract_datetime_component( + timestamps, cudf::datetime::datetime_component::NANOSECOND, cudf::test::get_default_stream()); } TEST_F(DatetimeTest, LastDayOfMonth) From adf32694e7b4eb9f91e928bf6dbf0818b97bcf35 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 6 Nov 2024 09:26:58 -0800 Subject: [PATCH 07/12] Search for kvikio with lowercase (#17243) ## Description The case-sensitive name KvikIO is will throw off `find_package` searches, particularly after https://github.com/rapidsai/devcontainers/pull/414 make the usage consistent in devcontainers. ## Checklist - [x] I am familiar with the [Contributing Guidelines](https://github.com/rapidsai/cudf/blob/HEAD/CONTRIBUTING.md). - [x] New or existing tests cover these changes. - [x] The documentation is up to date with these changes. 
--- cpp/cmake/thirdparty/get_kvikio.cmake | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/cmake/thirdparty/get_kvikio.cmake b/cpp/cmake/thirdparty/get_kvikio.cmake index 20712beec41..c949f48505e 100644 --- a/cpp/cmake/thirdparty/get_kvikio.cmake +++ b/cpp/cmake/thirdparty/get_kvikio.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# Copyright (c) 2022-2024, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -16,7 +16,7 @@ function(find_and_configure_kvikio VERSION) rapids_cpm_find( - KvikIO ${VERSION} + kvikio ${VERSION} GLOBAL_TARGETS kvikio::kvikio CPM_ARGS GIT_REPOSITORY https://github.com/rapidsai/kvikio.git From 06b3f83b3e7f1b1364973be34f58fac4caf773f3 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 6 Nov 2024 16:54:28 -0500 Subject: [PATCH 08/12] Disallow cuda-python 12.6.1 and 11.8.4 (#17253) Due to a bug in cuda-python we must disallow cuda-python 12.6.1 and 11.8.4. This PR disallows those versions. It also silences new cuda-python deprecation warnings so that our test suite passes. See https://github.com/rapidsai/build-planning/issues/116 for more information. --------- Co-authored-by: James Lamb --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 4 ++-- conda/recipes/pylibcudf/meta.yaml | 4 ++-- dependencies.yaml | 8 ++++---- python/cudf/pyproject.toml | 4 +++- python/cudf_kafka/pyproject.toml | 4 +++- python/cudf_polars/pyproject.toml | 4 +++- python/custreamz/pyproject.toml | 2 ++ python/dask_cudf/pyproject.toml | 2 ++ python/pylibcudf/pyproject.toml | 4 +++- 11 files changed, 26 insertions(+), 14 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 9d9fec97731..ace55a15c09 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0 +- cuda-python>=11.7.1,<12.0a0,!=11.8.4 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index 19e3eafd641..d20db44497e 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0 +- cuda-python>=12.0,<13.0a0,!=12.6.1 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 2c254415318..6debcb281b1 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. 
- - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,!=11.8.4 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,!=12.6.1 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 3d965f30986..92ca495f972 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0 + - cuda-python >=11.7.1,<12.0a0,!=11.8.4 {% else %} - - cuda-python >=12.0,<13.0a0 + - cuda-python >=12.0,<13.0a0,!=12.6.1 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/dependencies.yaml b/dependencies.yaml index 90255ca674c..cc31619c217 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -658,10 +658,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0 + - cuda-python>=12.0,<13.0a0,!=12.6.1 - matrix: {cuda: "11.*"} packages: &run_pylibcudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0 + - cuda-python>=11.7.1,<12.0a0,!=11.8.4 - {matrix: null, packages: *run_pylibcudf_packages_all_cu11} run_cudf: common: @@ -684,10 +684,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0 + - cuda-python>=12.0,<13.0a0,!=12.6.1 - matrix: {cuda: "11.*"} packages: &run_cudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0 + - cuda-python>=11.7.1,<12.0a0,!=11.8.4 - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index b6105c17b3e..53f22a11e6b 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,7 +20,7 @@ requires-python = ">=3.10" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=11.7.1,<12.0a0", + "cuda-python>=11.7.1,<12.0a0,!=11.8.4", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==24.12.*,>=0.0.0a0", @@ -90,6 +90,8 @@ filterwarnings = [ "error", "ignore:::.*xdist.*", "ignore:::.*pytest.*", + # https://github.com/rapidsai/build-planning/issues/116 + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", # some third-party dependencies (e.g. 
'boto3') still using datetime.datetime.utcnow() "ignore:.*datetime.*utcnow.*scheduled for removal.*:DeprecationWarning:botocore", # Deprecation warning from Pyarrow Table.to_pandas() with pandas-2.2+ diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 667cd7b1db8..ec0bc0eb22b 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -51,7 +51,9 @@ rapids = ["rmm", "cudf", "dask_cudf"] addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ - "error" + "error", + # https://github.com/rapidsai/build-planning/issues/116 + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", ] xfail_strict = true diff --git a/python/cudf_polars/pyproject.toml b/python/cudf_polars/pyproject.toml index a2c62ef9460..2e75dff5c9e 100644 --- a/python/cudf_polars/pyproject.toml +++ b/python/cudf_polars/pyproject.toml @@ -53,7 +53,9 @@ version = {file = "cudf_polars/VERSION"} addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ - "error" + "error", + # https://github.com/rapidsai/build-planning/issues/116 + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", ] xfail_strict = true diff --git a/python/custreamz/pyproject.toml b/python/custreamz/pyproject.toml index a8ab05a3922..d3baf3bf4d2 100644 --- a/python/custreamz/pyproject.toml +++ b/python/custreamz/pyproject.toml @@ -85,6 +85,8 @@ addopts = "--tb=native --strict-config --strict-markers" empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", + # https://github.com/rapidsai/build-planning/issues/116 + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", "ignore:unclosed =11.7.1,<12.0a0", + "cuda-python>=11.7.1,<12.0a0,!=11.8.4", "libcudf==24.12.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", @@ -74,6 +74,8 @@ addopts = "--tb=native --strict-config --strict-markers --import-mode=importlib" empty_parameter_set_mark = "fail_at_collect" filterwarnings = [ "error", + # https://github.com/rapidsai/build-planning/issues/116 + "ignore:.*cuda..* module is deprecated.*:DeprecationWarning", "ignore:::.*xdist.*", "ignore:::.*pytest.*" ] From 57900dee500a1a051393dea438d32d94ecd4de61 Mon Sep 17 00:00:00 2001 From: "Mads R. B. Kristensen" Date: Thu, 7 Nov 2024 02:47:47 +0100 Subject: [PATCH 09/12] KvikIO shared library (#17239) Update cudf to use the new KvikIO shared library: https://github.com/rapidsai/kvikio/pull/527 #### Tasks - [x] Wait for the [KvikIO shared library PR](https://github.com/rapidsai/kvikio/pull/527) to be merged. - [x] Revert the use of the [KvikIO shared library](https://github.com/rapidsai/kvikio/pull/527) in CI: https://github.com/rapidsai/cudf/commit/2d8eeafe4959357a17f6ad488811837e0a07ba65. Authors: - Mads R. B. 
Kristensen (https://github.com/madsbk) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - James Lamb (https://github.com/jameslamb) URL: https://github.com/rapidsai/cudf/pull/17239 --- ci/build_wheel_cudf.sh | 1 + ci/build_wheel_libcudf.sh | 1 + ci/build_wheel_pylibcudf.sh | 1 + dependencies.yaml | 1 + python/libcudf/libcudf/load.py | 11 +++++++++++ python/libcudf/pyproject.toml | 1 + 6 files changed, 16 insertions(+) diff --git a/ci/build_wheel_cudf.sh b/ci/build_wheel_cudf.sh index fef4416a366..ae4eb0d5c66 100755 --- a/ci/build_wheel_cudf.sh +++ b/ci/build_wheel_cudf.sh @@ -23,6 +23,7 @@ export PIP_CONSTRAINT="/tmp/constraints.txt" python -m auditwheel repair \ --exclude libcudf.so \ --exclude libnvcomp.so \ + --exclude libkvikio.so \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* diff --git a/ci/build_wheel_libcudf.sh b/ci/build_wheel_libcudf.sh index b3d6778ea04..aabd3814a24 100755 --- a/ci/build_wheel_libcudf.sh +++ b/ci/build_wheel_libcudf.sh @@ -33,6 +33,7 @@ RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})" mkdir -p ${package_dir}/final_dist python -m auditwheel repair \ --exclude libnvcomp.so.4 \ + --exclude libkvikio.so \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* diff --git a/ci/build_wheel_pylibcudf.sh b/ci/build_wheel_pylibcudf.sh index 839d98846fe..c4a89f20f5f 100755 --- a/ci/build_wheel_pylibcudf.sh +++ b/ci/build_wheel_pylibcudf.sh @@ -21,6 +21,7 @@ export PIP_CONSTRAINT="/tmp/constraints.txt" python -m auditwheel repair \ --exclude libcudf.so \ --exclude libnvcomp.so \ + --exclude libkvikio.so \ -w ${package_dir}/final_dist \ ${package_dir}/dist/* diff --git a/dependencies.yaml b/dependencies.yaml index cc31619c217..41ac6ce1808 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -177,6 +177,7 @@ files: extras: table: project includes: + - depends_on_libkvikio - depends_on_nvcomp py_build_pylibcudf: output: pyproject diff --git a/python/libcudf/libcudf/load.py b/python/libcudf/libcudf/load.py index ba134710868..bf27ecfa7f5 100644 --- a/python/libcudf/libcudf/load.py +++ b/python/libcudf/libcudf/load.py @@ -18,6 +18,17 @@ def load_library(): + try: + # libkvikio must be loaded before libcudf because libcudf references its symbols + import libkvikio + + libkvikio.load_library() + except ModuleNotFoundError: + # libcudf's runtime dependency on libkvikio may be satisfied by a natively + # installed library or a conda package, in which case the import will fail and + # we assume the library is discoverable on system paths. + pass + # Dynamically load libcudf.so. Prefer a system library if one is present to # avoid clobbering symbols that other packages might expect, but if no # other library is present use the one in the wheel. diff --git a/python/libcudf/pyproject.toml b/python/libcudf/pyproject.toml index c6d9ae56467..62726bb0df4 100644 --- a/python/libcudf/pyproject.toml +++ b/python/libcudf/pyproject.toml @@ -38,6 +38,7 @@ classifiers = [ "Environment :: GPU :: NVIDIA CUDA", ] dependencies = [ + "libkvikio==24.12.*,>=0.0.0a0", "nvidia-nvcomp==4.1.0.6", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. 
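For readers unfamiliar with the load-ordering issue this patch addresses, the minimal sketch below illustrates the idea behind the `load.py` change: the dependency's shared library must be opened, with its symbols made globally visible, before the library that references it. This is only an illustration using `ctypes`, not the actual `libcudf.load_library()` implementation; the helper name `load_with_dependency_first` is invented for the example, and it assumes `libkvikio.so` and `libcudf.so` are discoverable on the dynamic loader's search path.

```python
import ctypes


def load_with_dependency_first() -> ctypes.CDLL:
    """Open libkvikio before libcudf so libcudf's KvikIO symbol references resolve."""
    try:
        # RTLD_GLOBAL makes libkvikio's symbols visible to libraries loaded afterwards.
        ctypes.CDLL("libkvikio.so", mode=ctypes.RTLD_GLOBAL)
    except OSError:
        # Fall back to whatever libkvikio the system loader can resolve on its own
        # (mirrors the "natively installed library" case handled in load.py).
        pass
    # With the dependency in place, the dependent library loads without unresolved symbols.
    return ctypes.CDLL("libcudf.so", mode=ctypes.RTLD_GLOBAL)


if __name__ == "__main__":
    print("loaded:", load_with_dependency_first())
```

In the patch itself the same ordering is achieved by importing `libkvikio` and calling `libkvikio.load_library()` at the top of `libcudf.load_library()`, before `libcudf.so` is opened.
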
From 29484cb87a417e2e36c8f3b6cd2ec961abec3156 Mon Sep 17 00:00:00 2001 From: James Lamb Date: Thu, 7 Nov 2024 00:51:59 -0600 Subject: [PATCH 10/12] Put a ceiling on cuda-python (#17264) Follow-up to #17253 Contributes to https://github.com/rapidsai/build-planning/issues/116 That PR used `!=` requirements to skip a particular version of `cuda-python` that `cudf` and `pylibcudf` were incompatible with. A newer version of `cuda-python` (12.6.2 for CUDA 12, 11.8.5 for CUDA 11) was just released, and it also causes some build issues for RAPIDS libraries: https://github.com/rapidsai/cuvs/pull/445#issuecomment-2461146449 To unblock CI across RAPIDS, this proposes **temporarily** switching to ceilings on the `cuda-python` dependency here. Authors: - James Lamb (https://github.com/jameslamb) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/17264 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-125_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 4 ++-- conda/recipes/pylibcudf/meta.yaml | 4 ++-- dependencies.yaml | 8 ++++---- python/cudf/pyproject.toml | 2 +- python/pylibcudf/pyproject.toml | 2 +- 7 files changed, 12 insertions(+), 12 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index ace55a15c09..8a64ebf40c5 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -19,7 +19,7 @@ dependencies: - cramjam - cubinlinker - cuda-nvtx=11.8 -- cuda-python>=11.7.1,<12.0a0,!=11.8.4 +- cuda-python>=11.7.1,<12.0a0,<=11.8.3 - cuda-sanitizer-api=11.8.86 - cuda-version=11.8 - cudatoolkit diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index d20db44497e..5f779c3170f 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -21,7 +21,7 @@ dependencies: - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev -- cuda-python>=12.0,<13.0a0,!=12.6.1 +- cuda-python>=12.0,<13.0a0,<=12.6.0 - cuda-sanitizer-api - cuda-version=12.5 - cupy>=12.0.0 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 6debcb281b1..2aafcae072d 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -91,7 +91,7 @@ requirements: - cudatoolkit - ptxcompiler >=0.7.0 - cubinlinker # CUDA enhanced compatibility. 
- - cuda-python >=11.7.1,<12.0a0,!=11.8.4 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - cuda-cudart - libcufile # [linux64] @@ -100,7 +100,7 @@ requirements: # TODO: Add nvjitlink here # xref: https://github.com/rapidsai/cudf/issues/12822 - cuda-nvrtc - - cuda-python >=12.0,<13.0a0,!=12.6.1 + - cuda-python >=12.0,<13.0a0,<=12.6.0 - pynvjitlink {% endif %} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/pylibcudf/meta.yaml b/conda/recipes/pylibcudf/meta.yaml index 92ca495f972..ec3fcd59c62 100644 --- a/conda/recipes/pylibcudf/meta.yaml +++ b/conda/recipes/pylibcudf/meta.yaml @@ -83,9 +83,9 @@ requirements: - {{ pin_compatible('rmm', max_pin='x.x') }} - fsspec >=0.6.0 {% if cuda_major == "11" %} - - cuda-python >=11.7.1,<12.0a0,!=11.8.4 + - cuda-python >=11.7.1,<12.0a0,<=11.8.3 {% else %} - - cuda-python >=12.0,<13.0a0,!=12.6.1 + - cuda-python >=12.0,<13.0a0,<=12.6.0 {% endif %} - nvtx >=0.2.1 - packaging diff --git a/dependencies.yaml b/dependencies.yaml index 41ac6ce1808..4c6aefe996f 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -659,10 +659,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0,!=12.6.1 + - cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: {cuda: "11.*"} packages: &run_pylibcudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0,!=11.8.4 + - cuda-python>=11.7.1,<12.0a0,<=11.8.3 - {matrix: null, packages: *run_pylibcudf_packages_all_cu11} run_cudf: common: @@ -685,10 +685,10 @@ dependencies: matrices: - matrix: {cuda: "12.*"} packages: - - cuda-python>=12.0,<13.0a0,!=12.6.1 + - cuda-python>=12.0,<13.0a0,<=12.6.0 - matrix: {cuda: "11.*"} packages: &run_cudf_packages_all_cu11 - - cuda-python>=11.7.1,<12.0a0,!=11.8.4 + - cuda-python>=11.7.1,<12.0a0,<=11.8.3 - {matrix: null, packages: *run_cudf_packages_all_cu11} - output_types: conda matrices: diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 53f22a11e6b..1eadceaaccd 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -20,7 +20,7 @@ requires-python = ">=3.10" dependencies = [ "cachetools", "cubinlinker", - "cuda-python>=11.7.1,<12.0a0,!=11.8.4", + "cuda-python>=11.7.1,<12.0a0,<=11.8.3", "cupy-cuda11x>=12.0.0", "fsspec>=0.6.0", "libcudf==24.12.*,>=0.0.0a0", diff --git a/python/pylibcudf/pyproject.toml b/python/pylibcudf/pyproject.toml index e8052dfba4c..b2cec80f484 100644 --- a/python/pylibcudf/pyproject.toml +++ b/python/pylibcudf/pyproject.toml @@ -18,7 +18,7 @@ authors = [ license = { text = "Apache 2.0" } requires-python = ">=3.10" dependencies = [ - "cuda-python>=11.7.1,<12.0a0,!=11.8.4", + "cuda-python>=11.7.1,<12.0a0,<=11.8.3", "libcudf==24.12.*,>=0.0.0a0", "nvtx>=0.2.1", "packaging", From bbd3b43719545754e9a1f6b204aad5b143f48419 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 7 Nov 2024 01:57:47 -0800 Subject: [PATCH 11/12] Fix the example in documentation for `get_dremel_data()` (#17242) Closes #11396. 
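Before the documentation fix below, a brief aside on the constraint change in #17264 above (not part of either patch): a minimal sketch, using the `packaging` library already listed in the pyproject.toml hunks, of why the `<=` ceiling was needed. The earlier `!=` exclusion still admits the newly released 12.6.2, while the ceiling stops at 12.6.0.

```python
# Minimal sketch, not part of the patch: compare the old exclusion-style
# requirement with the ceiling introduced in #17264 using `packaging`.
from packaging.specifiers import SpecifierSet
from packaging.version import Version

exclusion = SpecifierSet(">=12.0,<13.0a0,!=12.6.1")  # previous requirement
ceiling = SpecifierSet(">=12.0,<13.0a0,<=12.6.0")    # requirement from #17264

for candidate in ("12.6.0", "12.6.1", "12.6.2"):
    version = Version(candidate)
    print(candidate, "exclusion:", version in exclusion, "ceiling:", version in ceiling)
# 12.6.0 -> allowed by both; 12.6.1 -> rejected by both;
# 12.6.2 -> allowed by the exclusion but rejected by the ceiling.
```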
Fixes the example in the documentation of `get_dremel_data()` Authors: - Muhammad Haseeb (https://github.com/mhaseeb123) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - David Wendt (https://github.com/davidwendt) - Vukasin Milovanovic (https://github.com/vuule) - Mike Wilson (https://github.com/hyperbolic2346) - MithunR (https://github.com/mythrocks) URL: https://github.com/rapidsai/cudf/pull/17242 --- cpp/include/cudf/lists/detail/dremel.hpp | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/cpp/include/cudf/lists/detail/dremel.hpp b/cpp/include/cudf/lists/detail/dremel.hpp index 96ee30dd261..f45da8e8d8d 100644 --- a/cpp/include/cudf/lists/detail/dremel.hpp +++ b/cpp/include/cudf/lists/detail/dremel.hpp @@ -58,7 +58,7 @@ struct dremel_data { }; /** - * @brief Get the dremel offsets and repetition and definition levels for a LIST column + * @brief Get the dremel offsets, repetition levels, and definition levels for a LIST column * * Dremel is a query system created by Google for ad hoc data analysis. The Dremel engine is * described in depth in the paper "Dremel: Interactive Analysis of Web-Scale @@ -74,7 +74,7 @@ struct dremel_data { * * http://www.goldsborough.me/distributed-systems/2019/05/18/21-09-00-a_look_at_dremel/ * https://akshays-blog.medium.com/wrapping-head-around-repetition-and-definition-levels-in-dremel-powering-bigquery-c1a33c9695da - * https://blog.twitter.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet + * https://blog.x.com/engineering/en_us/a/2013/dremel-made-simple-with-parquet * * The remainder of this documentation assumes familiarity with the Dremel concepts. * @@ -102,16 +102,17 @@ struct dremel_data { * ``` * We can represent it in cudf format with two level of offsets like this: * ``` - * Level 0 offsets = {0, 0, 3, 5, 6} + * Level 0 offsets = {0, 0, 3, 4} * Level 1 offsets = {0, 0, 3, 5, 5} * Values = {1, 2, 3, 4, 5} * ``` - * The desired result of this function is the repetition and definition level values that - * correspond to the data values: + * This function returns the dremel offsets, repetition levels, and definition level + * values that correspond to the data values: * ``` - * col = {[], [[], [1, 2, 3], [4, 5]], [[]]} - * def = { 0 1, 2, 2, 2, 2, 2, 1 } - * rep = { 0, 0, 0, 2, 2, 1, 2, 0 } + * col = {[], [[], [1, 2, 3], [4, 5]], [[]]} + * dremel_offsets = { 0, 1, 7, 8} + * def_levels = { 0, 1, 2, 2, 2, 2, 2, 1 } + * rep_levels = { 0, 0, 1, 2, 2, 1, 2, 0 } * ``` * * Since repetition and definition levels arrays contain a value for each empty list, the size of From e29e0ab477f4a541752a578f8769d8dd816ffbe8 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Thu, 7 Nov 2024 06:14:58 -0500 Subject: [PATCH 12/12] Move strings/numeric convert benchmarks to nvbench (#17255) Moves the `cpp/benchmarks/string/convert_numerics.cpp` and `cpp/benchmarks/string/convert_fixed_point.cpp` benchmark implementations from google-bench to nvbench. 
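Returning briefly to the corrected `get_dremel_data()` example in #17242 above (this sketch is not part of either patch): a small pure-Python calculation, assuming a non-nullable two-level LIST column, that reproduces the documented offsets and levels for `col = {[], [[], [1, 2, 3], [4, 5]], [[]]}`.

```python
# Minimal sketch, not part of the patch: reproduce the corrected dremel values
# from #17242 for a non-nullable two-level LIST column (no null handling).
def dremel_levels(rows):
    defs, reps, offsets = [], [], [0]
    for row in rows:
        if not row:                        # empty top-level row
            defs.append(0)
            reps.append(0)
        else:
            for i, inner in enumerate(row):
                first_rep = 0 if i == 0 else 1   # a new inner list within the same row
                if not inner:              # empty inner list
                    defs.append(1)
                    reps.append(first_rep)
                else:
                    for j, _ in enumerate(inner):
                        defs.append(2)     # leaf value present, fully defined
                        reps.append(first_rep if j == 0 else 2)
        offsets.append(len(defs))          # one dremel offset entry per row boundary
    return offsets, reps, defs

col = [[], [[], [1, 2, 3], [4, 5]], [[]]]
offsets, rep_levels, def_levels = dremel_levels(col)
print(offsets)     # [0, 1, 7, 8]
print(def_levels)  # [0, 1, 2, 2, 2, 2, 2, 1]
print(rep_levels)  # [0, 0, 1, 2, 2, 1, 2, 0]
```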
Authors: - David Wendt (https://github.com/davidwendt) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Shruti Shivakumar (https://github.com/shrshi) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/17255 --- cpp/benchmarks/CMakeLists.txt | 4 +- cpp/benchmarks/string/convert_fixed_point.cpp | 111 +++++--------- cpp/benchmarks/string/convert_numerics.cpp | 138 ++++++------------ 3 files changed, 79 insertions(+), 174 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 68781889c53..bdc360c082b 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -358,8 +358,6 @@ ConfigureBench( STRINGS_BENCH string/convert_datetime.cpp string/convert_durations.cpp - string/convert_fixed_point.cpp - string/convert_numerics.cpp string/copy.cu string/factory.cu string/filter.cpp @@ -375,6 +373,8 @@ ConfigureNVBench( string/char_types.cpp string/combine.cpp string/contains.cpp + string/convert_fixed_point.cpp + string/convert_numerics.cpp string/copy_if_else.cpp string/copy_range.cpp string/count.cpp diff --git a/cpp/benchmarks/string/convert_fixed_point.cpp b/cpp/benchmarks/string/convert_fixed_point.cpp index e5bd794e405..97e114c0795 100644 --- a/cpp/benchmarks/string/convert_fixed_point.cpp +++ b/cpp/benchmarks/string/convert_fixed_point.cpp @@ -16,93 +16,48 @@ #include #include -#include #include #include #include -namespace { +#include -std::unique_ptr get_strings_column(cudf::size_type rows) -{ - auto result = - create_random_column(cudf::type_id::FLOAT32, row_count{static_cast(rows)}); - return cudf::strings::from_floats(result->view()); -} - -} // anonymous namespace - -class StringsToFixedPoint : public cudf::benchmark {}; - -template -void convert_to_fixed_point(benchmark::State& state) -{ - auto const rows = static_cast(state.range(0)); - auto const strings_col = get_strings_column(rows); - auto const strings_view = cudf::strings_column_view(strings_col->view()); - auto const dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; - - for (auto _ : state) { - cuda_event_timer raii(state, true); - auto volatile results = cudf::strings::to_fixed_point(strings_view, dtype); - } +using Types = nvbench::type_list; - // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed( - state.iterations() * - (strings_view.chars_size(cudf::get_default_stream()) + rows * cudf::size_of(dtype))); -} - -class StringsFromFixedPoint : public cudf::benchmark {}; +NVBENCH_DECLARE_TYPE_STRINGS(numeric::decimal32, "decimal32", "decimal32"); +NVBENCH_DECLARE_TYPE_STRINGS(numeric::decimal64, "decimal64", "decimal64"); -template -void convert_from_fixed_point(benchmark::State& state) +template +void bench_convert_fixed_point(nvbench::state& state, nvbench::type_list) { - auto const rows = static_cast(state.range(0)); - auto const strings_col = get_strings_column(rows); - auto const dtype = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; - auto const fp_col = - cudf::strings::to_fixed_point(cudf::strings_column_view(strings_col->view()), dtype); - - std::unique_ptr results = nullptr; - - for (auto _ : state) { - cuda_event_timer raii(state, true); - results = cudf::strings::from_fixed_point(fp_col->view()); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const from_num = state.get_string("dir") == "from"; + + auto const data_type = cudf::data_type{cudf::type_to_id(), numeric::scale_type{-2}}; + auto const fp_col = 
create_random_column(data_type.id(), row_count{num_rows}); + + auto const strings_col = cudf::strings::from_fixed_point(fp_col->view()); + auto const sv = cudf::strings_column_view(strings_col->view()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (from_num) { + state.add_global_memory_reads(num_rows * cudf::size_of(data_type)); + state.add_global_memory_writes(sv.chars_size(stream)); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::to_fixed_point(sv, data_type); }); + } else { + state.add_global_memory_reads(sv.chars_size(stream)); + state.add_global_memory_writes(num_rows * cudf::size_of(data_type)); + state.exec(nvbench::exec_tag::sync, + [&](nvbench::launch& launch) { cudf::strings::from_fixed_point(fp_col->view()); }); } - - // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed( - state.iterations() * - (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) + - rows * cudf::size_of(dtype))); } -#define CONVERT_TO_FIXED_POINT_BMD(name, fixed_point_type) \ - BENCHMARK_DEFINE_F(StringsToFixedPoint, name)(::benchmark::State & state) \ - { \ - convert_to_fixed_point(state); \ - } \ - BENCHMARK_REGISTER_F(StringsToFixedPoint, name) \ - ->RangeMultiplier(4) \ - ->Range(1 << 12, 1 << 24) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -#define CONVERT_FROM_FIXED_POINT_BMD(name, fixed_point_type) \ - BENCHMARK_DEFINE_F(StringsFromFixedPoint, name)(::benchmark::State & state) \ - { \ - convert_from_fixed_point(state); \ - } \ - BENCHMARK_REGISTER_F(StringsFromFixedPoint, name) \ - ->RangeMultiplier(4) \ - ->Range(1 << 12, 1 << 24) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -CONVERT_TO_FIXED_POINT_BMD(strings_to_decimal32, numeric::decimal32); -CONVERT_TO_FIXED_POINT_BMD(strings_to_decimal64, numeric::decimal64); - -CONVERT_FROM_FIXED_POINT_BMD(strings_from_decimal32, numeric::decimal32); -CONVERT_FROM_FIXED_POINT_BMD(strings_from_decimal64, numeric::decimal64); +NVBENCH_BENCH_TYPES(bench_convert_fixed_point, NVBENCH_TYPE_AXES(Types)) + .set_name("fixed_point") + .set_type_axes_names({"DataType"}) + .add_string_axis("dir", {"to", "from"}) + .add_int64_axis("num_rows", {1 << 16, 1 << 18, 1 << 20, 1 << 22}); diff --git a/cpp/benchmarks/string/convert_numerics.cpp b/cpp/benchmarks/string/convert_numerics.cpp index 8f875c5c80f..e1f650dd6cd 100644 --- a/cpp/benchmarks/string/convert_numerics.cpp +++ b/cpp/benchmarks/string/convert_numerics.cpp @@ -16,117 +16,67 @@ #include #include -#include #include #include #include -namespace { +#include -template -std::unique_ptr get_numerics_column(cudf::size_type rows) -{ - return create_random_column(cudf::type_to_id(), row_count{rows}); -} +namespace { template -std::unique_ptr get_strings_column(cudf::size_type rows) +std::unique_ptr get_strings_column(cudf::column_view const& nv) { - auto const numerics_col = get_numerics_column(rows); if constexpr (std::is_floating_point_v) { - return cudf::strings::from_floats(numerics_col->view()); + return cudf::strings::from_floats(nv); } else { - return cudf::strings::from_integers(numerics_col->view()); - } -} -} // anonymous namespace - -class StringsToNumeric : public cudf::benchmark {}; - -template -void convert_to_number(benchmark::State& state) -{ - auto const rows = static_cast(state.range(0)); - - auto const strings_col = get_strings_column(rows); - auto const strings_view = 
cudf::strings_column_view(strings_col->view()); - auto const col_type = cudf::type_to_id(); - - for (auto _ : state) { - cuda_event_timer raii(state, true); - if constexpr (std::is_floating_point_v) { - cudf::strings::to_floats(strings_view, cudf::data_type{col_type}); - } else { - cudf::strings::to_integers(strings_view, cudf::data_type{col_type}); - } + return cudf::strings::from_integers(nv); } - - // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed( - state.iterations() * - (strings_view.chars_size(cudf::get_default_stream()) + rows * sizeof(NumericType))); } +} // namespace -class StringsFromNumeric : public cudf::benchmark {}; +using Types = nvbench::type_list; template -void convert_from_number(benchmark::State& state) +void bench_convert_number(nvbench::state& state, nvbench::type_list) { - auto const rows = static_cast(state.range(0)); - - auto const numerics_col = get_numerics_column(rows); - auto const numerics_view = numerics_col->view(); - - std::unique_ptr results = nullptr; - - for (auto _ : state) { - cuda_event_timer raii(state, true); - if constexpr (std::is_floating_point_v) - results = cudf::strings::from_floats(numerics_view); - else - results = cudf::strings::from_integers(numerics_view); + auto const num_rows = static_cast(state.get_int64("num_rows")); + auto const from_num = state.get_string("dir") == "from"; + + auto const data_type = cudf::data_type(cudf::type_to_id()); + auto const num_col = create_random_column(data_type.id(), row_count{num_rows}); + + auto const strings_col = get_strings_column(num_col->view()); + auto const sv = cudf::strings_column_view(strings_col->view()); + + auto stream = cudf::get_default_stream(); + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + + if (from_num) { + state.add_global_memory_reads(num_rows); + state.add_global_memory_writes(sv.chars_size(stream)); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if constexpr (std::is_floating_point_v) { + cudf::strings::to_floats(sv, data_type); + } else { + cudf::strings::to_integers(sv, data_type); + } + }); + } else { + state.add_global_memory_reads(sv.chars_size(stream)); + state.add_global_memory_writes(num_rows); + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + if constexpr (std::is_floating_point_v) + cudf::strings::from_floats(num_col->view()); + else + cudf::strings::from_integers(num_col->view()); + }); } - - // bytes_processed = bytes_input + bytes_output - state.SetBytesProcessed( - state.iterations() * - (cudf::strings_column_view(results->view()).chars_size(cudf::get_default_stream()) + - rows * sizeof(NumericType))); } -#define CONVERT_TO_NUMERICS_BD(name, type) \ - BENCHMARK_DEFINE_F(StringsToNumeric, name)(::benchmark::State & state) \ - { \ - convert_to_number(state); \ - } \ - BENCHMARK_REGISTER_F(StringsToNumeric, name) \ - ->RangeMultiplier(4) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -#define CONVERT_FROM_NUMERICS_BD(name, type) \ - BENCHMARK_DEFINE_F(StringsFromNumeric, name)(::benchmark::State & state) \ - { \ - convert_from_number(state); \ - } \ - BENCHMARK_REGISTER_F(StringsFromNumeric, name) \ - ->RangeMultiplier(4) \ - ->Range(1 << 10, 1 << 17) \ - ->UseManualTime() \ - ->Unit(benchmark::kMicrosecond); - -CONVERT_TO_NUMERICS_BD(strings_to_float32, float); -CONVERT_TO_NUMERICS_BD(strings_to_float64, double); -CONVERT_TO_NUMERICS_BD(strings_to_int32, int32_t); -CONVERT_TO_NUMERICS_BD(strings_to_int64, int64_t); 
-CONVERT_TO_NUMERICS_BD(strings_to_uint8, uint8_t);
-CONVERT_TO_NUMERICS_BD(strings_to_uint16, uint16_t);
-
-CONVERT_FROM_NUMERICS_BD(strings_from_float32, float);
-CONVERT_FROM_NUMERICS_BD(strings_from_float64, double);
-CONVERT_FROM_NUMERICS_BD(strings_from_int32, int32_t);
-CONVERT_FROM_NUMERICS_BD(strings_from_int64, int64_t);
-CONVERT_FROM_NUMERICS_BD(strings_from_uint8, uint8_t);
-CONVERT_FROM_NUMERICS_BD(strings_from_uint16, uint16_t);
+NVBENCH_BENCH_TYPES(bench_convert_number, NVBENCH_TYPE_AXES(Types))
+  .set_name("numeric")
+  .set_type_axes_names({"NumericType"})
+  .add_string_axis("dir", {"to", "from"})
+  .add_int64_axis("num_rows", {1 << 16, 1 << 18, 1 << 20, 1 << 22});
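As a closing aside (not part of the patch): the conversions these benchmarks time can also be exercised from Python through cudf's public `astype` API, which ultimately goes through libcudf's string/numeric convert routines. A minimal sketch, assuming a GPU and a working cudf installation:

```python
# Minimal sketch, not part of the patch: numeric <-> string round trip via the
# public cudf API (assumes a GPU and a working cudf installation).
import cudf

num = cudf.Series([1.25, 2.5, 3.75], dtype="float64")
as_str = num.astype("str")             # numeric -> string conversion
round_trip = as_str.astype("float64")  # string -> numeric conversion

assert round_trip.to_pandas().tolist() == [1.25, 2.5, 3.75]
print(as_str.to_pandas().tolist())
```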