Merge branch 'branch-24.02' into fix-groupby-apply-jit-nans

rapidsai · Jan 9, 2024 · c0ca7da · c0ca7da
2 parents ffdeb85 + 3a1601d
commit c0ca7da
Show file tree

Hide file tree

Showing 31 changed files with 560 additions and 290 deletions.
diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml
@@ -116,7 +116,7 @@ jobs:
       build_type: pull-request
       script: ci/test_wheel_cudf.sh
   wheel-build-dask-cudf:
-    needs: wheel-tests-cudf
+    needs: wheel-build-cudf
     secrets: inherit
     uses: rapidsai/shared-workflows/.github/workflows/[email protected]
     with:

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
@@ -1,5 +1,5 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
@@ -23,7 +23,7 @@ pyproject_file="${package_dir}/pyproject.toml"
 
 sed -i "s/^name = \"${package_name}\"/name = \"${package_name}${PACKAGE_CUDA_SUFFIX}\"/g" ${pyproject_file}
 echo "${version}" > VERSION
-sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name}/_version.py"
+sed -i "/^__git_commit__/ s/= .*/= \"${commit}\"/g" "${package_dir}/${package_name//-/_}/_version.py"
 
 # For nightlies we want to ensure that we're pulling in alphas as well. The
 # easiest way to do so is to augment the spec with a constraint containing a
@@ -34,7 +34,7 @@ if ! rapids-is-release-build; then
     alpha_spec=',>=0.0.0a0'
 fi
 
-if [[ ${package_name} == "dask_cudf" ]]; then
+if [[ ${package_name} == "dask-cudf" ]]; then
     sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
     sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
     sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}

diff --git a/ci/build_wheel_dask_cudf.sh b/ci/build_wheel_dask_cudf.sh
@@ -1,11 +1,11 @@
 #!/bin/bash
-# Copyright (c) 2023, NVIDIA CORPORATION.
+# Copyright (c) 2023-2024, NVIDIA CORPORATION.
 
 set -euo pipefail
 
 package_dir="python/dask_cudf"
 
-./ci/build_wheel.sh dask_cudf ${package_dir}
+./ci/build_wheel.sh dask-cudf ${package_dir}
 
 RAPIDS_PY_CUDA_SUFFIX="$(rapids-wheel-ctk-name-gen ${RAPIDS_CUDA_VERSION})"
 RAPIDS_PY_WHEEL_NAME="dask_cudf_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 ${package_dir}/dist
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -51,7 +51,6 @@ dependencies:
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.2.*
 - make
-- mimesis>=4.1.0
 - moto>=4.0.8
 - msgpack-python
 - myst-nb

diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -50,7 +50,6 @@ dependencies:
 - librdkafka>=1.9.0,<1.10.0a0
 - librmm==24.2.*
 - make
-- mimesis>=4.1.0
 - moto>=4.0.8
 - msgpack-python
 - myst-nb

diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -63,10 +63,11 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
    *
    * Use the indexalator_factory to create an iterator instance.
    *
-   * @param data      Pointer to an integer array in device memory.
-   * @param dtype Type of data in data
+   * @param data   Pointer to an integer array in device memory
+   * @param dtype  Type of data in data
+   * @param offset Index value within `offsets` to use as the beginning of the iterator
    */
-  CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype)
+  CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype, size_type offset = 0)
     : base_normalator<input_offsetalator, int64_t>(
         dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)),
       p_{static_cast<char const*>(data)}
@@ -78,6 +79,7 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
     cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
                 "Unexpected offsets type");
 #endif
+    p_ += (this->width_ * offset);
   }
 
  protected:

diff --git a/cpp/include/cudf/detail/offsets_iterator_factory.cuh b/cpp/include/cudf/detail/offsets_iterator_factory.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2023, NVIDIA CORPORATION.
+ * Copyright (c) 2023-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,14 +28,19 @@ namespace detail {
 struct offsetalator_factory {
   /**
    * @brief Create an input offsetalator instance from an offsets column
+   *
+   * @param offsets Column to wrap with an offsetalator
+   * @param offset Index value within `offsets` to use as the beginning of the iterator
    */
-  static input_offsetalator make_input_iterator(column_view const& offsets)
+  static input_offsetalator make_input_iterator(column_view const& offsets, size_type offset = 0)
   {
-    return input_offsetalator(offsets.head(), offsets.type());
+    return input_offsetalator(offsets.head(), offsets.type(), offset);
   }
 
   /**
    * @brief Create an output offsetalator instance from an offsets column
+   *
+   * @param offsets Column to wrap with an offsetalator
    */
   static output_offsetalator make_output_iterator(mutable_column_view const& offsets)
   {

diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp
@@ -124,6 +124,14 @@ class writer {
    * @brief Finishes the chunked/streamed write process.
    */
   void close();
+
+  /**
+   * @brief Skip work done in `close()`; should be called if `write()` failed.
+   *
+   * Calling skip_close() prevents the writer from writing the (invalid) file footer and the
+   * postscript.
+   */
+  void skip_close();
 };
 }  // namespace orc::detail
 }  // namespace cudf::io
diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -18,6 +18,7 @@
 #include <cudf/column/column.hpp>
 #include <cudf/column/column_device_view.cuh>
 #include <cudf/column/column_factories.hpp>
+#include <cudf/detail/offsets_iterator_factory.cuh>
 #include <cudf/detail/sizes_to_offsets_iterator.cuh>
 #include <cudf/detail/utilities/cuda.cuh>
 #include <cudf/strings/detail/utilities.hpp>
@@ -79,7 +80,7 @@ __forceinline__ __device__ uint4 load_uint4(char const* ptr)
 template <typename StringIterator, typename MapIterator>
 __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
                                                 char* out_chars,
-                                                cudf::device_span<int32_t const> const out_offsets,
+                                                cudf::detail::input_offsetalator const out_offsets,
                                                 MapIterator string_indices,
                                                 size_type total_out_strings)
 {
@@ -109,28 +110,25 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
     // between `[out_start_aligned, out_end_aligned)` will be copied using uint4.
     // `out_start + 4` and `out_end - 4` are used instead of `out_start` and `out_end` to avoid
     // `load_uint4` reading beyond string boundaries.
-    int32_t out_start_aligned =
+    int64_t const out_start_aligned =
       (out_start + in_datatype_size + alignment_offset + out_datatype_size - 1) /
         out_datatype_size * out_datatype_size -
       alignment_offset;
-    int32_t out_end_aligned =
+    int64_t const out_end_aligned =
       (out_end - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size -
       alignment_offset;
 
-    for (size_type ichar = out_start_aligned + warp_lane * out_datatype_size;
-         ichar < out_end_aligned;
+    for (int64_t ichar = out_start_aligned + warp_lane * out_datatype_size; ichar < out_end_aligned;
          ichar += cudf::detail::warp_size * out_datatype_size) {
       *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) =
         load_uint4(in_start + ichar - out_start);
     }
 
-    // Tail logic: copy characters of the current string outside `[out_start_aligned,
-    // out_end_aligned)`.
+    // Copy characters of the current string outside [out_start_aligned, out_end_aligned)
     if (out_end_aligned <= out_start_aligned) {
       // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the
       // entire string.
-      for (int32_t ichar = out_start + warp_lane; ichar < out_end;
-           ichar += cudf::detail::warp_size) {
+      for (auto ichar = out_start + warp_lane; ichar < out_end; ichar += cudf::detail::warp_size) {
         out_chars[ichar] = in_start[ichar - out_start];
       }
     } else {
@@ -139,7 +137,7 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
         out_chars[out_start + warp_lane] = in_start[warp_lane];
       }
       // Copy characters in range `[out_end_aligned, out_end)`.
-      int32_t ichar = out_end_aligned + warp_lane;
+      auto const ichar = out_end_aligned + warp_lane;
       if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; }
     }
   }
@@ -164,11 +162,11 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
 template <int strings_per_threadblock, typename StringIterator, typename MapIterator>
 __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
                                               char* out_chars,
-                                              cudf::device_span<int32_t const> const out_offsets,
+                                              cudf::detail::input_offsetalator const out_offsets,
                                               MapIterator string_indices,
                                               size_type total_out_strings)
 {
-  __shared__ int32_t out_offsets_threadblock[strings_per_threadblock + 1];
+  __shared__ int64_t out_offsets_threadblock[strings_per_threadblock + 1];
 
   // Current thread block will process output strings starting at `begin_out_string_idx`.
   size_type begin_out_string_idx = blockIdx.x * strings_per_threadblock;
@@ -185,7 +183,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
   }
   __syncthreads();
 
-  for (int32_t out_ibyte = threadIdx.x + out_offsets_threadblock[0];
+  for (int64_t out_ibyte = threadIdx.x + out_offsets_threadblock[0];
        out_ibyte < out_offsets_threadblock[strings_current_threadblock];
        out_ibyte += blockDim.x) {
     // binary search for the string index corresponding to out_ibyte
@@ -197,7 +195,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
     size_type string_idx = thrust::distance(out_offsets_threadblock, string_idx_iter);
 
     // calculate which character to load within the string
-    int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx];
+    auto const icharacter = out_ibyte - out_offsets_threadblock[string_idx];
 
     size_type in_string_idx = string_indices[begin_out_string_idx + string_idx];
     out_chars[out_ibyte]    = strings_begin[in_string_idx].data()[icharacter];
@@ -227,7 +225,7 @@ template <typename StringIterator, typename MapIterator>
 std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
                                            MapIterator map_begin,
                                            MapIterator map_end,
-                                           cudf::device_span<int32_t const> const offsets,
+                                           cudf::detail::input_offsetalator const offsets,
                                            size_type chars_bytes,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
@@ -300,22 +298,25 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
 
   // build offsets column
   auto const d_strings    = column_device_view::create(strings.parent(), stream);
-  auto const d_in_offsets = !strings.is_empty() ? strings.offsets_begin() : nullptr;
+  auto const d_in_offsets = cudf::detail::offsetalator_factory::make_input_iterator(
+    strings.is_empty() ? make_empty_column(type_id::INT32)->view() : strings.offsets(),
+    strings.offset());
 
   auto offsets_itr = thrust::make_transform_iterator(
     begin,
     cuda::proclaim_return_type<size_type>(
       [d_strings = *d_strings, d_in_offsets] __device__(size_type idx) {
         if (NullifyOutOfBounds && (idx < 0 || idx >= d_strings.size())) { return 0; }
         if (not d_strings.is_valid(idx)) { return 0; }
-        return d_in_offsets[idx + 1] - d_in_offsets[idx];
+        return static_cast<size_type>(d_in_offsets[idx + 1] - d_in_offsets[idx]);
       }));
   auto [out_offsets_column, total_bytes] =
     cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr);
 
   // build chars column
-  auto const offsets_view = out_offsets_column->view();
-  auto out_chars_column   = gather_chars(
+  auto const offsets_view =
+    cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view());
+  auto out_chars_column = gather_chars(
     d_strings->begin<string_view>(), begin, end, offsets_view, total_bytes, stream, mr);
 
   return make_strings_column(output_count,

diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -103,9 +103,8 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
       auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
       // use a character-parallel kernel for long string lengths
       if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
-        auto const d_data = offsets_view.template data<size_type>();
         auto const d_offsets =
-          device_span<size_type const>{d_data, static_cast<std::size_t>(offsets_view.size())};
+          cudf::detail::offsetalator_factory::make_input_iterator(offsets_view);
         auto const str_begin = thrust::make_transform_iterator(
           begin, cuda::proclaim_return_type<string_view>([] __device__(auto ip) {
             return string_view{ip.first, ip.second};

diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -53,6 +53,20 @@ rmm::device_uvector<string_view> create_string_vector_from_column(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr);
 
+/**
+ * @brief Return a normalized offset value from a strings offsets column
+ *
+ * @throw std::invalid_argument if `offsets` is neither INT32 nor INT64
+ *
+ * @param offsets Input column of type INT32 or INT64
+ * @param index Row value to retrieve
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ * @return Value at `offsets[index]`
+ */
+int64_t get_offset_value(cudf::column_view const& offsets,
+                         size_type index,
+                         rmm::cuda_stream_view stream);
+
 }  // namespace detail
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
@@ -436,7 +436,16 @@ void write_orc(orc_writer_options const& options)
   auto writer = std::make_unique<orc::detail::writer>(
     std::move(sinks[0]), options, io_detail::single_write_mode::YES, cudf::get_default_stream());
 
-  writer->write(options.get_table());
+  try {
+    writer->write(options.get_table());
+  } catch (...) {
+    // If an exception is thrown, the output is incomplete/corrupted.
+    // Make sure the writer will not close with such corrupted data.
+    // In addition, the writer may throw an exception while trying to close, which would terminate
+    // the process.
+    writer->skip_close();
+    throw;
+  }
 }
 
 /**