NVIDIA · pciolkosz · Nov 25, 2024 · Nov 21, 2024 · Nov 22, 2024 · Nov 22, 2024
@@ -23,15 +23,17 @@
 
 #include <cuda/std/__ranges/concepts.h>
 #include <cuda/std/__type_traits/is_convertible.h>
+#include <cuda/std/mdspan>
 #include <cuda/std/span>
 
 #include <cuda/experimental/__launch/launch_transform.cuh>
 
 namespace cuda::experimental
 {
+
 #if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES)
 template <typename _Tp>
-concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
+concept __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
 
 #else
 template <typename _Tp, typename = int>
@@ -45,10 +47,28 @@ inline constexpr bool __convertible_to_span<
     int>> = true;
 
 template <typename _Tp>
-inline constexpr bool __valid_copy_fill_argument =
+inline constexpr bool __valid_1d_copy_fill_argument =
   _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>> || __convertible_to_span<_Tp>;
 
 #endif
 
+//! Names the `cuda::std::mdspan` specialization described by the member aliases `value_type`, `extents_type`,
+//! `layout_type` and `accessor_type` of the decayed `_Tp`.
+//! NOTE(review): the element type is taken from `value_type` rather than `element_type`; presumably this drops a
+//! `const` element qualifier when `_Tp` is itself a `cuda::std::mdspan` - confirm that this is intended.
+template <typename _Tp, typename _Decayed = _CUDA_VSTD::decay_t<_Tp>>
+using __as_mdspan_t =
+  _CUDA_VSTD::mdspan<typename _Decayed::value_type,
+                     typename _Decayed::extents_type,
+                     typename _Decayed::layout_type,
+                     typename _Decayed::accessor_type>;
+
+//! Primary template: by default a type is not treated as convertible to an mdspan.
+template <typename _Tp, typename = int>
+inline constexpr bool __convertible_to_mdspan = false;
+
+//! Selected only when `__as_mdspan_t<_Tp>` is well formed and `_Tp` is convertible to it; otherwise the
+//! `enable_if_t` argument fails substitution and the primary template (false) is used.
+template <typename _Tp>
+inline constexpr bool
+  __convertible_to_mdspan<_Tp, _CUDA_VSTD::enable_if_t<_CUDA_VSTD::is_convertible_v<_Tp, __as_mdspan_t<_Tp>>, int>> =
+    true;
+
+//! True when the copy argument type derived from `_Tp` (`detail::__as_copy_arg_t<_Tp>`) is convertible to the
+//! mdspan type it describes. Serves as the constraint for mdspan-based copy/fill arguments.
+template <typename _Tp>
+inline constexpr bool __valid_nd_copy_fill_argument = __convertible_to_mdspan<detail::__as_copy_arg_t<_Tp>>;
+
 } // namespace cuda::experimental
 #endif //__CUDAX_ALGORITHM_COMMON
@@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
 
 //! @brief Launches a bytewise memory copy from source to destination into the provided stream.
 //!
-//! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one.
+//! Both source and destination need to either be a `contiguous_range` or launch transform to one.
+//! They can also implicitly convert to `cuda::std::span`, but the type needs to contain `value_type` member alias.
 //! Both source and destination type is required to be trivially copyable.
 //!
 //! This call might be synchronous if either source or destination is pagable host memory.
@@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
 //! @param __src Source to copy from
 //! @param __dst Destination to copy into
 _CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>)
+_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument<_DstTy>)
 void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
 {
   __copy_bytes_impl(
@@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
       detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
 }
 
+//! Primary template: type pairs that are not both `cuda::std::extents` specializations are never compatible.
+template <typename _Extents, typename _OtherExtents>
+inline constexpr bool __copy_bytes_compatible_extents = false;
+
+//! For two `cuda::std::extents` specializations the result is the `::value` of the type returned by
+//! `__detail::__check_compatible_extents`, which is given a flag telling whether both ranks are equal together
+//! with both lists of static extents. The index types do not take part in the check.
+template <typename _IndexType,
+          _CUDA_VSTD::size_t... _Extents,
+          typename _OtherIndexType,
+          _CUDA_VSTD::size_t... _OtherExtents>
+inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>,
+                                                      _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> =
+  decltype(_CUDA_VSTD::__detail::__check_compatible_extents(
+    _CUDA_VSTD::integral_constant<bool, sizeof...(_Extents) == sizeof...(_OtherExtents)>{},
+    _CUDA_VSTD::integer_sequence<size_t, _Extents...>{},
+    _CUDA_VSTD::integer_sequence<size_t, _OtherExtents...>{}))::value;
+
+//! @brief Compares two extents objects dimension by dimension at run time.
+//!
+//! Iterates over the rank of the source; each destination extent is converted to the source index type before
+//! the comparison. The destination is indexed with the same dimension numbers as the source.
+//!
+//! @param __src_exts Extents of the copy source
+//! @param __dst_exts Extents of the copy destination
+//! @return `true` when every compared extent is equal, `false` at the first mismatch
+template <typename _SrcExtents, typename _DstExtents>
+_CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts)
+{
+  using __src_rank_t  = typename _SrcExtents::rank_type;
+  using __src_index_t = typename _SrcExtents::index_type;
+  using __dst_rank_t  = typename _DstExtents::rank_type;
+
+  __src_rank_t __dim = 0;
+  while (__dim < __src_exts.rank())
+  {
+    const __src_index_t __dst_extent =
+      static_cast<__src_index_t>(__dst_exts.extent(static_cast<__dst_rank_t>(__dim)));
+    if (__src_exts.extent(__dim) != __dst_extent)
+    {
+      return false;
+    }
+    __dim++;
+  }
+  return true;
+}
+
+//! @brief Bytewise copy between two mdspans, carried out as one 1d copy over their underlying storage.
+//!
+//! Compile-time requirements: the two extents types must be compatible and the two layout types must be the
+//! same type. At run time the extents are compared dimension by dimension and `__throw_invalid_argument` is
+//! called on a mismatch.
+//!
+//! @param __stream Stream that the copy should be inserted into
+//! @param __src Source mdspan to copy from
+//! @param __dst Destination mdspan to copy into
+template <typename _SrcElem,
+          typename _SrcExtents,
+          typename _SrcLayout,
+          typename _SrcAccessor,
+          typename _DstElem,
+          typename _DstExtents,
+          typename _DstLayout,
+          typename _DstAccessor>
+void __nd_copy_bytes_impl(stream_ref __stream,
+                          _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src,
+                          _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst)
+{
+  static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>,
+                "Multidimensional copy requires both source and destination extents to be compatible");
+  static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>,
+                "Multidimensional copy requires both source and destination layouts to match");
+
+  const bool __same_shape = __copy_bytes_runtime_extents_match(__src.extents(), __dst.extents());
+  if (!__same_shape)
+  {
+    _CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source");
+  }
+
+  // Each mdspan is viewed as a flat span of `required_span_size()` elements starting at its data handle.
+  auto __src_span = _CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size());
+  auto __dst_span = _CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size());
+  __copy_bytes_impl(__stream, __src_span, __dst_span);
+}
+
+//! @brief Launches a bytewise memory copy from a multidimensional source to a multidimensional destination into
+//! the provided stream.
+//!
+//! Both source and destination need to either be an instance of `cuda::std::mdspan` or launch transform to one.
+//! They can also implicitly convert to `cuda::std::mdspan`, but then the type needs to expose the `mdspan`
+//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
+//! Both source and destination types are required to be trivially copyable.
+//!
+//! This call might be synchronous if either source or destination is pageable host memory.
+//! It will be synchronous if both source and destination are located in host memory.
+//!
+//! @param __stream Stream that the copy should be inserted into
+//! @param __src Source to copy from
+//! @param __dst Destination to copy into
+_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
+_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>)
+void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
+{
+  using __src_arg_t = detail::__as_copy_arg_t<_SrcTy>;
+  using __dst_arg_t = detail::__as_copy_arg_t<_DstTy>;
+
+  // Launch-transform both arguments (source first); the results are named so they outlive the conversions below.
+  decltype(auto) __src_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src));
+  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
+
+  // Convert each transformed object to its copy-argument type, then view it as the mdspan that type describes.
+  decltype(auto) __src_as_arg = static_cast<__src_arg_t>(__src_transformed);
+  decltype(auto) __dst_as_arg = static_cast<__dst_arg_t>(__dst_transformed);
+  __nd_copy_bytes_impl(
+    __stream, __as_mdspan_t<__src_arg_t>(__src_as_arg), __as_mdspan_t<__dst_arg_t>(__dst_as_arg));
+}
+
 } // namespace cuda::experimental
 #endif // __CUDAX_ALGORITHM_COPY
@@ -42,15 +42,17 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _
 
 //! @brief Launches an operation to bytewise fill the memory into the provided stream.
 //!
-//! Destination needs to either be a `contiguous_range` or implicitly/launch transform
-//! into one. It can't reside in pagable host memory.
+//! Destination needs to either be a `contiguous_range` or launch transform
+//! into one. It can also implicitly convert to `cuda::std::span`, but it needs to contain `value_type` member alias.
 //! Destination type is required to be trivially copyable.
 //!
+//! Destination can't reside in pageable host memory.
+//!
 //! @param __stream Stream that the copy should be inserted into
 //! @param __dst Destination memory to fill
 //! @param __value Value to fill into every byte in the destination
 _CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>)
+_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_DstTy>)
 void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
 {
   __fill_bytes_impl(__stream,
@@ -59,5 +61,29 @@ void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
                     __value);
 }
 
+//! @brief Launches an operation to bytewise fill multidimensional memory into the provided stream.
+//!
+//! Destination needs to either be an instance of `cuda::std::mdspan` or launch transform into one.
+//! It can also implicitly convert to `cuda::std::mdspan`, but then the type needs to expose the `mdspan` template
+//! arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
+//! Destination type is required to be trivially copyable.
+//!
+//! Destination can't reside in pageable host memory.
+//!
+//! @param __stream Stream that the fill should be inserted into
+//! @param __dst Destination memory to fill
+//! @param __value Value to fill into every byte in the destination
+_CCCL_TEMPLATE(typename _DstTy)
+_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_DstTy>)
+void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
+{
+  using __dst_arg_t = detail::__as_copy_arg_t<_DstTy>;
+
+  // Launch-transform the destination, convert it to its copy-argument type and view that as an mdspan.
+  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
+  decltype(auto) __dst_as_arg      = static_cast<__dst_arg_t>(__dst_transformed);
+  auto __dst_mdspan                = __as_mdspan_t<__dst_arg_t>(__dst_as_arg);
+
+  // The fill covers the flat range of `required_span_size()` elements behind the mdspan's data handle.
+  auto __dst_span = _CUDA_VSTD::span(__dst_mdspan.data_handle(), __dst_mdspan.mapping().required_span_size());
+  __fill_bytes_impl(__stream, __dst_span, __value);
+}
+
 } // namespace cuda::experimental
 #endif // __CUDAX_ALGORITHM_FILL
@@ -43,10 +43,24 @@ void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t p
   }
 }
 
+// Allocates a pinned host buffer of `int` with as many elements as the `Layout` mapping of `extents` requires and
+// sets every byte of it to `value`.
+template <typename Layout = cuda::std::layout_right, typename Extents>
+auto make_buffer_for_mdspan(Extents extents, char value = 0)
+{
+  using mapping_type = typename Layout::template mapping<Extents>;
+  using buffer_type  = cudax::uninitialized_buffer<int, cuda::mr::host_accessible>;
+
+  cuda::mr::pinned_memory_resource host_resource;
+  const mapping_type mapping{extents};
+
+  buffer_type buffer(host_resource, mapping.required_span_size());
+  // The buffer is uninitialized after construction, so give it a known byte pattern.
+  memset(buffer.data(), value, buffer.size_bytes());
+
+  return buffer;
+}
+
 namespace cuda::experimental
 {
 
 // Need a type that goes through all launch_transform steps, but is not a contiguous_range
+template <typename AsKernelArg = cuda::std::span<int>>
 struct weird_buffer
 {
   const cuda::mr::pinned_memory_resource& resource;
@@ -57,7 +71,9 @@ struct weird_buffer
       : resource(res)
       , data((int*) res.allocate(s * sizeof(int)))
       , size(s)
-  {}
+  {
+    // `size` counts ints while memset takes a byte count; the allocation above is `s * sizeof(int)` bytes, so
+    // scale the length to zero the whole buffer instead of only its first `size` bytes.
+    memset(data, 0, size * sizeof(int));
+  }
 
   ~weird_buffer()
   {
@@ -72,22 +88,25 @@ struct weird_buffer
     int* data;
     std::size_t size;
 
-    using __as_kernel_arg = cuda::std::span<int>;
+    using __as_kernel_arg = AsKernelArg;
 
     operator cuda::std::span<int>()
     {
       return {data, size};
     }
+
+    template <typename Extents>
+    operator cuda::std::mdspan<int, Extents>()
+    {
+      return cuda::std::mdspan<int, Extents>{data};
+    }
   };
 
   _CCCL_NODISCARD_FRIEND transform_result __cudax_launch_transform(cuda::stream_ref, const weird_buffer& self) noexcept
   {
     return {self.data, self.size};
   }
 };
-
-static_assert(std::is_same_v<cudax::as_kernel_arg_t<cudax::weird_buffer>, cuda::std::span<int>>);
-
 } // namespace cuda::experimental
 
 #endif // __ALGORITHM_COMMON__
@@ -10,7 +10,7 @@
 
 #include "common.cuh"
 
-TEST_CASE("Copy", "[data_manipulation]")
+TEST_CASE("1d Copy", "[data_manipulation]")
 {
   cudax::stream _stream;
 
@@ -103,3 +103,67 @@ TEST_CASE("Copy", "[data_manipulation]")
     CUDAX_REQUIRE(vec[1] == 0xbeef);
   }
 }
+
+// Copies a two-dimensional source mdspan into a destination mdspan with cudax::copy_bytes and checks the first row.
+// Source bytes are initialised to 1 and destination bytes to 0; row 0 of the source is then overwritten with the
+// column index, which is what row 0 of the destination must contain after the copy.
+template <typename SrcLayout = cuda::std::layout_right,
+          typename DstLayout = SrcLayout,
+          typename SrcExtents,
+          typename DstExtents>
+void test_mdspan_copy_bytes(
+  cudax::stream_ref stream, SrcExtents src_extents = SrcExtents(), DstExtents dst_extents = DstExtents())
+{
+  using src_mdspan_type = cuda::std::mdspan<int, SrcExtents, SrcLayout>;
+  using dst_mdspan_type = cuda::std::mdspan<int, DstExtents, DstLayout>;
+
+  auto src_buffer = make_buffer_for_mdspan<SrcLayout>(src_extents, 1);
+  auto dst_buffer = make_buffer_for_mdspan<DstLayout>(dst_extents, 0);
+
+  src_mdspan_type src(src_buffer.data(), src_extents);
+  dst_mdspan_type dst(dst_buffer.data(), dst_extents);
+
+  const int src_columns = static_cast<int>(src.extent(1));
+  for (int column = 0; column < src_columns; ++column)
+  {
+    src(0, column) = column;
+  }
+
+  // The source is passed as an rvalue and the destination as an lvalue to cover both argument categories.
+  cudax::copy_bytes(stream, std::move(src), dst);
+  stream.wait();
+
+  const int dst_columns = static_cast<int>(dst.extent(1));
+  for (int i = 0; i < dst_columns; ++i)
+  {
+    CUDAX_CHECK(dst(0, i) == i);
+  }
+}
+
+// Exercises the mdspan overload of cudax::copy_bytes with static, dynamic and mixed extents, with both
+// layout_right (default) and layout_left, and with a destination that has to go through launch transform.
+TEST_CASE("Mdspan copy", "[data_manipulation]")
+{
+  cudax::stream stream;
+
+  SECTION("Different extents")
+  {
+    auto static_extents = cuda::std::extents<size_t, 3, 4>();
+    test_mdspan_copy_bytes(stream, static_extents, static_extents);
+    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, static_extents);
+
+    auto dynamic_extents = cuda::std::dextents<size_t, 2>(3, 4);
+    test_mdspan_copy_bytes(stream, dynamic_extents, dynamic_extents);
+    test_mdspan_copy_bytes(stream, static_extents, dynamic_extents);
+    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, dynamic_extents);
+
+    // Same 3x4 shape, but with a different index type and only the first extent dynamic.
+    auto mixed_extents = cuda::std::extents<int, cuda::std::dynamic_extent, 4>(3);
+    test_mdspan_copy_bytes(stream, dynamic_extents, mixed_extents);
+    test_mdspan_copy_bytes(stream, mixed_extents, static_extents);
+    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, mixed_extents, static_extents);
+  }
+
+  SECTION("Launch transform")
+  {
+    // Source has mixed extents 1024 x 1024 x 2 x 2; the destination describes the same shape fully statically.
+    auto mixed_extents =
+      cuda::std::extents<size_t, 1024, cuda::std::dynamic_extent, 2, cuda::std::dynamic_extent>(1024, 2);
+    [[maybe_unused]] auto static_extents = cuda::std::extents<size_t, 1024, 1024, 2, 2>();
+    auto mdspan_buffer                   = make_buffer_for_mdspan(mixed_extents, 1);
+    cuda::std::mdspan<int, decltype(mixed_extents)> mdspan(mdspan_buffer.data(), mixed_extents);
+    cudax::weird_buffer<cuda::std::mdspan<int, decltype(static_extents)>> buffer{
+      cuda::mr::pinned_memory_resource{}, mdspan.mapping().required_span_size()};
+
+    cudax::copy_bytes(stream, mdspan, buffer);
+    stream.wait();
+    // memcmp takes a byte count: compare the whole buffer, not just its first `size()` bytes.
+    CUDAX_REQUIRE(!memcmp(mdspan_buffer.data(), buffer.data, mdspan_buffer.size_bytes()));
+  }
+}
@@ -44,3 +44,32 @@ TEST_CASE("Fill", "[data_manipulation]")
     check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size));
   }
 }
+
+// Exercises the mdspan overload of cudax::fill_bytes with dynamic extents, mixed extents passed as an rvalue, and
+// a destination that has to go through launch transform. Each result is verified with check_result_and_erase.
+TEST_CASE("Mdspan Fill", "[data_manipulation]")
+{
+  cudax::stream stream;
+  {
+    // Fully dynamic 1x2x3 extents, mdspan passed as an lvalue.
+    cuda::std::dextents<size_t, 3> dynamic_extents{1, 2, 3};
+    auto buffer = make_buffer_for_mdspan(dynamic_extents, 0);
+    cuda::std::mdspan<int, decltype(dynamic_extents)> dynamic_mdspan(buffer.data(), dynamic_extents);
+
+    cudax::fill_bytes(stream, dynamic_mdspan, fill_byte);
+    check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
+  }
+  {
+    // Mixed static/dynamic extents (2 x 1 x 4), mdspan passed as an rvalue.
+    cuda::std::extents<size_t, 2, cuda::std::dynamic_extent, 4> mixed_extents{1};
+    auto buffer = make_buffer_for_mdspan(mixed_extents, 0);
+    cuda::std::mdspan<int, decltype(mixed_extents)> mixed_mdspan(buffer.data(), mixed_extents);
+
+    cudax::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte);
+    check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size()));
+  }
+  {
+    // Fully static extents; the destination is a weird_buffer whose kernel argument type is an mdspan.
+    using static_extents = cuda::std::extents<size_t, 2, 3, 4>;
+    auto size            = cuda::std::layout_left::mapping<static_extents>().required_span_size();
+    cudax::weird_buffer<cuda::std::mdspan<int, static_extents>> buffer(cuda::mr::pinned_memory_resource{}, size);
+
+    cudax::fill_bytes(stream, buffer, fill_byte);
+    check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size));
+  }
+}