diff --git a/cudax/include/cuda/experimental/__algorithm/common.cuh b/cudax/include/cuda/experimental/__algorithm/common.cuh
index 9dd891f7b28..eadb5e50dd5 100644
--- a/cudax/include/cuda/experimental/__algorithm/common.cuh
+++ b/cudax/include/cuda/experimental/__algorithm/common.cuh
@@ -23,15 +23,17 @@
 #include <cuda/std/__concepts/__concept_macros.h>
 #include <cuda/std/__ranges/concepts.h>
+#include <cuda/std/mdspan>
 #include <cuda/std/span>
 
 #include <cuda/experimental/__launch/launch_transform.cuh>
 
 namespace cuda::experimental
 {
+
 #if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES)
 template <typename _Tp>
-concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
+concept __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
 
 #else
 template <typename _Tp, typename = int>
@@ -45,10 +47,28 @@
 inline constexpr bool __convertible_to_span<
   _Tp,
   _CUDA_VSTD::enable_if_t<
     _CUDA_VSTD::is_convertible_v<_Tp, _CUDA_VSTD::span<typename _CUDA_VSTD::decay_t<_Tp>::value_type>>,
     int>> = true;
 
 template <typename _Tp>
-inline constexpr bool __valid_copy_fill_argument =
+inline constexpr bool __valid_1d_copy_fill_argument =
   _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>> || __convertible_to_span<_Tp>;
 #endif
 
+template <typename _Tp, typename _DecayedTp = _CUDA_VSTD::decay_t<_Tp>>
+using __as_mdspan_t =
+  _CUDA_VSTD::mdspan<typename _DecayedTp::value_type,
+                     typename _DecayedTp::extents_type,
+                     typename _DecayedTp::layout_type,
+                     typename _DecayedTp::accessor_type>;
+
+template <typename _Tp, typename = int>
+inline constexpr bool __convertible_to_mdspan = false;
+
+template <typename _Tp>
+inline constexpr bool
+  __convertible_to_mdspan<_Tp, _CUDA_VSTD::enable_if_t<_CUDA_VSTD::is_convertible_v<_Tp, __as_mdspan_t<_Tp>>, int>> =
+    true;
+
+template <typename _Tp>
+inline constexpr bool __valid_nd_copy_fill_argument = __convertible_to_mdspan<detail::__as_copy_arg_t<_Tp>>;
+
 } // namespace cuda::experimental
 #endif //__CUDAX_ALGORITHM_COMMON
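For orientation, here is a minimal sketch of the kind of user-side type these traits accept. The `host_matrix` name and its members are hypothetical, not part of this change: a type qualifies as an n-d copy/fill argument when it exposes the four `mdspan` template arguments as member aliases and implicitly converts to the `cuda::std::mdspan` instantiation that `__as_mdspan_t` spells out from them.

```cpp
#include <cuda/std/mdspan>

// Hypothetical user type that would satisfy __valid_nd_copy_fill_argument.
struct host_matrix
{
  // The four aliases __as_mdspan_t<host_matrix> reads to form the target mdspan type.
  using value_type    = float;
  using extents_type  = cuda::std::dextents<size_t, 2>;
  using layout_type   = cuda::std::layout_right;
  using accessor_type = cuda::std::default_accessor<float>;

  float* data;
  extents_type extents;

  // This implicit conversion is what __convertible_to_mdspan detects.
  operator cuda::std::mdspan<value_type, extents_type, layout_type, accessor_type>() const
  {
    return {data, extents};
  }
};
```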
diff --git a/cudax/include/cuda/experimental/__algorithm/copy.cuh b/cudax/include/cuda/experimental/__algorithm/copy.cuh
index 9cb5cf99a0a..e2c7c73d51a 100644
--- a/cudax/include/cuda/experimental/__algorithm/copy.cuh
+++ b/cudax/include/cuda/experimental/__algorithm/copy.cuh
@@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUDA_VSTD::span<_DstTy> __dst)
 
 //! @brief Launches a bytewise memory copy from source to destination into the provided stream.
 //!
-//! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one.
+//! Both source and destination need to either be a `contiguous_range` or launch transform to one.
+//! They can also implicitly convert to `cuda::std::span`, but then the type needs to contain a `value_type` member alias.
 //! Both source and destination type is required to be trivially copyable.
 //!
 //! This call might be synchronous if either source or destination is pagable host memory.
@@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUDA_VSTD::span<_DstTy> __dst)
 //! @param __src Source to copy from
 //! @param __dst Destination to copy into
 _CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
-_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>)
+_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument<_DstTy>)
 void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
 {
   __copy_bytes_impl(
@@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
     detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
 }
 
+template <typename _SrcExtents, typename _DstExtents>
+inline constexpr bool __copy_bytes_compatible_extents = false;
+
+template <typename _IndexType, size_t... _Extents, typename _OtherIndexType, size_t... _OtherExtents>
+inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>,
+                                                      _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> =
+  decltype(_CUDA_VSTD::__detail::__check_compatible_extents(
+    _CUDA_VSTD::integral_constant<bool, sizeof...(_Extents) == sizeof...(_OtherExtents)>{},
+    _CUDA_VSTD::integer_sequence<size_t, _Extents...>{},
+    _CUDA_VSTD::integer_sequence<size_t, _OtherExtents...>{}))::value;
+
+template <typename _SrcExtents, typename _DstExtents>
+_CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts)
+{
+  for (typename _SrcExtents::rank_type __i = 0; __i < __src_exts.rank(); __i++)
+  {
+    if (__src_exts.extent(__i)
+        != static_cast<typename _SrcExtents::index_type>(
+          __dst_exts.extent((static_cast<typename _DstExtents::rank_type>(__i)))))
+    {
+      return false;
+    }
+  }
+  return true;
+}
+
+template <typename _SrcElem,
+          typename _SrcExtents,
+          typename _SrcLayout,
+          typename _SrcAccessor,
+          typename _DstElem,
+          typename _DstExtents,
+          typename _DstLayout,
+          typename _DstAccessor>
+void __nd_copy_bytes_impl(stream_ref __stream,
+                          _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src,
+                          _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst)
+{
+  static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>,
+                "Multidimensional copy requires both source and destination extents to be compatible");
+  static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>,
+                "Multidimensional copy requires both source and destination layouts to match");
+
+  if (!__copy_bytes_runtime_extents_match(__src.extents(), __dst.extents()))
+  {
+    _CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source");
+  }
+
+  __copy_bytes_impl(__stream,
+                    _CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size()),
+                    _CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size()));
+}
+
+//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
+//!
+//! Both source and destination need to either be an instance of `cuda::std::mdspan` or launch transform to
+//! one. They can also implicitly convert to `cuda::std::mdspan`, but then the type needs to provide the `mdspan`
+//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
+//! Both source and destination types are required to be trivially copyable.
+//!
+//! This call might be synchronous if either source or destination is pageable host memory.
+//! It will be synchronous if both source and destination are located in host memory.
+//!
+//! @param __stream Stream that the copy should be inserted into
+//! @param __src Source to copy from
+//! @param __dst Destination to copy into
+_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
+_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>)
+void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
+{
+  decltype(auto) __src_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src));
+  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
+  decltype(auto) __src_as_arg = static_cast<detail::__as_copy_arg_t<_SrcTy>>(__src_transformed);
+  decltype(auto) __dst_as_arg = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
+  __nd_copy_bytes_impl(
+    __stream, __as_mdspan_t<decltype(__src_as_arg)>(__src_as_arg), __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg));
+}
+
 } // namespace cuda::experimental
 
 #endif // __CUDAX_ALGORITHM_COPY
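A usage sketch of the new overload, assuming cudax's umbrella headers for include paths (names here are illustrative, not from the change): both sides are `cuda::std::mdspan`s with the same (default `layout_right`) layout, the static shapes are compatible, and the runtime shapes match, so the copy is enqueued bytewise on the stream.

```cpp
#include <cuda/std/mdspan>

#include <cuda/experimental/algorithm.cuh>
#include <cuda/experimental/stream.cuh>

namespace cudax = cuda::experimental;

void copy_example(cudax::stream_ref stream, int* src_mem, int* dst_mem)
{
  // Static 3x4 source, dynamic 3x4 destination: extents are statically
  // compatible and equal at runtime, so the shape checks pass.
  cuda::std::mdspan src(src_mem, cuda::std::extents<size_t, 3, 4>());
  cuda::std::mdspan dst(dst_mem, cuda::std::dextents<size_t, 2>(3, 4));

  cudax::copy_bytes(stream, src, dst); // copies 12 ints bytewise on the stream
  stream.wait();
}
```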
diff --git a/cudax/include/cuda/experimental/__algorithm/fill.cuh b/cudax/include/cuda/experimental/__algorithm/fill.cuh
index aeb54235c78..cc7ddc61382 100644
--- a/cudax/include/cuda/experimental/__algorithm/fill.cuh
+++ b/cudax/include/cuda/experimental/__algorithm/fill.cuh
@@ -42,15 +42,17 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> __dst, uint8_t __value)
 
 //! @brief Launches an operation to bytewise fill the memory into the provided stream.
 //!
-//! Destination needs to either be a `contiguous_range` or implicitly/launch transform
-//! into one. It can't reside in pagable host memory.
+//! Destination needs to either be a `contiguous_range` or launch transform
+//! into one. It can also implicitly convert to `cuda::std::span`, but then it needs to contain a `value_type` member alias.
 //! Destination type is required to be trivially copyable.
 //!
+//! Destination can't reside in pageable host memory.
+//!
 //! @param __stream Stream that the copy should be inserted into
 //! @param __dst Destination memory to fill
 //! @param __value Value to fill into every byte in the destination
 _CCCL_TEMPLATE(typename _DstTy)
-_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>)
+_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_DstTy>)
 void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
 {
   __fill_bytes_impl(__stream,
@@ -59,5 +61,29 @@ void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
                     __value);
 }
 
+//! @brief Launches an operation to bytewise fill the memory into the provided stream.
+//!
+//! Destination needs to either be an instance of `cuda::std::mdspan` or launch transform
+//! into one. It can also implicitly convert to `cuda::std::mdspan`, but then the type needs to provide the `mdspan`
+//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
+//! Destination type is required to be trivially copyable.
+//!
+//! Destination can't reside in pageable host memory.
+//!
+//! @param __stream Stream that the copy should be inserted into
+//! @param __dst Destination memory to fill
+//! @param __value Value to fill into every byte in the destination
+_CCCL_TEMPLATE(typename _DstTy)
+_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_DstTy>)
+void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
+{
+  decltype(auto) __dst_transformed = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
+  decltype(auto) __dst_as_arg      = static_cast<detail::__as_copy_arg_t<_DstTy>>(__dst_transformed);
+  auto __dst_mdspan                = __as_mdspan_t<decltype(__dst_as_arg)>(__dst_as_arg);
+
+  __fill_bytes_impl(
+    __stream, _CUDA_VSTD::span(__dst_mdspan.data_handle(), __dst_mdspan.mapping().required_span_size()), __value);
+}
+
 } // namespace cuda::experimental
 #endif // __CUDAX_ALGORITHM_FILL
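A matching sketch for the n-d `fill_bytes` (again illustrative names; the pinned allocation mirrors what the tests use, since the destination must not be pageable host memory):

```cpp
#include <cuda/memory_resource>
#include <cuda/std/mdspan>

#include <cuda/experimental/algorithm.cuh>

namespace cudax = cuda::experimental;

void fill_example(cudax::stream_ref stream)
{
  cuda::mr::pinned_memory_resource resource;

  // A 2x3 view over pinned host memory; pageable memory is not allowed here.
  auto* data = static_cast<int*>(resource.allocate(6 * sizeof(int)));
  cuda::std::mdspan view(data, cuda::std::extents<size_t, 2, 3>());

  cudax::fill_bytes(stream, view, 0xab); // every byte of the view becomes 0xab
  stream.wait();

  resource.deallocate(data, 6 * sizeof(int));
}
```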
diff --git a/cudax/test/algorithm/common.cuh b/cudax/test/algorithm/common.cuh
index 2789a1f4802..4b262966190 100644
--- a/cudax/test/algorithm/common.cuh
+++ b/cudax/test/algorithm/common.cuh
@@ -43,10 +43,24 @@ void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t pattern = fill_byte)
   }
 }
 
+template <typename Layout = cuda::std::layout_right, typename Extents>
+auto make_buffer_for_mdspan(Extents extents, char value = 0)
+{
+  cuda::mr::pinned_memory_resource host_resource;
+  auto mapping = typename Layout::template mapping<decltype(extents)>{extents};
+
+  cudax::uninitialized_buffer<int, cuda::mr::host_accessible> buffer(host_resource, mapping.required_span_size());
+
+  memset(buffer.data(), value, buffer.size_bytes());
+
+  return buffer;
+}
+
 namespace cuda::experimental
 {
 
 // Need a type that goes through all launch_transform steps, but is not a contiguous_range
+template <typename AsKernelArg = cuda::std::span<int>>
 struct weird_buffer
 {
   const cuda::mr::pinned_memory_resource& resource;
@@ -57,7 +71,9 @@ struct weird_buffer
       : resource(res)
       , data((int*) res.allocate(s * sizeof(int)))
      , size(s)
-  {}
+  {
+    memset(data, 0, size * sizeof(int));
+  }
 
   ~weird_buffer()
   {
@@ -72,12 +88,18 @@ struct weird_buffer
     int* data;
     std::size_t size;
 
-    using __as_kernel_arg = cuda::std::span<int>;
+    using __as_kernel_arg = AsKernelArg;
 
     operator cuda::std::span<int>()
     {
       return {data, size};
     }
+
+    template <typename T, typename Extents, typename Layout, typename Accessor>
+    operator cuda::std::mdspan<T, Extents, Layout, Accessor>()
+    {
+      return cuda::std::mdspan<T, Extents, Layout, Accessor>{data};
+    }
   };
 
   _CCCL_NODISCARD_FRIEND transform_result __cudax_launch_transform(cuda::stream_ref, const weird_buffer& self) noexcept
@@ -85,9 +107,6 @@ struct weird_buffer
     return {self.data, self.size};
   }
 };
-
-static_assert(std::is_same_v<detail::__as_copy_arg_t<weird_buffer>, cuda::std::span<int>>);
-
 } // namespace cuda::experimental
 
 #endif // __ALGORITHM_COMMON__
diff --git a/cudax/test/algorithm/copy.cu b/cudax/test/algorithm/copy.cu
index 07eabba32e6..3db65e22c51 100644
--- a/cudax/test/algorithm/copy.cu
+++ b/cudax/test/algorithm/copy.cu
@@ -10,7 +10,7 @@
 
 #include "common.cuh"
 
-TEST_CASE("Copy", "[data_manipulation]")
+TEST_CASE("1d Copy", "[data_manipulation]")
 {
   cudax::stream _stream;
 
@@ -103,3 +103,67 @@
     CUDAX_REQUIRE(vec[1] == 0xbeef);
   }
 }
+
+template <typename SrcLayout = cuda::std::layout_right,
+          typename DstLayout = SrcLayout,
+          typename SrcExtents,
+          typename DstExtents>
+void test_mdspan_copy_bytes(
+  cudax::stream_ref stream, SrcExtents src_extents = SrcExtents(), DstExtents dst_extents = DstExtents())
+{
+  auto src_buffer = make_buffer_for_mdspan<SrcLayout>(src_extents, 1);
+  auto dst_buffer = make_buffer_for_mdspan<DstLayout>(dst_extents, 0);
+
+  cuda::std::mdspan<int, SrcExtents, SrcLayout> src(src_buffer.data(), src_extents);
+  cuda::std::mdspan<int, DstExtents, DstLayout> dst(dst_buffer.data(), dst_extents);
+
+  for (int i = 0; i < static_cast<int>(src.extent(1)); i++)
+  {
+    src(0, i) = i;
+  }
+
+  cudax::copy_bytes(stream, std::move(src), dst);
+  stream.wait();
+
+  for (int i = 0; i < static_cast<int>(dst.extent(1)); i++)
+  {
+    CUDAX_CHECK(dst(0, i) == i);
+  }
+}
+
+TEST_CASE("Mdspan copy", "[data_manipulation]")
+{
+  cudax::stream stream;
+
+  SECTION("Different extents")
+  {
+    auto static_extents = cuda::std::extents<size_t, 3, 4>();
+    test_mdspan_copy_bytes(stream, static_extents, static_extents);
+    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, static_extents);
+
+    auto dynamic_extents = cuda::std::dextents<size_t, 2>(3, 4);
+    test_mdspan_copy_bytes(stream, dynamic_extents, dynamic_extents);
+    test_mdspan_copy_bytes(stream, static_extents, dynamic_extents);
+    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, dynamic_extents);
+
+    auto mixed_extents = cuda::std::extents<size_t, cuda::std::dynamic_extent, 4>(3);
+    test_mdspan_copy_bytes(stream, dynamic_extents, mixed_extents);
+    test_mdspan_copy_bytes(stream, mixed_extents, static_extents);
+    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, mixed_extents, static_extents);
+  }
+
+  SECTION("Launch transform")
+  {
+    auto mixed_extents =
+      cuda::std::extents<size_t, cuda::std::dynamic_extent, cuda::std::dynamic_extent>(1024, 2);
+    [[maybe_unused]] auto static_extents = cuda::std::extents<size_t, 1024, 2>();
+    auto mdspan_buffer = make_buffer_for_mdspan(mixed_extents, 1);
+    cuda::std::mdspan<int, decltype(mixed_extents)> mdspan(mdspan_buffer.data(), mixed_extents);
+    cudax::weird_buffer<cuda::std::mdspan<int, decltype(static_extents)>> buffer{
+      cuda::mr::pinned_memory_resource{}, mdspan.mapping().required_span_size()};
+
+    cudax::copy_bytes(stream, mdspan, buffer);
+    stream.wait();
+    CUDAX_REQUIRE(!memcmp(mdspan_buffer.data(), buffer.data, mdspan_buffer.size()));
+  }
+}
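One behavior the tests above exercise is worth spelling out: shape checking is split between compile time and run time. Statically visible extent mismatches are rejected by the `static_assert` in `__nd_copy_bytes_impl`, while dynamic extents are compared when the call runs. A sketch of the runtime failure mode (illustrative names):

```cpp
void mismatched_shapes(cudax::stream_ref stream, int* src_mem, int* dst_mem)
{
  // Ranks match and both shapes are fully dynamic, so this compiles...
  cuda::std::mdspan src(src_mem, cuda::std::dextents<size_t, 2>(3, 4));
  cuda::std::mdspan dst(dst_mem, cuda::std::dextents<size_t, 2>(5, 4));

  // ...but the runtime extent comparison fails and copy_bytes throws
  // std::invalid_argument ("Copy destination size differs from the source").
  cudax::copy_bytes(stream, src, dst);
}
```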
diff --git a/cudax/test/algorithm/fill.cu b/cudax/test/algorithm/fill.cu
index 7111aa848f3..ce733871f51 100644
--- a/cudax/test/algorithm/fill.cu
+++ b/cudax/test/algorithm/fill.cu
@@ -44,3 +44,32 @@ TEST_CASE("Fill", "[data_manipulation]")
     check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size));
   }
 }
+
+TEST_CASE("Mdspan Fill", "[data_manipulation]")
+{
+  cudax::stream stream;
+  {
+    cuda::std::dextents<size_t, 3> dynamic_extents{1, 2, 3};
+    auto buffer = make_buffer_for_mdspan(dynamic_extents, 0);
+    cuda::std::mdspan<int, decltype(dynamic_extents)> dynamic_mdspan(buffer.data(), dynamic_extents);
+
+    cudax::fill_bytes(stream, dynamic_mdspan, fill_byte);
+    check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
+  }
+  {
+    cuda::std::extents<size_t, 2, cuda::std::dynamic_extent> mixed_extents{1};
+    auto buffer = make_buffer_for_mdspan(mixed_extents, 0);
+    cuda::std::mdspan<int, decltype(mixed_extents)> mixed_mdspan(buffer.data(), mixed_extents);
+
+    cudax::fill_bytes(stream, cuda::std::move(mixed_mdspan), fill_byte);
+    check_result_and_erase(stream, cuda::std::span(buffer.data(), buffer.size()));
+  }
+  {
+    using static_extents = cuda::std::extents<size_t, 2, 3>;
+    auto size            = cuda::std::layout_left::mapping<static_extents>().required_span_size();
+    cudax::weird_buffer<cuda::std::mdspan<int, static_extents, cuda::std::layout_left>> buffer(
+      cuda::mr::pinned_memory_resource{}, size);
+
+    cudax::fill_bytes(stream, buffer, fill_byte);
+    check_result_and_erase(stream, cuda::std::span(buffer.data, buffer.size));
+  }
+}