Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[CUDAX] Add copy_bytes and fill_bytes overloads for mdspan #2932

Merged
merged 9 commits into from
Nov 25, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 22 additions & 2 deletions cudax/include/cuda/experimental/__algorithm/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,17 @@

#include <cuda/std/__ranges/concepts.h>
#include <cuda/std/__type_traits/is_convertible.h>
#include <cuda/std/mdspan>
#include <cuda/std/span>

#include <cuda/experimental/__launch/launch_transform.cuh>

namespace cuda::experimental
{

#if _CCCL_STD_VER >= 2020 && defined(_CCCL_SPAN_USES_RANGES)
template <typename _Tp>
concept __valid_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;
concept __valid_1d_copy_fill_argument = _CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>>;

#else
template <typename _Tp, typename = int>
Expand All @@ -45,10 +47,28 @@ inline constexpr bool __convertible_to_span<
int>> = true;

template <typename _Tp>
inline constexpr bool __valid_copy_fill_argument =
inline constexpr bool __valid_1d_copy_fill_argument =
_CUDA_VRANGES::contiguous_range<detail::__as_copy_arg_t<_Tp>> || __convertible_to_span<_Tp>;

#endif

//! Names the `cuda::std::mdspan` specialization described by the member aliases of the decayed `_Tp`:
//! `value_type`, `extents_type`, `layout_type` and `accessor_type`.
//! `_Tp` may be cv/ref-qualified; the aliases are looked up on `decay_t<_Tp>`.
//! Naming this alias for a type that lacks any of those members is a substitution failure.
//! NOTE(review): the element type is taken from `value_type`, not `element_type`; for an `mdspan<const T, ...>`
//! that presumably drops the `const` — confirm whether const-element sources are meant to be accepted.
template <typename _Tp, typename _Decayed = _CUDA_VSTD::decay_t<_Tp>>
using __as_mdspan_t =
_CUDA_VSTD::mdspan<typename _Decayed::value_type,
typename _Decayed::extents_type,
typename _Decayed::layout_type,
typename _Decayed::accessor_type>;

//! Detects whether `_Tp` is implicitly convertible to the mdspan type named by `__as_mdspan_t<_Tp>`.
//! Primary template: selected when the specialization below is not viable, either because `__as_mdspan_t<_Tp>`
//! cannot be formed or because the conversion does not exist.
template <typename _Tp, typename = int>
inline constexpr bool __convertible_to_mdspan = false;

//! Specialization chosen when `is_convertible_v<_Tp, __as_mdspan_t<_Tp>>` holds; the `enable_if_t<..., int>`
//! matches the `int` default of the primary template's second parameter.
template <typename _Tp>
inline constexpr bool
__convertible_to_mdspan<_Tp, _CUDA_VSTD::enable_if_t<_CUDA_VSTD::is_convertible_v<_Tp, __as_mdspan_t<_Tp>>, int>> =
true;

//! Constraint used by the multidimensional `copy_bytes`/`fill_bytes` overloads: true when the type that `_Tp`
//! becomes as a copy argument (`detail::__as_copy_arg_t<_Tp>`) is convertible to its corresponding mdspan.
template <typename _Tp>
inline constexpr bool __valid_nd_copy_fill_argument = __convertible_to_mdspan<detail::__as_copy_arg_t<_Tp>>;

} // namespace cuda::experimental
#endif //__CUDAX_ALGORITHM_COMMON
86 changes: 84 additions & 2 deletions cudax/include/cuda/experimental/__algorithm/copy.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,8 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD

//! @brief Launches a bytewise memory copy from source to destination into the provided stream.
//!
//! Both source and destination needs to either be a `contiguous_range` or implicitly/launch transform to one.
//! Both source and destination need to either be a `contiguous_range` or launch transform to one.
//! They can also implicitly convert to `cuda::std::span`, but the type needs to contain a `value_type` member alias.
//! Both source and destination types are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
Expand All @@ -63,7 +64,7 @@ void __copy_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_SrcTy> __src, _CUD
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
_CCCL_REQUIRES(__valid_copy_fill_argument<_SrcTy> _CCCL_AND __valid_copy_fill_argument<_DstTy>)
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_SrcTy> _CCCL_AND __valid_1d_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
__copy_bytes_impl(
Expand All @@ -74,5 +75,86 @@ void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst)))));
}

//! Compile-time check that two extents types may describe the same shape.
//! Primary template: anything that is not a pair of `cuda::std::extents` specializations is incompatible.
template <typename _Extents, typename _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents = false;

//! Specialization for two `cuda::std::extents` types. The index types are ignored; only the rank and the static
//! extents are forwarded to `__check_compatible_extents`.
//! NOTE(review): that helper presumably accepts a pair of extents when they are equal or at least one of them is
//! dynamic — confirm against the mdspan implementation.
template <typename _IndexType,
          _CUDA_VSTD::size_t... _Extents,
          typename _OtherIndexType,
          _CUDA_VSTD::size_t... _OtherExtents>
inline constexpr bool __copy_bytes_compatible_extents<_CUDA_VSTD::extents<_IndexType, _Extents...>,
                                                      _CUDA_VSTD::extents<_OtherIndexType, _OtherExtents...>> =
  decltype(_CUDA_VSTD::__detail::__check_compatible_extents(
    _CUDA_VSTD::integral_constant<bool, sizeof...(_Extents) == sizeof...(_OtherExtents)>{},
    // Qualified like the template parameters above so the sequence type does not depend on a global `size_t`.
    _CUDA_VSTD::integer_sequence<_CUDA_VSTD::size_t, _Extents...>{},
    _CUDA_VSTD::integer_sequence<_CUDA_VSTD::size_t, _OtherExtents...>{}))::value;

//! @brief Checks at run time that two extents objects describe exactly the same shape.
//!
//! Only the dimensions of `__src_exts` are visited, so both arguments are expected to have the same rank; the caller
//! in this file enforces that at compile time before calling.
//!
//! Each pair of extents is compared as `size_t` rather than in the source index type. Converting the destination
//! extent to a narrower (or differently signed) source `index_type` first could truncate a larger destination extent
//! into a value that falsely compares equal.
//!
//! @param __src_exts Extents of the copy source
//! @param __dst_exts Extents of the copy destination
//! @return true if every dimension has the same extent in both arguments, false otherwise
template <typename _SrcExtents, typename _DstExtents>
_CCCL_NODISCARD bool __copy_bytes_runtime_extents_match(_SrcExtents __src_exts, _DstExtents __dst_exts)
{
  using __src_rank_t = typename _SrcExtents::rank_type;
  using __dst_rank_t = typename _DstExtents::rank_type;

  for (__src_rank_t __i = 0; __i < __src_exts.rank(); ++__i)
  {
    // Extents are never negative, so widening both sides to size_t is lossless.
    const auto __src_extent = static_cast<_CUDA_VSTD::size_t>(__src_exts.extent(__i));
    const auto __dst_extent = static_cast<_CUDA_VSTD::size_t>(__dst_exts.extent(static_cast<__dst_rank_t>(__i)));
    if (__src_extent != __dst_extent)
    {
      return false;
    }
  }
  return true;
}

//! @brief Validates a pair of mdspans and forwards their underlying storage to the span-based copy.
//!
//! Compile time: the extents types must be compatible and the layout types identical.
//! Run time: every extent must match, otherwise an invalid-argument error is raised.
//! The copy itself covers `mapping().required_span_size()` elements starting at `data_handle()` on each side.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __src Source mdspan to copy from
//! @param __dst Destination mdspan to copy into
template <typename _SrcElem,
          typename _SrcExtents,
          typename _SrcLayout,
          typename _SrcAccessor,
          typename _DstElem,
          typename _DstExtents,
          typename _DstLayout,
          typename _DstAccessor>
void __nd_copy_bytes_impl(stream_ref __stream,
                          _CUDA_VSTD::mdspan<_SrcElem, _SrcExtents, _SrcLayout, _SrcAccessor> __src,
                          _CUDA_VSTD::mdspan<_DstElem, _DstExtents, _DstLayout, _DstAccessor> __dst)
{
  static_assert(__copy_bytes_compatible_extents<_SrcExtents, _DstExtents>,
                "Multidimensional copy requires both source and destination extents to be compatible");
  static_assert(_CUDA_VSTD::is_same_v<_SrcLayout, _DstLayout>,
                "Multidimensional copy requires both source and destination layouts to match");

  // Dynamic extents can still disagree even when the static parts are compatible.
  const bool __same_shape = __copy_bytes_runtime_extents_match(__src.extents(), __dst.extents());
  if (!__same_shape)
  {
    _CUDA_VSTD::__throw_invalid_argument("Copy destination size differs from the source");
  }

  // View each mdspan's storage as a flat span and let the 1d implementation do the transfer.
  auto __src_flat = _CUDA_VSTD::span(__src.data_handle(), __src.mapping().required_span_size());
  auto __dst_flat = _CUDA_VSTD::span(__dst.data_handle(), __dst.mapping().required_span_size());
  __copy_bytes_impl(__stream, __src_flat, __dst_flat);
}

//! @brief Launches a bytewise memory copy from a multidimensional source to a multidimensional destination into the
//! provided stream.
//!
//! Both source and destination need to either be an instance of `cuda::std::mdspan` or launch transform to one.
//! They can also implicitly convert to `cuda::std::mdspan`, in which case the type needs to expose the `mdspan`
//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
//! Both source and destination types are required to be trivially copyable.
//!
//! This call might be synchronous if either source or destination is pageable host memory.
//! It will be synchronous if both source and destination are located in host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __src Source to copy from
//! @param __dst Destination to copy into
_CCCL_TEMPLATE(typename _SrcTy, typename _DstTy)
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_SrcTy> _CCCL_AND __valid_nd_copy_fill_argument<_DstTy>)
void copy_bytes(stream_ref __stream, _SrcTy&& __src, _DstTy&& __dst)
{
  using __src_arg_t = detail::__as_copy_arg_t<_SrcTy>;
  using __dst_arg_t = detail::__as_copy_arg_t<_DstTy>;

  // Keep the launch-transformed objects in named locals for the whole call.
  decltype(auto) __src_launched = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_SrcTy>(__src));
  decltype(auto) __dst_launched = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));

  decltype(auto) __src_arg = static_cast<__src_arg_t>(__src_launched);
  decltype(auto) __dst_arg = static_cast<__dst_arg_t>(__dst_launched);

  // Convert both arguments to their mdspan form and hand them to the checked implementation.
  __nd_copy_bytes_impl(__stream, __as_mdspan_t<__src_arg_t>(__src_arg), __as_mdspan_t<__dst_arg_t>(__dst_arg));
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_COPY
32 changes: 29 additions & 3 deletions cudax/include/cuda/experimental/__algorithm/fill.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -42,15 +42,17 @@ void __fill_bytes_impl(stream_ref __stream, _CUDA_VSTD::span<_DstTy, _DstSize> _

//! @brief Launches an operation to bytewise fill the memory into the provided stream.
//!
//! Destination needs to either be a `contiguous_range` or implicitly/launch transform
//! into one. It can't reside in pagable host memory.
//! Destination needs to either be a `contiguous_range` or launch transform
//! into one. It can also implicitly convert to `cuda::std::span`, but it needs to contain a `value_type` member alias.
//! Destination type is required to be trivially copyable.
//!
//! Destination can't reside in pageable host memory.
//!
//! @param __stream Stream that the copy should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_CCCL_TEMPLATE(typename _DstTy)
_CCCL_REQUIRES(__valid_copy_fill_argument<_DstTy>)
_CCCL_REQUIRES(__valid_1d_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
__fill_bytes_impl(__stream,
Expand All @@ -59,5 +61,29 @@ void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
__value);
}

//! @brief Launches an operation to bytewise fill multidimensional memory into the provided stream.
//!
//! Destination needs to either be an instance of `cuda::std::mdspan` or launch transform into one.
//! It can also implicitly convert to `cuda::std::mdspan`, in which case the type needs to expose the `mdspan`
//! template arguments as member aliases named `value_type`, `extents_type`, `layout_type` and `accessor_type`.
//! Destination type is required to be trivially copyable.
//!
//! Destination can't reside in pageable host memory.
//!
//! @param __stream Stream that the fill should be inserted into
//! @param __dst Destination memory to fill
//! @param __value Value to fill into every byte in the destination
_CCCL_TEMPLATE(typename _DstTy)
_CCCL_REQUIRES(__valid_nd_copy_fill_argument<_DstTy>)
void fill_bytes(stream_ref __stream, _DstTy&& __dst, uint8_t __value)
{
  using __dst_arg_t = detail::__as_copy_arg_t<_DstTy>;

  decltype(auto) __dst_launched = detail::__launch_transform(__stream, _CUDA_VSTD::forward<_DstTy>(__dst));
  decltype(auto) __dst_arg      = static_cast<__dst_arg_t>(__dst_launched);
  auto __dst_view               = __as_mdspan_t<__dst_arg_t>(__dst_arg);

  // The fill covers the whole span required by the mapping, starting at the mdspan's data handle.
  auto __dst_flat = _CUDA_VSTD::span(__dst_view.data_handle(), __dst_view.mapping().required_span_size());
  __fill_bytes_impl(__stream, __dst_flat, __value);
}

} // namespace cuda::experimental
#endif // __CUDAX_ALGORITHM_FILL
29 changes: 24 additions & 5 deletions cudax/test/algorithm/common.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -43,10 +43,24 @@ void check_result_and_erase(cudax::stream_ref stream, Result&& result, uint8_t p
}
}

// Allocates an int buffer from a pinned memory resource, sized to back an mdspan with the given extents
// under `Layout`, and sets every byte of it to `value`.
template <typename Layout = cuda::std::layout_right, typename Extents>
auto make_buffer_for_mdspan(Extents extents, char value = 0)
{
  using mapping_type = typename Layout::template mapping<Extents>;

  cuda::mr::pinned_memory_resource host_resource;

  // The mapping determines how many elements the layout needs for these extents.
  const mapping_type layout_mapping{extents};
  cudax::uninitialized_buffer<int, cuda::mr::host_accessible> storage(
    host_resource, layout_mapping.required_span_size());

  memset(storage.data(), value, storage.size_bytes());
  return storage;
}

namespace cuda::experimental
{

// Need a type that goes through all launch_transform steps, but is not a contiguous_range
template <typename AsKernelArg = cuda::std::span<int>>
struct weird_buffer
{
const cuda::mr::pinned_memory_resource& resource;
Expand All @@ -57,7 +71,9 @@ struct weird_buffer
: resource(res)
, data((int*) res.allocate(s * sizeof(int)))
, size(s)
{}
{
memset(data, 0, size);
}

~weird_buffer()
{
Expand All @@ -72,22 +88,25 @@ struct weird_buffer
int* data;
std::size_t size;

using __as_kernel_arg = cuda::std::span<int>;
using __as_kernel_arg = AsKernelArg;

operator cuda::std::span<int>()
{
return {data, size};
}

template <typename Extents>
operator cuda::std::mdspan<int, Extents>()
{
return cuda::std::mdspan<int, Extents>{data};
}
};

_CCCL_NODISCARD_FRIEND transform_result __cudax_launch_transform(cuda::stream_ref, const weird_buffer& self) noexcept
{
return {self.data, self.size};
}
};

static_assert(std::is_same_v<cudax::as_kernel_arg_t<cudax::weird_buffer>, cuda::std::span<int>>);

} // namespace cuda::experimental

#endif // __ALGORITHM_COMMON__
66 changes: 65 additions & 1 deletion cudax/test/algorithm/copy.cu
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@

#include "common.cuh"

TEST_CASE("Copy", "[data_manipulation]")
TEST_CASE("1d Copy", "[data_manipulation]")
{
cudax::stream _stream;

Expand Down Expand Up @@ -103,3 +103,67 @@ TEST_CASE("Copy", "[data_manipulation]")
CUDAX_REQUIRE(vec[1] == 0xbeef);
}
}

// Copies between two freshly allocated 2d mdspans with the given extents/layouts and checks that the
// first row (index 0 in the first dimension) arrived intact in the destination.
template <typename SrcLayout = cuda::std::layout_right,
          typename DstLayout = SrcLayout,
          typename SrcExtents,
          typename DstExtents>
void test_mdspan_copy_bytes(
  cudax::stream_ref stream, SrcExtents src_extents = SrcExtents(), DstExtents dst_extents = DstExtents())
{
  using src_view_t = cuda::std::mdspan<int, SrcExtents, SrcLayout>;
  using dst_view_t = cuda::std::mdspan<int, DstExtents, DstLayout>;

  // Source bytes start as 1, destination bytes as 0, so untouched destination memory is distinguishable.
  auto src_buffer = make_buffer_for_mdspan<SrcLayout>(src_extents, 1);
  auto dst_buffer = make_buffer_for_mdspan<DstLayout>(dst_extents, 0);

  src_view_t src(src_buffer.data(), src_extents);
  dst_view_t dst(dst_buffer.data(), dst_extents);

  // Write a recognizable pattern into the first row of the source.
  const int src_columns = static_cast<int>(src.extent(1));
  for (int col = 0; col < src_columns; ++col)
  {
    src(0, col) = col;
  }

  cudax::copy_bytes(stream, std::move(src), dst);
  stream.wait();

  const int dst_columns = static_cast<int>(dst.extent(1));
  for (int col = 0; col < dst_columns; ++col)
  {
    CUDAX_CHECK(dst(0, col) == col);
  }
}

// Exercises the mdspan overload of copy_bytes with static, dynamic and mixed extents, with both
// layout_right and layout_left, and with a destination that has to go through launch transform.
TEST_CASE("Mdspan copy", "[data_manipulation]")
{
  cudax::stream stream;

  SECTION("Different extents")
  {
    auto static_extents = cuda::std::extents<size_t, 3, 4>();
    test_mdspan_copy_bytes(stream, static_extents, static_extents);
    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, static_extents);

    auto dynamic_extents = cuda::std::dextents<size_t, 2>(3, 4);
    test_mdspan_copy_bytes(stream, dynamic_extents, dynamic_extents);
    test_mdspan_copy_bytes(stream, static_extents, dynamic_extents);
    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, static_extents, dynamic_extents);

    // Mixed static/dynamic extents that also use a different index type (int instead of size_t).
    auto mixed_extents = cuda::std::extents<int, cuda::std::dynamic_extent, 4>(3);
    test_mdspan_copy_bytes(stream, dynamic_extents, mixed_extents);
    test_mdspan_copy_bytes(stream, mixed_extents, static_extents);
    test_mdspan_copy_bytes<cuda::std::layout_left>(stream, mixed_extents, static_extents);
  }

  SECTION("Launch transform")
  {
    auto mixed_extents =
      cuda::std::extents<size_t, 1024, cuda::std::dynamic_extent, 2, cuda::std::dynamic_extent>(1024, 2);
    [[maybe_unused]] auto static_extents = cuda::std::extents<size_t, 1024, 1024, 2, 2>();
    auto mdspan_buffer                   = make_buffer_for_mdspan(mixed_extents, 1);
    cuda::std::mdspan<int, decltype(mixed_extents)> mdspan(mdspan_buffer.data(), mixed_extents);
    cudax::weird_buffer<cuda::std::mdspan<int, decltype(static_extents)>> buffer{
      cuda::mr::pinned_memory_resource{}, mdspan.mapping().required_span_size()};

    cudax::copy_bytes(stream, mdspan, buffer);
    stream.wait();
    // memcmp takes a byte count: use size_bytes() so the whole int buffer is verified, not just the
    // first size() bytes of it.
    CUDAX_REQUIRE(!memcmp(mdspan_buffer.data(), buffer.data, mdspan_buffer.size_bytes()));
  }
}
29 changes: 29 additions & 0 deletions cudax/test/algorithm/fill.cu
Original file line number Diff line number Diff line change
Expand Up @@ -44,3 +44,32 @@ TEST_CASE("Fill", "[data_manipulation]")
check_result_and_erase(_stream, cuda::std::span(buffer.data, buffer.size));
}
}

// Exercises the mdspan overload of fill_bytes with dynamic extents, mixed extents, and a destination
// that is not an mdspan itself but converts to one.
TEST_CASE("Mdspan Fill", "[data_manipulation]")
{
  cudax::stream stream;

  // Fully dynamic extents, destination passed as an lvalue.
  {
    using extents_t = cuda::std::dextents<size_t, 3>;
    extents_t shape{1, 2, 3};
    auto storage = make_buffer_for_mdspan(shape, 0);
    cuda::std::mdspan<int, extents_t> view(storage.data(), shape);

    cudax::fill_bytes(stream, view, fill_byte);
    check_result_and_erase(stream, cuda::std::span(storage.data(), storage.size()));
  }

  // Mixed static/dynamic extents, destination passed as an rvalue.
  {
    using extents_t = cuda::std::extents<size_t, 2, cuda::std::dynamic_extent, 4>;
    extents_t shape{1};
    auto storage = make_buffer_for_mdspan(shape, 0);
    cuda::std::mdspan<int, extents_t> view(storage.data(), shape);

    cudax::fill_bytes(stream, cuda::std::move(view), fill_byte);
    check_result_and_erase(stream, cuda::std::span(storage.data(), storage.size()));
  }

  // Fully static extents, destination is a weird_buffer configured with an mdspan kernel-argument type.
  {
    using static_extents     = cuda::std::extents<size_t, 2, 3, 4>;
    const auto element_count = cuda::std::layout_left::mapping<static_extents>().required_span_size();
    cudax::weird_buffer<cuda::std::mdspan<int, static_extents>> target(
      cuda::mr::pinned_memory_resource{}, element_count);

    cudax::fill_bytes(stream, target, fill_byte);
    check_result_and_erase(stream, cuda::std::span(target.data, target.size));
  }
}
Loading