Merge pull request #1976 from IntelPython/reduce-elementwise-extension-size

Reduce elementwise extension size
oleksandr-pavlyk authored Jan 22, 2025
2 parents d9e9bf8 + 1a95394 commit c5cbb08
Showing 7 changed files with 214 additions and 94 deletions.
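The recurring change across these files is mechanical: functor template instantiations that were previously spelled out inline inside each parallel_for call are hoisted into local `using KernelName = ...;` / `using Impl = ...;` aliases declared once per submission. A minimal sketch of the pattern in plain SYCL, using illustrative names (ScaleFunctor, scale_kernel, scale_impl) that are not part of dpctl:

// Sketch only: ScaleFunctor, scale_kernel and scale_impl are illustrative
// names, not dpctl APIs.
#include <cstddef>
#include <sycl/sycl.hpp>

template <typename T> struct ScaleFunctor
{
    const T *in_;
    T *out_;
    T factor_;

    ScaleFunctor(const T *in, T *out, T factor)
        : in_(in), out_(out), factor_(factor)
    {
    }

    void operator()(sycl::id<1> id) const { out_[id] = factor_ * in_[id]; }
};

template <typename T> class scale_kernel;

template <typename T>
sycl::event
scale_impl(sycl::queue &q, const T *in, T *out, std::size_t n, T factor)
{
    return q.submit([&](sycl::handler &cgh) {
        // Before: the full ScaleFunctor<T>(...) spelling appeared inside
        // parallel_for. After: one alias per submission keeps the call short.
        using KernelName = scale_kernel<T>;
        using Impl = ScaleFunctor<T>;

        cgh.parallel_for<KernelName>(sycl::range<1>{n}, Impl(in, out, factor));
    });
}

The alias itself is primarily a readability and maintenance change; the size reduction referenced in the commit title likely comes from the diff as a whole, for example the shared padded-vector helper introduced further down.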
@@ -26,10 +26,11 @@
#include <cstddef>
#include <cstdint>
#include <limits>
#include <sycl/sycl.hpp>
#include <utility>
#include <vector>

#include <sycl/sycl.hpp>

#include "dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_dispatch_building.hpp"
@@ -599,6 +600,10 @@ sycl::event masked_place_all_slices_strided_impl(
sycl::nd_range<2> ndRange{gRange, lRange};

using LocalAccessorT = sycl::local_accessor<indT, 1>;
using Impl =
MaskedPlaceStridedFunctor<TwoZeroOffsets_Indexer, StridedIndexer,
Strided1DCyclicIndexer, dataT, indT,
LocalAccessorT>;

dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
const dataT *rhs_tp = reinterpret_cast<const dataT *>(rhs_p);
@@ -611,13 +616,9 @@ sycl::event masked_place_all_slices_strided_impl(
LocalAccessorT lacc(lacc_size, cgh);

cgh.parallel_for<KernelName>(
ndRange,
MaskedPlaceStridedFunctor<TwoZeroOffsets_Indexer, StridedIndexer,
Strided1DCyclicIndexer, dataT, indT,
LocalAccessorT>(
dst_tp, cumsum_tp, rhs_tp, iteration_size,
orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer,
lacc));
ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, iteration_size,
orthog_dst_rhs_indexer, masked_dst_indexer,
masked_rhs_indexer, lacc));
});

return comp_ev;
@@ -696,6 +697,10 @@ sycl::event masked_place_some_slices_strided_impl(
sycl::nd_range<2> ndRange{gRange, lRange};

using LocalAccessorT = sycl::local_accessor<indT, 1>;
using Impl =
MaskedPlaceStridedFunctor<TwoOffsets_StridedIndexer, StridedIndexer,
Strided1DCyclicIndexer, dataT, indT,
LocalAccessorT>;

dataT *dst_tp = reinterpret_cast<dataT *>(dst_p);
const dataT *rhs_tp = reinterpret_cast<const dataT *>(rhs_p);
@@ -708,13 +713,9 @@ sycl::event masked_place_some_slices_strided_impl(
LocalAccessorT lacc(lacc_size, cgh);

cgh.parallel_for<KernelName>(
ndRange,
MaskedPlaceStridedFunctor<TwoOffsets_StridedIndexer, StridedIndexer,
Strided1DCyclicIndexer, dataT, indT,
LocalAccessorT>(
dst_tp, cumsum_tp, rhs_tp, masked_nelems,
orthog_dst_rhs_indexer, masked_dst_indexer, masked_rhs_indexer,
lacc));
ndRange, Impl(dst_tp, cumsum_tp, rhs_tp, masked_nelems,
orthog_dst_rhs_indexer, masked_dst_indexer,
masked_rhs_indexer, lacc));
});

return comp_ev;
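The two masked_place hunks above apply the same alias to a functor that also carries a sycl::local_accessor; the accessor is still constructed inside the handler and handed to the functor's constructor. A stripped-down sketch of that shape, with illustrative names (CopyViaLocalFunctor, copy_via_local) that are not dpctl's, assuming n is a multiple of the work-group size wg:

#include <cstddef>
#include <sycl/sycl.hpp>

// Functor that receives a local accessor, mirroring how
// MaskedPlaceStridedFunctor takes its LocalAccessorT parameter.
template <typename T, typename LocalAccessorT> struct CopyViaLocalFunctor
{
    const T *src_;
    T *dst_;
    LocalAccessorT lacc_;

    CopyViaLocalFunctor(const T *src, T *dst, const LocalAccessorT &lacc)
        : src_(src), dst_(dst), lacc_(lacc)
    {
    }

    void operator()(sycl::nd_item<1> it) const
    {
        const std::size_t gid = it.get_global_id(0);
        const std::size_t lid = it.get_local_id(0);
        // Stage through local memory, then write out.
        lacc_[lid] = src_[gid];
        sycl::group_barrier(it.get_group());
        dst_[gid] = lacc_[lid];
    }
};

template <typename T> class copy_via_local_krn;

template <typename T>
sycl::event copy_via_local(sycl::queue &q, const T *src, T *dst,
                           std::size_t n, std::size_t wg)
{
    return q.submit([&](sycl::handler &cgh) {
        using LocalAccessorT = sycl::local_accessor<T, 1>;
        using KernelName = copy_via_local_krn<T>;
        using Impl = CopyViaLocalFunctor<T, LocalAccessorT>;

        LocalAccessorT lacc(sycl::range<1>{wg}, cgh); // allocated per work-group
        cgh.parallel_for<KernelName>(
            sycl::nd_range<1>{sycl::range<1>{n}, sycl::range<1>{wg}},
            Impl(src, dst, lacc));
    });
}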
18 changes: 11 additions & 7 deletions dpctl/tensor/libtensor/include/kernels/clip.hpp
@@ -216,22 +216,24 @@ sycl::event clip_contig_impl(sycl::queue &q,
{
constexpr bool enable_sg_loadstore = true;
using KernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
using Impl =
ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs, enable_sg_loadstore>(
nelems, x_tp, min_tp, max_tp, dst_tp));
Impl(nelems, x_tp, min_tp, max_tp, dst_tp));
}
else {
constexpr bool disable_sg_loadstore = false;
using InnerKernelName = clip_contig_kernel<T, vec_sz, n_vecs>;
using KernelName =
disabled_sg_loadstore_wrapper_krn<InnerKernelName>;
using Impl =
ClipContigFunctor<T, vec_sz, n_vecs, disable_sg_loadstore>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ClipContigFunctor<T, vec_sz, n_vecs, disable_sg_loadstore>(
nelems, x_tp, min_tp, max_tp, dst_tp));
Impl(nelems, x_tp, min_tp, max_tp, dst_tp));
}
});

@@ -311,10 +313,12 @@ sycl::event clip_strided_impl(sycl::queue &q,
const FourOffsets_StridedIndexer indexer{
nd, x_offset, min_offset, max_offset, dst_offset, shape_strides};

cgh.parallel_for<clip_strided_kernel<T, FourOffsets_StridedIndexer>>(
using KernelName = clip_strided_kernel<T, FourOffsets_StridedIndexer>;
using Impl = ClipStridedFunctor<T, FourOffsets_StridedIndexer>;

cgh.parallel_for<KernelName>(
sycl::range<1>(nelems),
ClipStridedFunctor<T, FourOffsets_StridedIndexer>(
x_tp, min_tp, max_tp, dst_tp, indexer));
Impl(x_tp, min_tp, max_tp, dst_tp, indexer));
});

return clip_ev;
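In clip_contig_impl above, both branches of the alignment check now introduce the alias before the launch: the aligned path compiles the functor with sub-group loads/stores enabled, the unaligned path compiles it with them disabled under a wrapper kernel name (disabled_sg_loadstore_wrapper_krn in dpctl). A condensed sketch of that dispatch with illustrative names (ContigOpFunctor, contig_op_kernel, disabled_sg_wrapper_krn); the real functor uses sub-group load/store operations, which this sketch only stands in for:

#include <cstddef>
#include <sycl/sycl.hpp>

// The bool template parameter mirrors ClipContigFunctor's
// enable_sg_loadstore flag; in this sketch it only names the two variants.
template <typename T, bool enable_sg_loadstore> struct ContigOpFunctor
{
    const T *in_;
    T *out_;
    std::size_t n_;

    ContigOpFunctor(const T *in, T *out, std::size_t n)
        : in_(in), out_(out), n_(n)
    {
    }

    void operator()(sycl::nd_item<1> it) const
    {
        const std::size_t gid = it.get_global_id(0);
        if (gid < n_) {
            // A real kernel would use sub-group loads/stores when
            // enable_sg_loadstore is true.
            out_[gid] = in_[gid];
        }
    }
};

template <typename T> class contig_op_kernel;
template <typename InnerKernelName> class disabled_sg_wrapper_krn;

template <typename T>
sycl::event contig_op_impl(sycl::queue &q, const T *in, T *out,
                           std::size_t n, bool pointers_are_aligned)
{
    constexpr std::size_t lws = 64;
    const std::size_t gws = ((n + lws - 1) / lws) * lws;

    return q.submit([&](sycl::handler &cgh) {
        const sycl::nd_range<1> ndr{sycl::range<1>{gws}, sycl::range<1>{lws}};
        if (pointers_are_aligned) {
            constexpr bool enable_sg_loadstore = true;
            using KernelName = contig_op_kernel<T>;
            using Impl = ContigOpFunctor<T, enable_sg_loadstore>;

            cgh.parallel_for<KernelName>(ndr, Impl(in, out, n));
        }
        else {
            constexpr bool disable_sg_loadstore = false;
            using KernelName = disabled_sg_wrapper_krn<contig_op_kernel<T>>;
            using Impl = ContigOpFunctor<T, disable_sg_loadstore>;

            cgh.parallel_for<KernelName>(ndr, Impl(in, out, n));
        }
    });
}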
54 changes: 32 additions & 22 deletions dpctl/tensor/libtensor/include/kernels/constructors.hpp
@@ -24,13 +24,15 @@
//===----------------------------------------------------------------------===//

#pragma once
#include <complex>
#include <cstddef>

#include <sycl/sycl.hpp>

#include "dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
#include "utils/strided_iters.hpp"
#include "utils/type_utils.hpp"
#include <complex>
#include <cstddef>
#include <sycl/sycl.hpp>

namespace dpctl
{
@@ -200,22 +202,25 @@ sycl::event lin_space_affine_impl(sycl::queue &exec_q,
{
dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);

bool device_supports_doubles = exec_q.get_device().has(sycl::aspect::fp64);
const bool device_supports_doubles =
exec_q.get_device().has(sycl::aspect::fp64);
const std::size_t den = (include_endpoint) ? nelems - 1 : nelems;

sycl::event lin_space_affine_event = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);
if (device_supports_doubles) {
cgh.parallel_for<linear_sequence_affine_kernel<Ty, double>>(
sycl::range<1>{nelems},
LinearSequenceAffineFunctor<Ty, double>(
array_data, start_v, end_v,
(include_endpoint) ? nelems - 1 : nelems));
using KernelName = linear_sequence_affine_kernel<Ty, double>;
using Impl = LinearSequenceAffineFunctor<Ty, double>;

cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
Impl(array_data, start_v, end_v, den));
}
else {
cgh.parallel_for<linear_sequence_affine_kernel<Ty, float>>(
sycl::range<1>{nelems},
LinearSequenceAffineFunctor<Ty, float>(
array_data, start_v, end_v,
(include_endpoint) ? nelems - 1 : nelems));
using KernelName = linear_sequence_affine_kernel<Ty, float>;
using Impl = LinearSequenceAffineFunctor<Ty, float>;

cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
Impl(array_data, start_v, end_v, den));
}
});

@@ -312,10 +317,12 @@ sycl::event full_strided_impl(sycl::queue &q,

sycl::event fill_ev = q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);
cgh.parallel_for<full_strided_kernel<dstTy>>(
sycl::range<1>{nelems},
FullStridedFunctor<dstTy, decltype(strided_indexer)>(
dst_tp, fill_v, strided_indexer));

using KernelName = full_strided_kernel<dstTy>;
using Impl = FullStridedFunctor<dstTy, StridedIndexer>;

cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
Impl(dst_tp, fill_v, strided_indexer));
});

return fill_ev;
@@ -388,9 +395,12 @@ sycl::event eye_impl(sycl::queue &exec_q,
dpctl::tensor::type_utils::validate_type_for_device<Ty>(exec_q);
sycl::event eye_event = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends);
cgh.parallel_for<eye_kernel<Ty>>(
sycl::range<1>{nelems},
EyeFunctor<Ty>(array_data, start, end, step));

using KernelName = eye_kernel<Ty>;
using Impl = EyeFunctor<Ty>;

cgh.parallel_for<KernelName>(sycl::range<1>{nelems},
Impl(array_data, start, end, step));
});

return eye_event;
@@ -478,7 +488,7 @@ sycl::event tri_impl(sycl::queue &exec_q,
ssize_t inner_gid = idx[0] - inner_range * outer_gid;

ssize_t src_inner_offset = 0, dst_inner_offset = 0;
bool to_copy(true);
bool to_copy{false};

{
using dpctl::tensor::strides::CIndexer_array;
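In lin_space_affine_impl above, the denominator ((include_endpoint) ? nelems - 1 : nelems) is hoisted into a single den variable, and the existing precision dispatch is kept: the affine term is evaluated in double only when the device reports the fp64 aspect, otherwise in float. A minimal sketch of that dispatch with illustrative names (lin_space_sketch, lin_space_dispatch); dpctl's LinearSequenceAffineFunctor is more general than this sketch:

#include <cstddef>
#include <sycl/sycl.hpp>

template <typename Ty, typename ComputeT> class lin_space_sketch_krn;

// Fill dst[i] = start + i * (end - start) / den, evaluated in ComputeT.
template <typename Ty, typename ComputeT>
sycl::event lin_space_sketch(sycl::queue &q, Ty *dst, Ty start, Ty end,
                             std::size_t nelems, std::size_t den)
{
    return q.submit([&](sycl::handler &cgh) {
        const ComputeT start_c = static_cast<ComputeT>(start);
        const ComputeT step_c = (static_cast<ComputeT>(end) - start_c) /
                                static_cast<ComputeT>(den);

        cgh.parallel_for<lin_space_sketch_krn<Ty, ComputeT>>(
            sycl::range<1>{nelems}, [=](sycl::id<1> id) {
                const std::size_t i = id[0];
                dst[i] = static_cast<Ty>(start_c +
                                         step_c * static_cast<ComputeT>(i));
            });
    });
}

template <typename Ty>
sycl::event lin_space_dispatch(sycl::queue &q, Ty *dst, Ty start, Ty end,
                               std::size_t nelems, bool include_endpoint)
{
    // Assumes nelems > 1 so that den is nonzero.
    const bool device_supports_doubles =
        q.get_device().has(sycl::aspect::fp64);
    const std::size_t den = include_endpoint ? nelems - 1 : nelems;

    return device_supports_doubles
               ? lin_space_sketch<Ty, double>(q, dst, start, end, nelems, den)
               : lin_space_sketch<Ty, float>(q, dst, start, end, nelems, den);
}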
@@ -26,11 +26,13 @@
#include <cstddef>
#include <cstdint>
#include <stdexcept>
#include <sycl/sycl.hpp>
#include <utility>

#include <sycl/sycl.hpp>

#include "kernels/alignment.hpp"
#include "kernels/dpctl_tensor_types.hpp"
#include "kernels/elementwise_functions/common_detail.hpp"
#include "utils/offset_utils.hpp"
#include "utils/sycl_alloc_utils.hpp"
#include "utils/sycl_utils.hpp"
@@ -324,21 +326,23 @@ sycl::event unary_contig_impl(sycl::queue &exec_q,
{
constexpr bool enable_sg_loadstore = true;
using KernelName = BaseKernelName;
using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
enable_sg_loadstore>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
enable_sg_loadstore>(arg_tp, res_tp, nelems));
Impl(arg_tp, res_tp, nelems));
}
else {
constexpr bool disable_sg_loadstore = false;
using KernelName =
disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
using Impl = ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
disable_sg_loadstore>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
ContigFunctorT<argTy, resTy, vec_sz, n_vecs,
disable_sg_loadstore>(arg_tp, res_tp, nelems));
Impl(arg_tp, res_tp, nelems));
}
});

@@ -377,9 +381,10 @@
const argTy *arg_tp = reinterpret_cast<const argTy *>(arg_p);
resTy *res_tp = reinterpret_cast<resTy *>(res_p);

using Impl = StridedFunctorT<argTy, resTy, IndexerT>;

cgh.parallel_for<kernel_name<argTy, resTy, IndexerT>>(
{nelems},
StridedFunctorT<argTy, resTy, IndexerT>(arg_tp, res_tp, indexer));
{nelems}, Impl(arg_tp, res_tp, indexer));
});
return comp_ev;
}
@@ -814,22 +819,23 @@ sycl::event binary_contig_impl(sycl::queue &exec_q,
{
constexpr bool enable_sg_loadstore = true;
using KernelName = BaseKernelName;
using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
n_vecs, enable_sg_loadstore>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,
enable_sg_loadstore>(arg1_tp, arg2_tp,
res_tp, nelems));
Impl(arg1_tp, arg2_tp, res_tp, nelems));
}
else {
constexpr bool disable_sg_loadstore = false;
using KernelName =
disabled_sg_loadstore_wrapper_krn<BaseKernelName>;
using Impl = BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz,
n_vecs, disable_sg_loadstore>;

cgh.parallel_for<KernelName>(
sycl::nd_range<1>(gws_range, lws_range),
BinaryContigFunctorT<argTy1, argTy2, resTy, vec_sz, n_vecs,
disable_sg_loadstore>(arg1_tp, arg2_tp,
res_tp, nelems));
Impl(arg1_tp, arg2_tp, res_tp, nelems));
}
});
return comp_ev;
@@ -873,9 +879,10 @@
const argTy2 *arg2_tp = reinterpret_cast<const argTy2 *>(arg2_p);
resTy *res_tp = reinterpret_cast<resTy *>(res_p);

using Impl = BinaryStridedFunctorT<argTy1, argTy2, resTy, IndexerT>;

cgh.parallel_for<kernel_name<argTy1, argTy2, resTy, IndexerT>>(
{nelems}, BinaryStridedFunctorT<argTy1, argTy2, resTy, IndexerT>(
arg1_tp, arg2_tp, res_tp, indexer));
{nelems}, Impl(arg1_tp, arg2_tp, res_tp, indexer));
});
return comp_ev;
}
@@ -917,13 +924,9 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl(
exec_q);
argT2 *padded_vec = padded_vec_owner.get();

sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends); // ensure vec contains actual data
cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
auto i = id[0];
padded_vec[i] = vec[i % n1];
});
});
sycl::event make_padded_vec_ev =
dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends);

// sub-group spans work-items [I, I + sgSize)
// base = ndit.get_global_linear_id() - sg.get_local_id()[0]
@@ -942,10 +945,12 @@
std::size_t n_groups = (n_elems + lws - 1) / lws;
auto gwsRange = sycl::range<1>(n_groups * lws);

using Impl =
BinaryContigMatrixContigRowBroadcastFunctorT<argT1, argT2, resT>;

cgh.parallel_for<class kernel_name<argT1, argT2, resT>>(
sycl::nd_range<1>(gwsRange, lwsRange),
BinaryContigMatrixContigRowBroadcastFunctorT<argT1, argT2, resT>(
mat, padded_vec, res, n_elems, n1));
Impl(mat, padded_vec, res, n_elems, n1));
});

sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
@@ -993,13 +998,9 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
exec_q);
argT2 *padded_vec = padded_vec_owner.get();

sycl::event make_padded_vec_ev = exec_q.submit([&](sycl::handler &cgh) {
cgh.depends_on(depends); // ensure vec contains actual data
cgh.parallel_for({n1_padded}, [=](sycl::id<1> id) {
auto i = id[0];
padded_vec[i] = vec[i % n1];
});
});
sycl::event make_padded_vec_ev =
dpctl::tensor::kernels::elementwise_detail::populate_padded_vector<
argT2>(exec_q, vec, n1, padded_vec, n1_padded, depends);

// sub-group spans work-items [I, I + sgSize)
// base = ndit.get_global_linear_id() - sg.get_local_id()[0]
@@ -1018,10 +1019,12 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl(
std::size_t n_groups = (n_elems + lws - 1) / lws;
auto gwsRange = sycl::range<1>(n_groups * lws);

using Impl =
BinaryContigRowContigMatrixBroadcastFunctorT<argT1, argT2, resT>;

cgh.parallel_for<class kernel_name<argT1, argT2, resT>>(
sycl::nd_range<1>(gwsRange, lwsRange),
BinaryContigRowContigMatrixBroadcastFunctorT<argT1, argT2, resT>(
padded_vec, mat, res, n_elems, n1));
Impl(padded_vec, mat, res, n_elems, n1));
});

sycl::event tmp_cleanup_ev = dpctl::tensor::alloc_utils::async_smart_free(
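The last two hunks drop the inline submissions that filled padded_vec by cyclic replication (padded_vec[i] = vec[i % n1]) and call a shared elementwise_detail::populate_padded_vector helper from the newly included common_detail.hpp instead. The helper's exact signature is not shown in this diff; the sketch below reconstructs a plausible shape from the removed inline code and from how the call sites pass (exec_q, vec, n1, padded_vec, n1_padded, depends):

#include <cstddef>
#include <vector>
#include <sycl/sycl.hpp>

namespace elementwise_detail_sketch
{

template <typename T> class populate_padded_vector_krn;

// Cyclically replicate the first n elements of vec into padded_vec,
// which holds n_padded elements.
template <typename T>
sycl::event populate_padded_vector(sycl::queue &exec_q, const T *vec,
                                   std::size_t n, T *padded_vec,
                                   std::size_t n_padded,
                                   const std::vector<sycl::event> &depends)
{
    return exec_q.submit([&](sycl::handler &cgh) {
        cgh.depends_on(depends); // ensure vec contains actual data
        cgh.parallel_for<populate_padded_vector_krn<T>>(
            sycl::range<1>{n_padded}, [=](sycl::id<1> id) {
                const std::size_t i = id[0];
                padded_vec[i] = vec[i % n];
            });
    });
}

} // namespace elementwise_detail_sketch

Centralizing this fill removes one copy of the same padding kernel per call site, which is consistent with the commit's stated goal of reducing the elementwise extension size.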
(Remaining changed files are not shown in this excerpt.)