Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimizing repeat API for scalar value of repeats #1131

Closed
wants to merge 1 commit into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 27 additions & 3 deletions cunumeric/deferred.py
Original file line number Diff line number Diff line change
Expand Up @@ -2022,10 +2022,34 @@ def trilu(self, rhs: Any, k: int, lower: bool) -> None:
def repeat(
self, repeats: Any, axis: int, scalar_repeats: bool
) -> DeferredArray:
out = self.runtime.create_unbound_thunk(self.base.type, ndim=self.ndim)
task = self.context.create_auto_task(CuNumericOpCode.REPEAT)
task.add_input(self.base)
task.add_output(out.base)
if scalar_repeats:
out_shape = tuple(
self.shape[dim] * repeats if dim == axis else self.shape[dim]
for dim in range(self.ndim)
)
out = cast(
DeferredArray,
self.runtime.create_empty_thunk(
out_shape,
dtype=self.base.type,
inputs=[self],
),
)
p_in = task.declare_partition(self.base)
p_out = task.declare_partition(out.base)
task.add_input(self.base, partition=p_in)
task.add_output(out.base, partition=p_out)
scale = tuple(
repeats if dim == axis else 1 for dim in range(self.ndim)
)
task.add_constraint(p_out <= p_in * scale)
else:
out = self.runtime.create_unbound_thunk(
self.base.type, ndim=self.ndim
)
task.add_input(self.base)
task.add_output(out.base)
# We pass axis now but don't use for 1D case (will use for ND case
task.add_scalar_arg(axis, ty.int32)
task.add_scalar_arg(scalar_repeats, ty.bool_)
Expand Down
9 changes: 2 additions & 7 deletions src/cunumeric/index/repeat.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,15 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {
const int32_t axis,
const Rect<DIM>& in_rect) const
{
Point<DIM> extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
extents[axis] *= repeats;

auto out = out_array.create_output_buffer<VAL, DIM>(extents, true);

Rect<DIM> out_rect(Point<DIM>::ZEROES(), extents - Point<DIM>::ONES());
auto out_rect = out_array.shape<DIM>();
auto out = out_array.write_accessor<VAL, DIM>(out_rect);
Pitches<DIM - 1> pitches;

auto out_volume = pitches.flatten(out_rect);
for (size_t idx = 0; idx < out_volume; ++idx) {
auto out_p = pitches.unflatten(idx, out_rect.lo);
auto in_p = out_p;
in_p[axis] /= repeats;
in_p += in_rect.lo;
out[out_p] = in[in_p];
}
}
Expand Down
17 changes: 6 additions & 11 deletions src/cunumeric/index/repeat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,19 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)

template <typename VAL, int DIM>
__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
repeat_kernel(Buffer<VAL, DIM> out,
repeat_kernel(AccessorWO<VAL, DIM> out,
const AccessorRO<VAL, DIM> in,
int64_t repeats,
const int32_t axis,
const Point<DIM> in_lo,
const Point<DIM> out_lo,
const Pitches<DIM - 1> pitches,
const size_t volume)
{
const size_t idx = global_tid_1d();
if (idx >= volume) return;
auto out_p = pitches.unflatten(idx, Point<DIM>::ZEROES());
auto out_p = pitches.unflatten(idx, out_lo);
auto in_p = out_p;
in_p[axis] /= repeats;
in_p += in_lo;
out[out_p] = in[in_p];
}

Expand Down Expand Up @@ -103,20 +102,16 @@ struct RepeatImplBody<VariantKind::GPU, CODE, DIM> {
const int32_t axis,
const Rect<DIM>& in_rect) const
{
Point<DIM> extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
extents[axis] *= repeats;

auto out = out_array.create_output_buffer<VAL, DIM>(extents, true);

Rect<DIM> out_rect(Point<DIM>::ZEROES(), extents - Point<DIM>::ONES());
auto out_rect = out_array.shape<DIM>();
auto out = out_array.write_accessor<VAL, DIM>(out_rect);
Pitches<DIM - 1> pitches;

auto out_volume = pitches.flatten(out_rect);
const auto blocks = (out_volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

auto stream = get_cached_stream();
repeat_kernel<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
out, in, repeats, axis, in_rect.lo, pitches, out_volume);
out, in, repeats, axis, out_rect.lo, pitches, out_volume);
CHECK_CUDA_STREAM(stream);
}

Expand Down
9 changes: 2 additions & 7 deletions src/cunumeric/index/repeat_omp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,8 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
const int32_t axis,
const Rect<DIM>& in_rect) const
{
Point<DIM> extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
extents[axis] *= repeats;

auto out = out_array.create_output_buffer<VAL, DIM>(extents, true);

Rect<DIM> out_rect(Point<DIM>::ZEROES(), extents - Point<DIM>::ONES());
auto out_rect = out_array.shape<DIM>();
auto out = out_array.write_accessor<VAL, DIM>(out_rect);
Pitches<DIM - 1> pitches;

auto out_volume = pitches.flatten(out_rect);
Expand All @@ -50,7 +46,6 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
auto out_p = pitches.unflatten(idx, out_rect.lo);
auto in_p = out_p;
in_p[axis] /= repeats;
in_p += in_rect.lo;
out[out_p] = in[in_p];
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/cunumeric/index/repeat_template.inl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ struct RepeatImpl {
auto input_arr = args.input.read_accessor<VAL, DIM>(input_rect);

if (input_rect.empty()) {
args.output.bind_empty_data();
if (!args.scalar_repeats) { args.output.bind_empty_data(); }
return;
}

Expand Down
Loading