Skip to content

Commit

Permalink
optimizing repeat for scalar value of repeats
Browse files Browse the repository at this point in the history
  • Loading branch information
ipdemes committed Mar 10, 2024
1 parent 9ece0a3 commit a336da4
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 29 deletions.
30 changes: 27 additions & 3 deletions cunumeric/deferred.py
Original file line number Diff line number Diff line change
Expand Up @@ -2022,10 +2022,34 @@ def trilu(self, rhs: Any, k: int, lower: bool) -> None:
def repeat(
self, repeats: Any, axis: int, scalar_repeats: bool
) -> DeferredArray:
out = self.runtime.create_unbound_thunk(self.base.type, ndim=self.ndim)
task = self.context.create_auto_task(CuNumericOpCode.REPEAT)
task.add_input(self.base)
task.add_output(out.base)
if scalar_repeats:
out_shape = tuple(
self.shape[dim] * repeats if dim == axis else self.shape[dim]
for dim in range(self.ndim)
)
out = cast(
DeferredArray,
self.runtime.create_empty_thunk(
out_shape,
dtype=self.base.type,
inputs=[self],
),
)
p_in = task.declare_partition(self.base)
p_out = task.declare_partition(out.base)
task.add_input(self.base, partition=p_in)
task.add_output(out.base, partition=p_out)
scale = tuple(
repeats if dim == axis else 1 for dim in range(self.ndim)
)
task.add_constraint(p_out <= p_in * scale)
else:
out = self.runtime.create_unbound_thunk(
self.base.type, ndim=self.ndim
)
task.add_input(self.base)
task.add_output(out.base)
# We pass axis now but don't use for 1D case (will use for ND case
task.add_scalar_arg(axis, ty.int32)
task.add_scalar_arg(scalar_repeats, ty.bool_)
Expand Down
9 changes: 2 additions & 7 deletions src/cunumeric/index/repeat.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,15 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {
const int32_t axis,
const Rect<DIM>& in_rect) const
{
Point<DIM> extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
extents[axis] *= repeats;

auto out = out_array.create_output_buffer<VAL, DIM>(extents, true);

Rect<DIM> out_rect(Point<DIM>::ZEROES(), extents - Point<DIM>::ONES());
auto out_rect = out_array.shape<DIM>();
auto out = out_array.write_accessor<VAL, DIM>(out_rect);
Pitches<DIM - 1> pitches;

auto out_volume = pitches.flatten(out_rect);
for (size_t idx = 0; idx < out_volume; ++idx) {
auto out_p = pitches.unflatten(idx, out_rect.lo);
auto in_p = out_p;
in_p[axis] /= repeats;
in_p += in_rect.lo;
out[out_p] = in[in_p];
}
}
Expand Down
17 changes: 6 additions & 11 deletions src/cunumeric/index/repeat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,20 +50,19 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)

template <typename VAL, int DIM>
__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
repeat_kernel(Buffer<VAL, DIM> out,
repeat_kernel(AccessorWO<VAL, DIM> out,
const AccessorRO<VAL, DIM> in,
int64_t repeats,
const int32_t axis,
const Point<DIM> in_lo,
const Point<DIM> out_lo,
const Pitches<DIM - 1> pitches,
const size_t volume)
{
const size_t idx = global_tid_1d();
if (idx >= volume) return;
auto out_p = pitches.unflatten(idx, Point<DIM>::ZEROES());
auto out_p = pitches.unflatten(idx, out_lo);
auto in_p = out_p;
in_p[axis] /= repeats;
in_p += in_lo;
out[out_p] = in[in_p];
}

Expand Down Expand Up @@ -103,20 +102,16 @@ struct RepeatImplBody<VariantKind::GPU, CODE, DIM> {
const int32_t axis,
const Rect<DIM>& in_rect) const
{
Point<DIM> extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
extents[axis] *= repeats;

auto out = out_array.create_output_buffer<VAL, DIM>(extents, true);

Rect<DIM> out_rect(Point<DIM>::ZEROES(), extents - Point<DIM>::ONES());
auto out_rect = out_array.shape<DIM>();
auto out = out_array.write_accessor<VAL, DIM>(out_rect);
Pitches<DIM - 1> pitches;

auto out_volume = pitches.flatten(out_rect);
const auto blocks = (out_volume + THREADS_PER_BLOCK - 1) / THREADS_PER_BLOCK;

auto stream = get_cached_stream();
repeat_kernel<VAL, DIM><<<blocks, THREADS_PER_BLOCK, 0, stream>>>(
out, in, repeats, axis, in_rect.lo, pitches, out_volume);
out, in, repeats, axis, out_rect.lo, pitches, out_volume);
CHECK_CUDA_STREAM(stream);
}

Expand Down
9 changes: 2 additions & 7 deletions src/cunumeric/index/repeat_omp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,8 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
const int32_t axis,
const Rect<DIM>& in_rect) const
{
Point<DIM> extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
extents[axis] *= repeats;

auto out = out_array.create_output_buffer<VAL, DIM>(extents, true);

Rect<DIM> out_rect(Point<DIM>::ZEROES(), extents - Point<DIM>::ONES());
auto out_rect = out_array.shape<DIM>();
auto out = out_array.write_accessor<VAL, DIM>(out_rect);
Pitches<DIM - 1> pitches;

auto out_volume = pitches.flatten(out_rect);
Expand All @@ -50,7 +46,6 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
auto out_p = pitches.unflatten(idx, out_rect.lo);
auto in_p = out_p;
in_p[axis] /= repeats;
in_p += in_rect.lo;
out[out_p] = in[in_p];
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/cunumeric/index/repeat_template.inl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ struct RepeatImpl {
auto input_arr = args.input.read_accessor<VAL, DIM>(input_rect);

if (input_rect.empty()) {
args.output.bind_empty_data();
if (!args.scalar_repeats) { args.output.bind_empty_data(); }
return;
}

Expand Down

0 comments on commit a336da4

Please sign in to comment.