Skip to content

Commit

Permalink
optimizing repeat for scalar value of repeats
Browse files Browse the repository at this point in the history
  • Loading branch information
ipdemes committed Mar 8, 2024
1 parent 5838fa6 commit 2cf80f9
Show file tree
Hide file tree
Showing 5 changed files with 35 additions and 26 deletions.
30 changes: 27 additions & 3 deletions cunumeric/deferred.py
Original file line number Diff line number Diff line change
Expand Up @@ -2022,10 +2022,34 @@ def trilu(self, rhs: Any, k: int, lower: bool) -> None:
def repeat(
self, repeats: Any, axis: int, scalar_repeats: bool
) -> DeferredArray:
out = self.runtime.create_unbound_thunk(self.base.type, ndim=self.ndim)
task = self.context.create_auto_task(CuNumericOpCode.REPEAT)
task.add_input(self.base)
task.add_output(out.base)
if scalar_repeats:
out_shape = tuple(
self.shape[dim] * repeats if dim == axis else self.shape[dim]
for dim in range(self.ndim)
)
out = cast(
DeferredArray,
self.runtime.create_empty_thunk(
out_shape,
dtype=self.base.type,
inputs=[self],
),
)
p_in = task.declare_partition(self.base)
p_out = task.declare_partition(out.base)
task.add_input(self.base, partition=p_in)
task.add_output(out.base, partition=p_out)
scale = tuple(
repeats if dim == axis else 1 for dim in range(self.ndim)
)
task.add_constraint(p_out <= p_in * scale)
else:
out = self.runtime.create_unbound_thunk(
self.base.type, ndim=self.ndim
)
task.add_input(self.base)
task.add_output(out.base)
# We pass axis now but don't use for 1D case (will use for ND case
task.add_scalar_arg(axis, ty.int32)
task.add_scalar_arg(scalar_repeats, ty.bool_)
Expand Down
9 changes: 2 additions & 7 deletions src/cunumeric/index/repeat.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,20 +31,15 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {
const int32_t axis,
const Rect<DIM>& in_rect) const
{
Point<DIM> extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
extents[axis] *= repeats;

auto out = out_array.create_output_buffer<VAL, DIM>(extents, true);

Rect<DIM> out_rect(Point<DIM>::ZEROES(), extents - Point<DIM>::ONES());
auto out_rect = out_array.shape<DIM>();
auto out = out_array.read_write_accessor<VAL, DIM>(out_rect);
Pitches<DIM - 1> pitches;

auto out_volume = pitches.flatten(out_rect);
for (size_t idx = 0; idx < out_volume; ++idx) {
auto out_p = pitches.unflatten(idx, out_rect.lo);
auto in_p = out_p;
in_p[axis] /= repeats;
in_p += in_rect.lo;
out[out_p] = in[in_p];
}
}
Expand Down
11 changes: 3 additions & 8 deletions src/cunumeric/index/repeat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)

template <typename VAL, int DIM>
__global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
repeat_kernel(Buffer<VAL, DIM> out,
repeat_kernel(AccessorRW<VAL, DIM> out,
const AccessorRO<VAL, DIM> in,
int64_t repeats,
const int32_t axis,
Expand All @@ -63,7 +63,6 @@ __global__ static void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
auto out_p = pitches.unflatten(idx, Point<DIM>::ZEROES());
auto in_p = out_p;
in_p[axis] /= repeats;
in_p += in_lo;
out[out_p] = in[in_p];
}

Expand Down Expand Up @@ -103,12 +102,8 @@ struct RepeatImplBody<VariantKind::GPU, CODE, DIM> {
const int32_t axis,
const Rect<DIM>& in_rect) const
{
Point<DIM> extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
extents[axis] *= repeats;

auto out = out_array.create_output_buffer<VAL, DIM>(extents, true);

Rect<DIM> out_rect(Point<DIM>::ZEROES(), extents - Point<DIM>::ONES());
auto out_rect = out_array.shape<DIM>();
auto out = out_array.read_write_accessor<VAL, DIM>(out_rect);
Pitches<DIM - 1> pitches;

auto out_volume = pitches.flatten(out_rect);
Expand Down
9 changes: 2 additions & 7 deletions src/cunumeric/index/repeat_omp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,8 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
const int32_t axis,
const Rect<DIM>& in_rect) const
{
Point<DIM> extents = in_rect.hi - in_rect.lo + Point<DIM>::ONES();
extents[axis] *= repeats;

auto out = out_array.create_output_buffer<VAL, DIM>(extents, true);

Rect<DIM> out_rect(Point<DIM>::ZEROES(), extents - Point<DIM>::ONES());
auto out_rect = out_array.shape<DIM>();
auto out = out_array.read_write_accessor<VAL, DIM>(out_rect);
Pitches<DIM - 1> pitches;

auto out_volume = pitches.flatten(out_rect);
Expand All @@ -50,7 +46,6 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
auto out_p = pitches.unflatten(idx, out_rect.lo);
auto in_p = out_p;
in_p[axis] /= repeats;
in_p += in_rect.lo;
out[out_p] = in[in_p];
}
}
Expand Down
2 changes: 1 addition & 1 deletion src/cunumeric/index/repeat_template.inl
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ struct RepeatImpl {
auto input_arr = args.input.read_accessor<VAL, DIM>(input_rect);

if (input_rect.empty()) {
args.output.bind_empty_data();
if (!args.scalar_repeats) { args.output.bind_empty_data(); }
return;
}

Expand Down

0 comments on commit 2cf80f9

Please sign in to comment.