Skip to content

Commit

Permalink
Remove logical version of transpose
Browse files Browse the repository at this point in the history
  • Loading branch information
manopapad committed Jan 9, 2024
1 parent c1bfd9d commit 2b99a54
Show file tree
Hide file tree
Showing 7 changed files with 45 additions and 150 deletions.
2 changes: 0 additions & 2 deletions cunumeric/linalg/cholesky.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,6 @@ def transpose_copy_single(
task.add_input(input)
# Output has the same shape as input, but is mapped
# to a column major instance
task.add_scalar_arg(False, ty.bool_)

task.add_broadcast(output)
task.add_broadcast(input)
Expand All @@ -62,7 +61,6 @@ def transpose_copy(
task.add_input(p_input)
# Output has the same shape as input, but is mapped
# to a column major instance
task.add_scalar_arg(False, ty.bool_)

task.execute()

Expand Down
16 changes: 6 additions & 10 deletions src/cunumeric/mapper.cc
Original file line number Diff line number Diff line change
Expand Up @@ -96,16 +96,12 @@ std::vector<StoreMapping> CuNumericMapper::store_mappings(
return std::move(mappings);
}
case CUNUMERIC_TRANSPOSE_COPY_2D: {
auto logical = task.scalars()[0].value<bool>();
if (!logical) {
std::vector<StoreMapping> mappings;
auto& outputs = task.outputs();
mappings.push_back(StoreMapping::default_mapping(outputs[0], options.front()));
mappings.back().policy.ordering.set_fortran_order();
mappings.back().policy.exact = true;
return std::move(mappings);
} else
return {};
std::vector<StoreMapping> mappings;
auto& outputs = task.outputs();
mappings.push_back(StoreMapping::default_mapping(outputs[0], options.front()));
mappings.back().policy.ordering.set_fortran_order();
mappings.back().policy.exact = true;
return std::move(mappings);
}
case CUNUMERIC_MATMUL:
case CUNUMERIC_MATVECMUL:
Expand Down
30 changes: 9 additions & 21 deletions src/cunumeric/matrix/transpose.cc
Original file line number Diff line number Diff line change
Expand Up @@ -30,31 +30,19 @@ template <Type::Code CODE>
struct TransposeImplBody<VariantKind::CPU, CODE> {
using VAL = legate_type_of<CODE>;

void operator()(const Rect<2>& out_rect,
const Rect<2>& in_rect,
void operator()(const Rect<2>& rect,
const AccessorWO<VAL, 2>& out,
const AccessorRO<VAL, 2>& in,
bool logical) const
const AccessorRO<VAL, 2>& in) const
{
constexpr coord_t BF = 128 / sizeof(VAL);
if (logical)
for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[j2][i2] = in[i2][j2];
}
}
else
for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
}
for (auto i1 = rect.lo[0]; i1 <= rect.hi[0]; i1 += BF) {
for (auto j1 = rect.lo[1]; j1 <= rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= rect.hi[0]) ? i1 + BF : rect.hi[0];
const auto max_j2 = ((j1 + BF) <= rect.hi[1]) ? j1 + BF : rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
}
}
}
};

Expand Down
99 changes: 17 additions & 82 deletions src/cunumeric/matrix/transpose.cu
Original file line number Diff line number Diff line change
Expand Up @@ -24,70 +24,12 @@ namespace cunumeric {
#define TILE_DIM 32
#define BLOCK_ROWS 8

template <typename VAL>
__global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM)
transpose_2d_logical(const AccessorWO<VAL, 2> out,
const AccessorRO<VAL, 2> in,
const Point<2> lo_in,
const Point<2> hi_in,
const Point<2> lo_out,
const Point<2> hi_out)
{
__shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/];

// These are reversed here for coalescing
coord_t x = blockIdx.y * TILE_DIM + threadIdx.y;
coord_t y = blockIdx.x * TILE_DIM + threadIdx.x;

// Check to see if we hit our y-bounds, if so we can just mask off those threads
if ((lo_in[1] + y) <= hi_in[1]) {
// Check to see if we're going to hit our x-bounds while striding
if ((lo_in[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi_in[0]) {
// No overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
} else {
// Overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if ((lo_in[0] + x + i) <= hi_in[0])
tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
}
}
// Make sure all the data is in shared memory
__syncthreads();

// Transpose the coordinates
x = blockIdx.x * TILE_DIM + threadIdx.y;
y = blockIdx.y * TILE_DIM + threadIdx.x;

// Check to see if we hit our y-bounds, if so we can just mask off those threads
if ((lo_out[1] + y) <= hi_out[1]) {
// Check to see if we're going to hit our x-bounds while striding
if ((lo_out[0] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi_out[0]) {
// No overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
out[lo_out + Point<2>(x + i, y)] = tile[threadIdx.x][threadIdx.y + i];
} else {
// Overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if ((lo_out[0] + x + i) <= hi_out[0])
out[lo_out + Point<2>(x + i, y)] = tile[threadIdx.x][threadIdx.y + i];
}
}
}

template <typename VAL>
__global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM)
transpose_2d_physical(const AccessorWO<VAL, 2> out,
const AccessorRO<VAL, 2> in,
const Point<2> lo_in,
const Point<2> hi_in,
const Point<2> lo_out,
const Point<2> hi_out)
const Point<2> lo,
const Point<2> hi)
{
__shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/];

Expand All @@ -96,19 +38,19 @@ __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_S
coord_t y = blockIdx.x * TILE_DIM + threadIdx.x;

// Check to see if we hit our y-bounds, if so we can just mask off those threads
if ((lo_in[1] + y) <= hi_in[1]) {
if ((lo[1] + y) <= hi[1]) {
// Check to see if we're going to hit our x-bounds while striding
if ((lo_in[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi_in[0]) {
if ((lo[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi[0]) {
// No overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
tile[threadIdx.y + i][threadIdx.x] = in[lo + Point<2>(x + i, y)];
} else {
// Overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if ((lo_in[0] + x + i) <= hi_in[0])
tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
if ((lo[0] + x + i) <= hi[0])
tile[threadIdx.y + i][threadIdx.x] = in[lo + Point<2>(x + i, y)];
}
}

Expand All @@ -119,19 +61,19 @@ __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_S
y = blockIdx.x * TILE_DIM + threadIdx.y;

// Check to see if we hit our x-bounds, if so we can just mask off those threads
if ((lo_out[0] + x) <= hi_out[0]) {
if ((lo[0] + x) <= hi[0]) {
// Check to see if we're going to hit our y-bounds while striding
if ((lo_out[1] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi_out[1]) {
if ((lo[1] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi[1]) {
// No overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
out[lo_out + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
out[lo + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
} else {
// Overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if ((lo_out[1] + y + i) <= hi_out[1])
out[lo_out + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
if ((lo[1] + y + i) <= hi[1])
out[lo + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
}
}
}
Expand All @@ -140,24 +82,17 @@ template <Type::Code CODE>
struct TransposeImplBody<VariantKind::GPU, CODE> {
using VAL = legate_type_of<CODE>;

void operator()(const Rect<2>& out_rect,
const Rect<2>& in_rect,
void operator()(const Rect<2>& rect,
const AccessorWO<VAL, 2>& out,
const AccessorRO<VAL, 2>& in,
bool logical) const
const AccessorRO<VAL, 2>& in) const
{
const coord_t m = (in_rect.hi[0] - in_rect.lo[0]) + 1;
const coord_t n = (in_rect.hi[1] - in_rect.lo[1]) + 1;
const coord_t m = (rect.hi[0] - rect.lo[0]) + 1;
const coord_t n = (rect.hi[1] - rect.lo[1]) + 1;
const dim3 blocks((n + TILE_DIM - 1) / TILE_DIM, (m + TILE_DIM - 1) / TILE_DIM, 1);
const dim3 threads(TILE_DIM, BLOCK_ROWS, 1);

auto stream = get_cached_stream();
if (logical)
transpose_2d_logical<VAL>
<<<blocks, threads, 0, stream>>>(out, in, in_rect.lo, in_rect.hi, out_rect.lo, out_rect.hi);
else
transpose_2d_physical<VAL>
<<<blocks, threads, 0, stream>>>(out, in, in_rect.lo, in_rect.hi, out_rect.lo, out_rect.hi);
transpose_2d_physical<VAL><<<blocks, threads, 0, stream>>>(out, in, rect.lo, rect.hi);
CHECK_CUDA_STREAM(stream);
}
};
Expand Down
1 change: 0 additions & 1 deletion src/cunumeric/matrix/transpose.h
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ namespace cunumeric {
struct TransposeArgs {
const Array& out;
const Array& in;
bool logical;
};

class TransposeTask : public CuNumericTask<TransposeTask> {
Expand Down
31 changes: 9 additions & 22 deletions src/cunumeric/matrix/transpose_omp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -28,33 +28,20 @@ template <Type::Code CODE>
struct TransposeImplBody<VariantKind::OMP, CODE> {
using VAL = legate_type_of<CODE>;

void operator()(const Rect<2>& out_rect,
const Rect<2>& in_rect,
void operator()(const Rect<2>& rect,
const AccessorWO<VAL, 2>& out,
const AccessorRO<VAL, 2>& in,
bool logical) const
const AccessorRO<VAL, 2>& in) const
{
constexpr coord_t BF = 128 / sizeof(VAL);
if (logical)
#pragma omp parallel for
for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[j2][i2] = in[i2][j2];
}
}
else
#pragma omp parallel for
for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
}
for (auto i1 = rect.lo[0]; i1 <= rect.hi[0]; i1 += BF) {
for (auto j1 = rect.lo[1]; j1 <= rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= rect.hi[0]) ? i1 + BF : rect.hi[0];
const auto max_j2 = ((j1 + BF) <= rect.hi[1]) ? j1 + BF : rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
}
}
}
};

Expand Down
16 changes: 4 additions & 12 deletions src/cunumeric/matrix/transpose_template.inl
Original file line number Diff line number Diff line change
Expand Up @@ -33,20 +33,13 @@ struct TransposeImpl {
{
using VAL = legate_type_of<CODE>;

const auto out_rect = args.out.shape<2>();
if (out_rect.empty()) return;

Rect<2> in_rect;
if (args.logical) {
in_rect.lo = Point<2>(out_rect.lo[1], out_rect.lo[0]);
in_rect.hi = Point<2>(out_rect.hi[1], out_rect.hi[0]);
} else
in_rect = out_rect;
const auto rect = args.out.shape<2>();
if (rect.empty()) return;

auto out = args.out.write_accessor<VAL, 2>();
auto in = args.in.read_accessor<VAL, 2>();

TransposeImplBody<KIND, CODE>{}(out_rect, in_rect, out, in, args.logical);
TransposeImplBody<KIND, CODE>{}(rect, out, in);
}
};

Expand All @@ -55,9 +48,8 @@ static void transpose_template(TaskContext& context)
{
auto& output = context.outputs()[0];
auto& input = context.inputs()[0];
auto logical = context.scalars()[0].value<bool>();

TransposeArgs args{output, input, logical};
TransposeArgs args{output, input};
type_dispatch(input.code(), TransposeImpl<KIND>{}, args);
}

Expand Down

0 comments on commit 2b99a54

Please sign in to comment.