Remove logical version of transpose #1113

Merged
2 changes: 0 additions & 2 deletions cunumeric/linalg/cholesky.py
@@ -40,7 +40,6 @@ def transpose_copy_single(
task.add_input(input)
# Output has the same shape as input, but is mapped
# to a column major instance
task.add_scalar_arg(False, ty.bool_)

task.add_broadcast(output)
task.add_broadcast(input)
@@ -62,7 +61,6 @@ def transpose_copy(
task.add_input(p_input)
# Output has the same shape as input, but is mapped
# to a column major instance
task.add_scalar_arg(False, ty.bool_)

task.execute()

16 changes: 6 additions & 10 deletions src/cunumeric/mapper.cc
@@ -96,16 +96,12 @@ std::vector<StoreMapping> CuNumericMapper::store_mappings(
return std::move(mappings);
}
case CUNUMERIC_TRANSPOSE_COPY_2D: {
auto logical = task.scalars()[0].value<bool>();
if (!logical) {
std::vector<StoreMapping> mappings;
auto& outputs = task.outputs();
mappings.push_back(StoreMapping::default_mapping(outputs[0], options.front()));
mappings.back().policy.ordering.set_fortran_order();
mappings.back().policy.exact = true;
return std::move(mappings);
} else
return {};
std::vector<StoreMapping> mappings;
auto& outputs = task.outputs();
mappings.push_back(StoreMapping::default_mapping(outputs[0], options.front()));
mappings.back().policy.ordering.set_fortran_order();
mappings.back().policy.exact = true;
return std::move(mappings);
}
case CUNUMERIC_MATMUL:
case CUNUMERIC_MATVECMUL:
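For background on the mapping above: `set_fortran_order()` requests a column-major physical instance for the output store, which is what lets a plain element-wise copy behave as a transpose in memory. A minimal standalone sketch of the two orderings in plain C++ — the `offset_c`/`offset_fortran` helpers are illustrative names, not Legate or cuNumeric API:

```cpp
#include <cstddef>

// Linear offset of element (i, j) in an m x n matrix under the two orderings.
// Row-major ("C order"): rows are contiguous in memory.
inline std::size_t offset_c(std::size_t i, std::size_t j, std::size_t n)
{
  return i * n + j;
}

// Column-major ("Fortran order"): columns are contiguous in memory.
inline std::size_t offset_fortran(std::size_t i, std::size_t j, std::size_t m)
{
  return j * m + i;
}

// Writing element (i, j) of a row-major input at (i, j) of a column-major
// output leaves the bytes laid out exactly as the row-major transpose would
// be, so the copy itself never needs to swap indices.
```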
30 changes: 9 additions & 21 deletions src/cunumeric/matrix/transpose.cc
@@ -30,31 +30,19 @@ template <Type::Code CODE>
struct TransposeImplBody<VariantKind::CPU, CODE> {
using VAL = legate_type_of<CODE>;

void operator()(const Rect<2>& out_rect,
const Rect<2>& in_rect,
void operator()(const Rect<2>& rect,
const AccessorWO<VAL, 2>& out,
const AccessorRO<VAL, 2>& in,
bool logical) const
const AccessorRO<VAL, 2>& in) const
{
constexpr coord_t BF = 128 / sizeof(VAL);
if (logical)
for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[j2][i2] = in[i2][j2];
}
}
else
for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
}
for (auto i1 = rect.lo[0]; i1 <= rect.hi[0]; i1 += BF) {
for (auto j1 = rect.lo[1]; j1 <= rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= rect.hi[0]) ? i1 + BF : rect.hi[0];
const auto max_j2 = ((j1 + BF) <= rect.hi[1]) ? j1 + BF : rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
}
}
}
};

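The surviving CPU variant is a cache-blocked copy: it walks the rectangle in BF-by-BF tiles (BF sized so one tile row is roughly 128 bytes) and copies elements without swapping indices, relying on the column-major output mapping for the transpose. A standalone sketch of the same blocking over raw buffers — `blocked_copy` and its pointer arguments are illustrative, not the accessor-based code above:

```cpp
#include <algorithm>
#include <cstdint>

// Blocked copy of an m x n row-major input into an m x n column-major output.
// Mirrors the tiling in transpose.cc: BF elements per tile edge, chosen so one
// tile row is about 128 bytes.
template <typename VAL>
void blocked_copy(VAL* out, const VAL* in, std::int64_t m, std::int64_t n)
{
  constexpr std::int64_t BF = 128 / static_cast<std::int64_t>(sizeof(VAL));
  for (std::int64_t i1 = 0; i1 < m; i1 += BF)
    for (std::int64_t j1 = 0; j1 < n; j1 += BF) {
      const auto max_i = std::min(i1 + BF, m);
      const auto max_j = std::min(j1 + BF, n);
      for (std::int64_t i = i1; i < max_i; ++i)
        for (std::int64_t j = j1; j < max_j; ++j)
          out[j * m + i] = in[i * n + j];  // column-major store, row-major load
    }
}
```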
99 changes: 17 additions & 82 deletions src/cunumeric/matrix/transpose.cu
@@ -24,70 +24,12 @@ namespace cunumeric {
#define TILE_DIM 32
#define BLOCK_ROWS 8

template <typename VAL>
__global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM)
transpose_2d_logical(const AccessorWO<VAL, 2> out,
const AccessorRO<VAL, 2> in,
const Point<2> lo_in,
const Point<2> hi_in,
const Point<2> lo_out,
const Point<2> hi_out)
{
__shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/];

// These are reversed here for coalescing
coord_t x = blockIdx.y * TILE_DIM + threadIdx.y;
coord_t y = blockIdx.x * TILE_DIM + threadIdx.x;

// Check to see if we hit our y-bounds, if so we can just mask off those threads
if ((lo_in[1] + y) <= hi_in[1]) {
// Check to see if we're going to hit our x-bounds while striding
if ((lo_in[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi_in[0]) {
// No overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
} else {
// Overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if ((lo_in[0] + x + i) <= hi_in[0])
tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
}
}
// Make sure all the data is in shared memory
__syncthreads();

// Transpose the coordinates
x = blockIdx.x * TILE_DIM + threadIdx.y;
y = blockIdx.y * TILE_DIM + threadIdx.x;

// Check to see if we hit our y-bounds, if so we can just mask off those threads
if ((lo_out[1] + y) <= hi_out[1]) {
// Check to see if we're going to hit our x-bounds while striding
if ((lo_out[0] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi_out[0]) {
// No overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
out[lo_out + Point<2>(x + i, y)] = tile[threadIdx.x][threadIdx.y + i];
} else {
// Overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if ((lo_out[0] + x + i) <= hi_out[0])
out[lo_out + Point<2>(x + i, y)] = tile[threadIdx.x][threadIdx.y + i];
}
}
}

template <typename VAL>
__global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM)
transpose_2d_physical(const AccessorWO<VAL, 2> out,
const AccessorRO<VAL, 2> in,
const Point<2> lo_in,
const Point<2> hi_in,
const Point<2> lo_out,
const Point<2> hi_out)
const Point<2> lo,
const Point<2> hi)
{
__shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/];

@@ -96,19 +38,19 @@ __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_S
coord_t y = blockIdx.x * TILE_DIM + threadIdx.x;

// Check to see if we hit our y-bounds, if so we can just mask off those threads
if ((lo_in[1] + y) <= hi_in[1]) {
if ((lo[1] + y) <= hi[1]) {
// Check to see if we're going to hit our x-bounds while striding
if ((lo_in[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi_in[0]) {
if ((lo[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi[0]) {
// No overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
tile[threadIdx.y + i][threadIdx.x] = in[lo + Point<2>(x + i, y)];
} else {
// Overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if ((lo_in[0] + x + i) <= hi_in[0])
tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
if ((lo[0] + x + i) <= hi[0])
tile[threadIdx.y + i][threadIdx.x] = in[lo + Point<2>(x + i, y)];
}
}

@@ -119,19 +61,19 @@ __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_S
y = blockIdx.x * TILE_DIM + threadIdx.y;

// Check to see if we hit our x-bounds, if so we can just mask off those threads
if ((lo_out[0] + x) <= hi_out[0]) {
if ((lo[0] + x) <= hi[0]) {
// Check to see if we're going to hit our y-bounds while striding
if ((lo_out[1] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi_out[1]) {
if ((lo[1] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi[1]) {
// No overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
out[lo_out + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
out[lo + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
} else {
// Overflow case
#pragma unroll
for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
if ((lo_out[1] + y + i) <= hi_out[1])
out[lo_out + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
if ((lo[1] + y + i) <= hi[1])
out[lo + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
}
}
}
@@ -140,24 +82,17 @@ template <Type::Code CODE>
struct TransposeImplBody<VariantKind::GPU, CODE> {
using VAL = legate_type_of<CODE>;

void operator()(const Rect<2>& out_rect,
const Rect<2>& in_rect,
void operator()(const Rect<2>& rect,
const AccessorWO<VAL, 2>& out,
const AccessorRO<VAL, 2>& in,
bool logical) const
const AccessorRO<VAL, 2>& in) const
{
const coord_t m = (in_rect.hi[0] - in_rect.lo[0]) + 1;
const coord_t n = (in_rect.hi[1] - in_rect.lo[1]) + 1;
const coord_t m = (rect.hi[0] - rect.lo[0]) + 1;
const coord_t n = (rect.hi[1] - rect.lo[1]) + 1;
const dim3 blocks((n + TILE_DIM - 1) / TILE_DIM, (m + TILE_DIM - 1) / TILE_DIM, 1);
const dim3 threads(TILE_DIM, BLOCK_ROWS, 1);

auto stream = get_cached_stream();
if (logical)
transpose_2d_logical<VAL>
<<<blocks, threads, 0, stream>>>(out, in, in_rect.lo, in_rect.hi, out_rect.lo, out_rect.hi);
else
transpose_2d_physical<VAL>
<<<blocks, threads, 0, stream>>>(out, in, in_rect.lo, in_rect.hi, out_rect.lo, out_rect.hi);
transpose_2d_physical<VAL><<<blocks, threads, 0, stream>>>(out, in, rect.lo, rect.hi);
CHECK_CUDA_STREAM(stream);
}
};
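With the logical path gone, only `transpose_2d_physical` is launched. Each thread block stages one TILE_DIM x TILE_DIM tile through shared memory (padded by one column to avoid bank conflicts), and each thread moves TILE_DIM / BLOCK_ROWS elements per phase. A host-side sketch of the launch arithmetic in plain C++ — `LaunchGeometry` and `transpose_geometry` are illustrative names, not part of the code above:

```cpp
#include <cstdint>

constexpr int TILE_DIM   = 32;
constexpr int BLOCK_ROWS = 8;

// Grid/block geometry for an m x n rectangle, matching the dim3 computation
// in transpose.cu: one block per TILE_DIM x TILE_DIM tile, TILE_DIM x
// BLOCK_ROWS threads per block.
struct LaunchGeometry {
  int grid_x, grid_y;
  int block_x, block_y;
  int elems_per_thread;
};

inline LaunchGeometry transpose_geometry(std::int64_t m, std::int64_t n)
{
  LaunchGeometry g{};
  g.grid_x           = static_cast<int>((n + TILE_DIM - 1) / TILE_DIM);  // ceil(n / TILE_DIM)
  g.grid_y           = static_cast<int>((m + TILE_DIM - 1) / TILE_DIM);  // ceil(m / TILE_DIM)
  g.block_x          = TILE_DIM;
  g.block_y          = BLOCK_ROWS;
  g.elems_per_thread = TILE_DIM / BLOCK_ROWS;  // 4 elements per load/store phase
  return g;
}
```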
1 change: 0 additions & 1 deletion src/cunumeric/matrix/transpose.h
@@ -23,7 +23,6 @@ namespace cunumeric {
struct TransposeArgs {
const Array& out;
const Array& in;
bool logical;
};

class TransposeTask : public CuNumericTask<TransposeTask> {
31 changes: 9 additions & 22 deletions src/cunumeric/matrix/transpose_omp.cc
@@ -28,33 +28,20 @@ template <Type::Code CODE>
struct TransposeImplBody<VariantKind::OMP, CODE> {
using VAL = legate_type_of<CODE>;

void operator()(const Rect<2>& out_rect,
const Rect<2>& in_rect,
void operator()(const Rect<2>& rect,
const AccessorWO<VAL, 2>& out,
const AccessorRO<VAL, 2>& in,
bool logical) const
const AccessorRO<VAL, 2>& in) const
{
constexpr coord_t BF = 128 / sizeof(VAL);
if (logical)
#pragma omp parallel for
for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[j2][i2] = in[i2][j2];
}
}
else
#pragma omp parallel for
for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
}
for (auto i1 = rect.lo[0]; i1 <= rect.hi[0]; i1 += BF) {
for (auto j1 = rect.lo[1]; j1 <= rect.hi[1]; j1 += BF) {
const auto max_i2 = ((i1 + BF) <= rect.hi[0]) ? i1 + BF : rect.hi[0];
const auto max_j2 = ((j1 + BF) <= rect.hi[1]) ? j1 + BF : rect.hi[1];
for (auto i2 = i1; i2 <= max_i2; i2++)
for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
}
}
}
};

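The OpenMP variant applies `#pragma omp parallel for` to the outermost block loop only, so each thread handles whole rows of BF-by-BF blocks. A standalone sketch with raw buffers, compiled with OpenMP enabled (e.g. `-fopenmp`) — `blocked_copy_omp` is an illustrative name, not the accessor-based code above:

```cpp
#include <algorithm>
#include <cstdint>

// Same blocked copy as the CPU sketch, with only the outermost loop
// parallelized (as in transpose_omp.cc): each thread gets whole rows of
// BF x BF blocks.
template <typename VAL>
void blocked_copy_omp(VAL* out, const VAL* in, std::int64_t m, std::int64_t n)
{
  constexpr std::int64_t BF = 128 / static_cast<std::int64_t>(sizeof(VAL));
#pragma omp parallel for
  for (std::int64_t i1 = 0; i1 < m; i1 += BF)
    for (std::int64_t j1 = 0; j1 < n; j1 += BF) {
      const auto max_i = std::min(i1 + BF, m);
      const auto max_j = std::min(j1 + BF, n);
      for (std::int64_t i = i1; i < max_i; ++i)
        for (std::int64_t j = j1; j < max_j; ++j)
          out[j * m + i] = in[i * n + j];
    }
}
```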
16 changes: 4 additions & 12 deletions src/cunumeric/matrix/transpose_template.inl
@@ -33,20 +33,13 @@ struct TransposeImpl {
{
using VAL = legate_type_of<CODE>;

const auto out_rect = args.out.shape<2>();
if (out_rect.empty()) return;

Rect<2> in_rect;
if (args.logical) {
in_rect.lo = Point<2>(out_rect.lo[1], out_rect.lo[0]);
in_rect.hi = Point<2>(out_rect.hi[1], out_rect.hi[0]);
} else
in_rect = out_rect;
const auto rect = args.out.shape<2>();
if (rect.empty()) return;

auto out = args.out.write_accessor<VAL, 2>();
auto in = args.in.read_accessor<VAL, 2>();

TransposeImplBody<KIND, CODE>{}(out_rect, in_rect, out, in, args.logical);
TransposeImplBody<KIND, CODE>{}(rect, out, in);
}
};

@@ -55,9 +48,8 @@ static void transpose_template(TaskContext& context)
{
auto& output = context.outputs()[0];
auto& input = context.inputs()[0];
auto logical = context.scalars()[0].value<bool>();

TransposeArgs args{output, input, logical};
TransposeArgs args{output, input};
type_dispatch(input.code(), TransposeImpl<KIND>{}, args);
}
