diff --git a/cunumeric/linalg/cholesky.py b/cunumeric/linalg/cholesky.py index eed4c3188..8eccd2944 100644 --- a/cunumeric/linalg/cholesky.py +++ b/cunumeric/linalg/cholesky.py @@ -40,7 +40,6 @@ def transpose_copy_single( task.add_input(input) # Output has the same shape as input, but is mapped # to a column major instance - task.add_scalar_arg(False, ty.bool_) task.add_broadcast(output) task.add_broadcast(input) @@ -62,7 +61,6 @@ def transpose_copy( task.add_input(p_input) # Output has the same shape as input, but is mapped # to a column major instance - task.add_scalar_arg(False, ty.bool_) task.execute() diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc index ba7114e45..5fd36bceb 100644 --- a/src/cunumeric/mapper.cc +++ b/src/cunumeric/mapper.cc @@ -96,16 +96,12 @@ std::vector CuNumericMapper::store_mappings( return std::move(mappings); } case CUNUMERIC_TRANSPOSE_COPY_2D: { - auto logical = task.scalars()[0].value(); - if (!logical) { - std::vector mappings; - auto& outputs = task.outputs(); - mappings.push_back(StoreMapping::default_mapping(outputs[0], options.front())); - mappings.back().policy.ordering.set_fortran_order(); - mappings.back().policy.exact = true; - return std::move(mappings); - } else - return {}; + std::vector mappings; + auto& outputs = task.outputs(); + mappings.push_back(StoreMapping::default_mapping(outputs[0], options.front())); + mappings.back().policy.ordering.set_fortran_order(); + mappings.back().policy.exact = true; + return std::move(mappings); } case CUNUMERIC_MATMUL: case CUNUMERIC_MATVECMUL: diff --git a/src/cunumeric/matrix/transpose.cc b/src/cunumeric/matrix/transpose.cc index 224a36ab2..285aac4da 100644 --- a/src/cunumeric/matrix/transpose.cc +++ b/src/cunumeric/matrix/transpose.cc @@ -30,31 +30,19 @@ template struct TransposeImplBody { using VAL = legate_type_of; - void operator()(const Rect<2>& out_rect, - const Rect<2>& in_rect, + void operator()(const Rect<2>& rect, const AccessorWO& out, - const AccessorRO& in, - bool logical) const + const AccessorRO& in) const { constexpr coord_t BF = 128 / sizeof(VAL); - if (logical) - for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) { - for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) { - const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0]; - const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1]; - for (auto i2 = i1; i2 <= max_i2; i2++) - for (auto j2 = j1; j2 <= max_j2; j2++) out[j2][i2] = in[i2][j2]; - } - } - else - for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) { - for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) { - const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0]; - const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1]; - for (auto i2 = i1; i2 <= max_i2; i2++) - for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2]; - } + for (auto i1 = rect.lo[0]; i1 <= rect.hi[0]; i1 += BF) { + for (auto j1 = rect.lo[1]; j1 <= rect.hi[1]; j1 += BF) { + const auto max_i2 = ((i1 + BF) <= rect.hi[0]) ? i1 + BF : rect.hi[0]; + const auto max_j2 = ((j1 + BF) <= rect.hi[1]) ? j1 + BF : rect.hi[1]; + for (auto i2 = i1; i2 <= max_i2; i2++) + for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2]; } + } } }; diff --git a/src/cunumeric/matrix/transpose.cu b/src/cunumeric/matrix/transpose.cu index 5ccd3ef7a..bf296e0d8 100644 --- a/src/cunumeric/matrix/transpose.cu +++ b/src/cunumeric/matrix/transpose.cu @@ -24,70 +24,12 @@ namespace cunumeric { #define TILE_DIM 32 #define BLOCK_ROWS 8 -template -__global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM) - transpose_2d_logical(const AccessorWO out, - const AccessorRO in, - const Point<2> lo_in, - const Point<2> hi_in, - const Point<2> lo_out, - const Point<2> hi_out) -{ - __shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/]; - - // These are reversed here for coalescing - coord_t x = blockIdx.y * TILE_DIM + threadIdx.y; - coord_t y = blockIdx.x * TILE_DIM + threadIdx.x; - - // Check to see if we hit our y-bounds, if so we can just mask off those threads - if ((lo_in[1] + y) <= hi_in[1]) { - // Check to see if we're going to hit our x-bounds while striding - if ((lo_in[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi_in[0]) { -// No overflow case -#pragma unroll - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)]; - } else { -// Overflow case -#pragma unroll - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - if ((lo_in[0] + x + i) <= hi_in[0]) - tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)]; - } - } - // Make sure all the data is in shared memory - __syncthreads(); - - // Transpose the coordinates - x = blockIdx.x * TILE_DIM + threadIdx.y; - y = blockIdx.y * TILE_DIM + threadIdx.x; - - // Check to see if we hit our y-bounds, if so we can just mask off those threads - if ((lo_out[1] + y) <= hi_out[1]) { - // Check to see if we're going to hit our x-bounds while striding - if ((lo_out[0] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi_out[0]) { -// No overflow case -#pragma unroll - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - out[lo_out + Point<2>(x + i, y)] = tile[threadIdx.x][threadIdx.y + i]; - } else { -// Overflow case -#pragma unroll - for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - if ((lo_out[0] + x + i) <= hi_out[0]) - out[lo_out + Point<2>(x + i, y)] = tile[threadIdx.x][threadIdx.y + i]; - } - } -} - template __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM) transpose_2d_physical(const AccessorWO out, const AccessorRO in, - const Point<2> lo_in, - const Point<2> hi_in, - const Point<2> lo_out, - const Point<2> hi_out) + const Point<2> lo, + const Point<2> hi) { __shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/]; @@ -96,19 +38,19 @@ __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_S coord_t y = blockIdx.x * TILE_DIM + threadIdx.x; // Check to see if we hit our y-bounds, if so we can just mask off those threads - if ((lo_in[1] + y) <= hi_in[1]) { + if ((lo[1] + y) <= hi[1]) { // Check to see if we're going to hit our x-bounds while striding - if ((lo_in[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi_in[0]) { + if ((lo[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi[0]) { // No overflow case #pragma unroll for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)]; + tile[threadIdx.y + i][threadIdx.x] = in[lo + Point<2>(x + i, y)]; } else { // Overflow case #pragma unroll for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - if ((lo_in[0] + x + i) <= hi_in[0]) - tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)]; + if ((lo[0] + x + i) <= hi[0]) + tile[threadIdx.y + i][threadIdx.x] = in[lo + Point<2>(x + i, y)]; } } @@ -119,19 +61,19 @@ __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_S y = blockIdx.x * TILE_DIM + threadIdx.y; // Check to see if we hit our x-bounds, if so we can just mask off those threads - if ((lo_out[0] + x) <= hi_out[0]) { + if ((lo[0] + x) <= hi[0]) { // Check to see if we're going to hit our y-bounds while striding - if ((lo_out[1] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi_out[1]) { + if ((lo[1] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi[1]) { // No overflow case #pragma unroll for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - out[lo_out + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i]; + out[lo + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i]; } else { // Overflow case #pragma unroll for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS) - if ((lo_out[1] + y + i) <= hi_out[1]) - out[lo_out + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i]; + if ((lo[1] + y + i) <= hi[1]) + out[lo + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i]; } } } @@ -140,24 +82,17 @@ template struct TransposeImplBody { using VAL = legate_type_of; - void operator()(const Rect<2>& out_rect, - const Rect<2>& in_rect, + void operator()(const Rect<2>& rect, const AccessorWO& out, - const AccessorRO& in, - bool logical) const + const AccessorRO& in) const { - const coord_t m = (in_rect.hi[0] - in_rect.lo[0]) + 1; - const coord_t n = (in_rect.hi[1] - in_rect.lo[1]) + 1; + const coord_t m = (rect.hi[0] - rect.lo[0]) + 1; + const coord_t n = (rect.hi[1] - rect.lo[1]) + 1; const dim3 blocks((n + TILE_DIM - 1) / TILE_DIM, (m + TILE_DIM - 1) / TILE_DIM, 1); const dim3 threads(TILE_DIM, BLOCK_ROWS, 1); auto stream = get_cached_stream(); - if (logical) - transpose_2d_logical - <<>>(out, in, in_rect.lo, in_rect.hi, out_rect.lo, out_rect.hi); - else - transpose_2d_physical - <<>>(out, in, in_rect.lo, in_rect.hi, out_rect.lo, out_rect.hi); + transpose_2d_physical<<>>(out, in, rect.lo, rect.hi); CHECK_CUDA_STREAM(stream); } }; diff --git a/src/cunumeric/matrix/transpose.h b/src/cunumeric/matrix/transpose.h index ee3951e72..9e2043c3e 100644 --- a/src/cunumeric/matrix/transpose.h +++ b/src/cunumeric/matrix/transpose.h @@ -23,7 +23,6 @@ namespace cunumeric { struct TransposeArgs { const Array& out; const Array& in; - bool logical; }; class TransposeTask : public CuNumericTask { diff --git a/src/cunumeric/matrix/transpose_omp.cc b/src/cunumeric/matrix/transpose_omp.cc index 729719242..6cf589290 100644 --- a/src/cunumeric/matrix/transpose_omp.cc +++ b/src/cunumeric/matrix/transpose_omp.cc @@ -28,33 +28,20 @@ template struct TransposeImplBody { using VAL = legate_type_of; - void operator()(const Rect<2>& out_rect, - const Rect<2>& in_rect, + void operator()(const Rect<2>& rect, const AccessorWO& out, - const AccessorRO& in, - bool logical) const + const AccessorRO& in) const { constexpr coord_t BF = 128 / sizeof(VAL); - if (logical) #pragma omp parallel for - for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) { - for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) { - const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0]; - const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1]; - for (auto i2 = i1; i2 <= max_i2; i2++) - for (auto j2 = j1; j2 <= max_j2; j2++) out[j2][i2] = in[i2][j2]; - } - } - else -#pragma omp parallel for - for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) { - for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) { - const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0]; - const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1]; - for (auto i2 = i1; i2 <= max_i2; i2++) - for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2]; - } + for (auto i1 = rect.lo[0]; i1 <= rect.hi[0]; i1 += BF) { + for (auto j1 = rect.lo[1]; j1 <= rect.hi[1]; j1 += BF) { + const auto max_i2 = ((i1 + BF) <= rect.hi[0]) ? i1 + BF : rect.hi[0]; + const auto max_j2 = ((j1 + BF) <= rect.hi[1]) ? j1 + BF : rect.hi[1]; + for (auto i2 = i1; i2 <= max_i2; i2++) + for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2]; } + } } }; diff --git a/src/cunumeric/matrix/transpose_template.inl b/src/cunumeric/matrix/transpose_template.inl index 4d695c3cd..4d97aeb54 100644 --- a/src/cunumeric/matrix/transpose_template.inl +++ b/src/cunumeric/matrix/transpose_template.inl @@ -33,20 +33,13 @@ struct TransposeImpl { { using VAL = legate_type_of; - const auto out_rect = args.out.shape<2>(); - if (out_rect.empty()) return; - - Rect<2> in_rect; - if (args.logical) { - in_rect.lo = Point<2>(out_rect.lo[1], out_rect.lo[0]); - in_rect.hi = Point<2>(out_rect.hi[1], out_rect.hi[0]); - } else - in_rect = out_rect; + const auto rect = args.out.shape<2>(); + if (rect.empty()) return; auto out = args.out.write_accessor(); auto in = args.in.read_accessor(); - TransposeImplBody{}(out_rect, in_rect, out, in, args.logical); + TransposeImplBody{}(rect, out, in); } }; @@ -55,9 +48,8 @@ static void transpose_template(TaskContext& context) { auto& output = context.outputs()[0]; auto& input = context.inputs()[0]; - auto logical = context.scalars()[0].value(); - TransposeArgs args{output, input, logical}; + TransposeArgs args{output, input}; type_dispatch(input.code(), TransposeImpl{}, args); }