From 2b99a5446dcf64e9c9412317c3e2f578aca9211d Mon Sep 17 00:00:00 2001
From: Manolis Papadakis <manopapad@gmail.com>
Date: Tue, 9 Jan 2024 14:06:06 -0800
Subject: [PATCH] Remove logical version of transpose

---
 cunumeric/linalg/cholesky.py                |  2 -
 src/cunumeric/mapper.cc                     | 16 ++--
 src/cunumeric/matrix/transpose.cc           | 30 ++-----
 src/cunumeric/matrix/transpose.cu           | 99 ++++-----------------
 src/cunumeric/matrix/transpose.h            |  1 -
 src/cunumeric/matrix/transpose_omp.cc       | 31 ++-----
 src/cunumeric/matrix/transpose_template.inl | 16 +---
 7 files changed, 45 insertions(+), 150 deletions(-)
diff --git a/cunumeric/linalg/cholesky.py b/cunumeric/linalg/cholesky.py
index eed4c3188..8eccd2944 100644
--- a/cunumeric/linalg/cholesky.py
+++ b/cunumeric/linalg/cholesky.py
@@ -40,7 +40,6 @@ def transpose_copy_single(
     task.add_input(input)
     # Output has the same shape as input, but is mapped
     # to a column major instance
-    task.add_scalar_arg(False, ty.bool_)
 
     task.add_broadcast(output)
     task.add_broadcast(input)
@@ -62,7 +61,6 @@ def transpose_copy(
     task.add_input(p_input)
     # Output has the same shape as input, but is mapped
     # to a column major instance
-    task.add_scalar_arg(False, ty.bool_)
 
     task.execute()
 
diff --git a/src/cunumeric/mapper.cc b/src/cunumeric/mapper.cc
index ba7114e45..5fd36bceb 100644
--- a/src/cunumeric/mapper.cc
+++ b/src/cunumeric/mapper.cc
@@ -96,16 +96,12 @@ std::vector<StoreMapping> CuNumericMapper::store_mappings(
       return std::move(mappings);
     }
     case CUNUMERIC_TRANSPOSE_COPY_2D: {
-      auto logical = task.scalars()[0].value<bool>();
-      if (!logical) {
-        std::vector<StoreMapping> mappings;
-        auto& outputs = task.outputs();
-        mappings.push_back(StoreMapping::default_mapping(outputs[0], options.front()));
-        mappings.back().policy.ordering.set_fortran_order();
-        mappings.back().policy.exact = true;
-        return std::move(mappings);
-      } else
-        return {};
+      std::vector<StoreMapping> mappings;
+      auto& outputs = task.outputs();
+      mappings.push_back(StoreMapping::default_mapping(outputs[0], options.front()));
+      mappings.back().policy.ordering.set_fortran_order();
+      mappings.back().policy.exact = true;
+      return std::move(mappings);
     }
     case CUNUMERIC_MATMUL:
     case CUNUMERIC_MATVECMUL:
diff --git a/src/cunumeric/matrix/transpose.cc b/src/cunumeric/matrix/transpose.cc
index 224a36ab2..285aac4da 100644
--- a/src/cunumeric/matrix/transpose.cc
+++ b/src/cunumeric/matrix/transpose.cc
@@ -30,31 +30,19 @@ template <Type::Code CODE>
 struct TransposeImplBody<VariantKind::CPU, CODE> {
   using VAL = legate_type_of<CODE>;
 
-  void operator()(const Rect<2>& out_rect,
-                  const Rect<2>& in_rect,
+  void operator()(const Rect<2>& rect,
                   const AccessorWO<VAL, 2>& out,
-                  const AccessorRO<VAL, 2>& in,
-                  bool logical) const
+                  const AccessorRO<VAL, 2>& in) const
   {
     constexpr coord_t BF = 128 / sizeof(VAL);
-    if (logical)
-      for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
-        for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
-          const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
-          const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
-          for (auto i2 = i1; i2 <= max_i2; i2++)
-            for (auto j2 = j1; j2 <= max_j2; j2++) out[j2][i2] = in[i2][j2];
-        }
-      }
-    else
-      for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
-        for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
-          const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
-          const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
-          for (auto i2 = i1; i2 <= max_i2; i2++)
-            for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
-        }
+    for (auto i1 = rect.lo[0]; i1 <= rect.hi[0]; i1 += BF) {
+      for (auto j1 = rect.lo[1]; j1 <= rect.hi[1]; j1 += BF) {
+        const auto max_i2 = ((i1 + BF) <= rect.hi[0]) ? i1 + BF : rect.hi[0];
+        const auto max_j2 = ((j1 + BF) <= rect.hi[1]) ? j1 + BF : rect.hi[1];
+        for (auto i2 = i1; i2 <= max_i2; i2++)
+          for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
       }
+    }
   }
 };
 
diff --git a/src/cunumeric/matrix/transpose.cu b/src/cunumeric/matrix/transpose.cu
index 5ccd3ef7a..bf296e0d8 100644
--- a/src/cunumeric/matrix/transpose.cu
+++ b/src/cunumeric/matrix/transpose.cu
@@ -24,70 +24,12 @@ namespace cunumeric {
 #define TILE_DIM 32
 #define BLOCK_ROWS 8
 
-template <typename VAL>
-__global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM)
-  transpose_2d_logical(const AccessorWO<VAL, 2> out,
-                       const AccessorRO<VAL, 2> in,
-                       const Point<2> lo_in,
-                       const Point<2> hi_in,
-                       const Point<2> lo_out,
-                       const Point<2> hi_out)
-{
-  __shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/];
-
-  // These are reversed here for coalescing
-  coord_t x = blockIdx.y * TILE_DIM + threadIdx.y;
-  coord_t y = blockIdx.x * TILE_DIM + threadIdx.x;
-
-  // Check to see if we hit our y-bounds, if so we can just mask off those threads
-  if ((lo_in[1] + y) <= hi_in[1]) {
-    // Check to see if we're going to hit our x-bounds while striding
-    if ((lo_in[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi_in[0]) {
-// No overflow case
-#pragma unroll
-      for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
-    } else {
-// Overflow case
-#pragma unroll
-      for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        if ((lo_in[0] + x + i) <= hi_in[0])
-          tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
-    }
-  }
-  // Make sure all the data is in shared memory
-  __syncthreads();
-
-  // Transpose the coordinates
-  x = blockIdx.x * TILE_DIM + threadIdx.y;
-  y = blockIdx.y * TILE_DIM + threadIdx.x;
-
-  // Check to see if we hit our y-bounds, if so we can just mask off those threads
-  if ((lo_out[1] + y) <= hi_out[1]) {
-    // Check to see if we're going to hit our x-bounds while striding
-    if ((lo_out[0] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi_out[0]) {
-// No overflow case
-#pragma unroll
-      for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        out[lo_out + Point<2>(x + i, y)] = tile[threadIdx.x][threadIdx.y + i];
-    } else {
-// Overflow case
-#pragma unroll
-      for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        if ((lo_out[0] + x + i) <= hi_out[0])
-          out[lo_out + Point<2>(x + i, y)] = tile[threadIdx.x][threadIdx.y + i];
-    }
-  }
-}
-
 template <typename VAL>
 __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM)
   transpose_2d_physical(const AccessorWO<VAL, 2> out,
                         const AccessorRO<VAL, 2> in,
-                        const Point<2> lo_in,
-                        const Point<2> hi_in,
-                        const Point<2> lo_out,
-                        const Point<2> hi_out)
+                        const Point<2> lo,
+                        const Point<2> hi)
 {
   __shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/];
 
@@ -96,19 +38,19 @@ __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_S
   coord_t y = blockIdx.x * TILE_DIM + threadIdx.x;
 
   // Check to see if we hit our y-bounds, if so we can just mask off those threads
-  if ((lo_in[1] + y) <= hi_in[1]) {
+  if ((lo[1] + y) <= hi[1]) {
     // Check to see if we're going to hit our x-bounds while striding
-    if ((lo_in[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi_in[0]) {
+    if ((lo[0] + (blockIdx.y + 1) * TILE_DIM - 1) <= hi[0]) {
 // No overflow case
 #pragma unroll
       for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
+        tile[threadIdx.y + i][threadIdx.x] = in[lo + Point<2>(x + i, y)];
     } else {
 // Overflow case
 #pragma unroll
       for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        if ((lo_in[0] + x + i) <= hi_in[0])
-          tile[threadIdx.y + i][threadIdx.x] = in[lo_in + Point<2>(x + i, y)];
+        if ((lo[0] + x + i) <= hi[0])
+          tile[threadIdx.y + i][threadIdx.x] = in[lo + Point<2>(x + i, y)];
     }
   }
 
@@ -119,19 +61,19 @@ __global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_S
   y = blockIdx.x * TILE_DIM + threadIdx.y;
 
   // Check to see if we hit our x-bounds, if so we can just mask off those threads
-  if ((lo_out[0] + x) <= hi_out[0]) {
+  if ((lo[0] + x) <= hi[0]) {
     // Check to see if we're going to hit our y-bounds while striding
-    if ((lo_out[1] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi_out[1]) {
+    if ((lo[1] + (blockIdx.x + 1) * TILE_DIM - 1) <= hi[1]) {
 // No overflow case
 #pragma unroll
       for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        out[lo_out + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
+        out[lo + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
     } else {
 // Overflow case
 #pragma unroll
       for (int i = 0; i < TILE_DIM; i += BLOCK_ROWS)
-        if ((lo_out[1] + y + i) <= hi_out[1])
-          out[lo_out + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
+        if ((lo[1] + y + i) <= hi[1])
+          out[lo + Point<2>(x, y + i)] = tile[threadIdx.x][threadIdx.y + i];
     }
   }
 }
@@ -140,24 +82,17 @@ template <Type::Code CODE>
 struct TransposeImplBody<VariantKind::GPU, CODE> {
   using VAL = legate_type_of<CODE>;
 
-  void operator()(const Rect<2>& out_rect,
-                  const Rect<2>& in_rect,
+  void operator()(const Rect<2>& rect,
                   const AccessorWO<VAL, 2>& out,
-                  const AccessorRO<VAL, 2>& in,
-                  bool logical) const
+                  const AccessorRO<VAL, 2>& in) const
   {
-    const coord_t m = (in_rect.hi[0] - in_rect.lo[0]) + 1;
-    const coord_t n = (in_rect.hi[1] - in_rect.lo[1]) + 1;
+    const coord_t m = (rect.hi[0] - rect.lo[0]) + 1;
+    const coord_t n = (rect.hi[1] - rect.lo[1]) + 1;
     const dim3 blocks((n + TILE_DIM - 1) / TILE_DIM, (m + TILE_DIM - 1) / TILE_DIM, 1);
     const dim3 threads(TILE_DIM, BLOCK_ROWS, 1);
 
     auto stream = get_cached_stream();
-    if (logical)
-      transpose_2d_logical<VAL>
-        <<<blocks, threads, 0, stream>>>(out, in, in_rect.lo, in_rect.hi, out_rect.lo, out_rect.hi);
-    else
-      transpose_2d_physical<VAL>
-        <<<blocks, threads, 0, stream>>>(out, in, in_rect.lo, in_rect.hi, out_rect.lo, out_rect.hi);
+    transpose_2d_physical<VAL><<<blocks, threads, 0, stream>>>(out, in, rect.lo, rect.hi);
     CHECK_CUDA_STREAM(stream);
   }
 };
diff --git a/src/cunumeric/matrix/transpose.h b/src/cunumeric/matrix/transpose.h
index ee3951e72..9e2043c3e 100644
--- a/src/cunumeric/matrix/transpose.h
+++ b/src/cunumeric/matrix/transpose.h
@@ -23,7 +23,6 @@ namespace cunumeric {
 struct TransposeArgs {
   const Array& out;
   const Array& in;
-  bool logical;
 };
 
 class TransposeTask : public CuNumericTask<TransposeTask> {
diff --git a/src/cunumeric/matrix/transpose_omp.cc b/src/cunumeric/matrix/transpose_omp.cc
index 729719242..6cf589290 100644
--- a/src/cunumeric/matrix/transpose_omp.cc
+++ b/src/cunumeric/matrix/transpose_omp.cc
@@ -28,33 +28,20 @@ template <Type::Code CODE>
 struct TransposeImplBody<VariantKind::OMP, CODE> {
   using VAL = legate_type_of<CODE>;
 
-  void operator()(const Rect<2>& out_rect,
-                  const Rect<2>& in_rect,
+  void operator()(const Rect<2>& rect,
                   const AccessorWO<VAL, 2>& out,
-                  const AccessorRO<VAL, 2>& in,
-                  bool logical) const
+                  const AccessorRO<VAL, 2>& in) const
   {
     constexpr coord_t BF = 128 / sizeof(VAL);
-    if (logical)
 #pragma omp parallel for
-      for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
-        for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
-          const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
-          const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
-          for (auto i2 = i1; i2 <= max_i2; i2++)
-            for (auto j2 = j1; j2 <= max_j2; j2++) out[j2][i2] = in[i2][j2];
-        }
-      }
-    else
-#pragma omp parallel for
-      for (auto i1 = in_rect.lo[0]; i1 <= in_rect.hi[0]; i1 += BF) {
-        for (auto j1 = in_rect.lo[1]; j1 <= in_rect.hi[1]; j1 += BF) {
-          const auto max_i2 = ((i1 + BF) <= in_rect.hi[0]) ? i1 + BF : in_rect.hi[0];
-          const auto max_j2 = ((j1 + BF) <= in_rect.hi[1]) ? j1 + BF : in_rect.hi[1];
-          for (auto i2 = i1; i2 <= max_i2; i2++)
-            for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
-        }
+    for (auto i1 = rect.lo[0]; i1 <= rect.hi[0]; i1 += BF) {
+      for (auto j1 = rect.lo[1]; j1 <= rect.hi[1]; j1 += BF) {
+        const auto max_i2 = ((i1 + BF) <= rect.hi[0]) ? i1 + BF : rect.hi[0];
+        const auto max_j2 = ((j1 + BF) <= rect.hi[1]) ? j1 + BF : rect.hi[1];
+        for (auto i2 = i1; i2 <= max_i2; i2++)
+          for (auto j2 = j1; j2 <= max_j2; j2++) out[i2][j2] = in[i2][j2];
       }
+    }
   }
 };
 
diff --git a/src/cunumeric/matrix/transpose_template.inl b/src/cunumeric/matrix/transpose_template.inl
index 4d695c3cd..4d97aeb54 100644
--- a/src/cunumeric/matrix/transpose_template.inl
+++ b/src/cunumeric/matrix/transpose_template.inl
@@ -33,20 +33,13 @@ struct TransposeImpl {
   {
     using VAL = legate_type_of<CODE>;
 
-    const auto out_rect = args.out.shape<2>();
-    if (out_rect.empty()) return;
-
-    Rect<2> in_rect;
-    if (args.logical) {
-      in_rect.lo = Point<2>(out_rect.lo[1], out_rect.lo[0]);
-      in_rect.hi = Point<2>(out_rect.hi[1], out_rect.hi[0]);
-    } else
-      in_rect = out_rect;
+    const auto rect = args.out.shape<2>();
+    if (rect.empty()) return;
 
     auto out = args.out.write_accessor<VAL, 2>();
     auto in  = args.in.read_accessor<VAL, 2>();
 
-    TransposeImplBody<KIND, CODE>{}(out_rect, in_rect, out, in, args.logical);
+    TransposeImplBody<KIND, CODE>{}(rect, out, in);
   }
 };
 
@@ -55,9 +48,8 @@ static void transpose_template(TaskContext& context)
 {
   auto& output = context.outputs()[0];
   auto& input  = context.inputs()[0];
-  auto logical = context.scalars()[0].value<bool>();
 
-  TransposeArgs args{output, input, logical};
+  TransposeArgs args{output, input};
   type_dispatch(input.code(), TransposeImpl<KIND>{}, args);
 }