missing files

nv-legate · Aug 17, 2023 · 0f2bc69 · 0f2bc69
1 parent d6f5e3d
commit 0f2bc69
Show file tree

Hide file tree

Showing 5 changed files with 415 additions and 0 deletions.
diff --git a/src/cunumeric/matrix/batched_cholesky.cc b/src/cunumeric/matrix/batched_cholesky.cc
@@ -0,0 +1,77 @@
+/* Copyright 2021-2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "cunumeric/matrix/batched_cholesky.h"
+#include "cunumeric/matrix/batched_cholesky_template.inl"
+
+#include <cblas.h>
+#include <core/type/type_info.h>
+#include <lapack.h>
+
+namespace cunumeric {
+
+using namespace legate;
+
+template <Type::Code CODE>
+struct BatchedTransposeImplBody<VariantKind::CPU, CODE> {
+  using VAL = legate_type_of<CODE>;
+
+  static constexpr int tile_size = 64;
+
+  void operator()(VAL* out, int n) const
+  {
+    VAL tile[tile_size][tile_size];
+    int nblocks = (n + tile_size - 1) / tile_size;
+
+    for (int rb=0; rb < nblocks; ++rb){
+      for (int cb=0; cb < nblocks; ++cb){
+        int r_start = rb * tile_size;
+        int r_stop = std::min(r_start + tile_size, n);
+        int c_start = cb * tile_size;
+        int c_stop = std::min(c_start + tile_size, n);
+        for (int r=r_start, tr=0; r < r_stop; ++r){
+          for (int c=c_start, tc=0; c < c_stop; ++c){
+            if (r <= c){
+              tile[tr][tc] = out[r*n + c];
+            } else {
+              tile[tr][tc] = 0;
+            }
+          }
+        }
+        for (int r=c_start, tr=0; r < c_stop; ++r){
+          for (int c=r_start, tc=0; c < r_stop; ++c){
+            out[r*n+c] = tile[tr][tc];
+          }
+        }
+      }
+    }
+  }
+};
+
+/*static*/ void BatchedCholeskyTask::cpu_variant(TaskContext& context)
+{
+#ifdef LEGATE_USE_OPENMP
+  openblas_set_num_threads(1);  // make sure this isn't overzealous
+#endif
+  batched_cholesky_task_context_dispatch<VariantKind::CPU>(context);
+}
+
+namespace  // unnamed
+{
+static void __attribute__((constructor)) register_tasks(void) { BatchedCholeskyTask::register_variants(); }
+}  // namespace
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/matrix/batched_cholesky.cu b/src/cunumeric/matrix/batched_cholesky.cu
@@ -0,0 +1,98 @@
+/* Copyright 2021-2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "cunumeric/matrix/batched_cholesky.h"
+#include "cunumeric/matrix/potrf.h"
+#include "cunumeric/matrix/batched_cholesky_template.inl"
+
+#include "cunumeric/cuda_help.h"
+
+namespace cunumeric {
+
+using namespace legate;
+
+#define TILE_DIM 32
+#define BLOCK_ROWS 8
+
+template <typename VAL>
+__global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM)
+  transpose_2d_lower(VAL* out, int n)
+{
+  __shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/];
+
+  // The y dim is fast-moving index for coalescing
+  auto r = blockIdx.x * TILE_DIM + threadIdx.x;
+  auto c = blockIdx.y * TILE_DIM + threadIdx.y;
+  auto stride = BLOCK_ROWS;
+  // The tile coordinates
+  auto tr = threadIdx.x;
+  auto tc = threadIdx.y;
+  auto offset = r*n + c;
+#pragma unroll
+  for (int i=0; i < TILE_DIM; i += BLOCK_ROWS, offset += stride){
+    if (r < n && (c+i) < n){
+      if (r <= (c+i)){
+        tile[tr][tc+i] = out[offset];
+      } else {
+        tile[tr][tc+i] = 0;
+      }
+    }
+  }
+
+  // Make sure all the data is in shared memory
+  __syncthreads();
+
+  // Transpose the global coordinates, keep y the fast-moving index
+  r = blockIdx.y * TILE_DIM + threadIdx.x;
+  c = blockIdx.x * TILE_DIM + threadIdx.y;
+  offset = r*n + c;
+
+#pragma unroll
+  for (int i=0; i < TILE_DIM; i += BLOCK_ROWS, offset += stride){
+    // only store to the lower triangle
+    if (r < n && (c+i) < n){
+      out[offset] = tile[tc+i][tr];
+    }
+  }
+}
+
+template <Type::Code CODE>
+struct BatchedTransposeImplBody<VariantKind::GPU, CODE> {
+  using VAL = legate_type_of<CODE>;
+
+  void operator()(VAL* out, int n) const
+  {
+    const dim3 blocks((n + TILE_DIM - 1) / TILE_DIM, (n + TILE_DIM - 1) / TILE_DIM, 1);
+    const dim3 threads(TILE_DIM, BLOCK_ROWS, 1);
+
+    auto stream = get_cached_stream();
+
+    // CUDA Potrf produces the full matrix, we only want
+    // the lower diagonal
+    transpose_2d_lower<VAL>
+      <<<blocks, threads, 0, stream>>>(out, n);
+
+    CHECK_CUDA_STREAM(stream);
+  }
+};
+
+
+/*static*/ void BatchedCholeskyTask::gpu_variant(TaskContext& context)
+{
+  batched_cholesky_task_context_dispatch<VariantKind::GPU>(context);
+}
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/matrix/batched_cholesky.h b/src/cunumeric/matrix/batched_cholesky.h
@@ -0,0 +1,38 @@
+/* Copyright 2021-2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#pragma once
+
+#include "cunumeric/cunumeric.h"
+#include "cunumeric/cunumeric_c.h"
+
+namespace cunumeric {
+
+class BatchedCholeskyTask : public CuNumericTask<BatchedCholeskyTask> {
+ public:
+  static const int TASK_ID = CUNUMERIC_BATCHED_CHOLESKY;
+
+ public:
+  static void cpu_variant(legate::TaskContext& context);
+#ifdef LEGATE_USE_OPENMP
+  static void omp_variant(legate::TaskContext& context);
+#endif
+#ifdef LEGATE_USE_CUDA
+  static void gpu_variant(legate::TaskContext& context);
+#endif
+};
+
+}  // namespace cunumeric
diff --git a/src/cunumeric/matrix/batched_cholesky_omp.cc b/src/cunumeric/matrix/batched_cholesky_omp.cc
@@ -0,0 +1,75 @@
+/* Copyright 2021-2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ *
+ */
+
+#include "cunumeric/matrix/batched_cholesky.h"
+#include "cunumeric/matrix/batched_cholesky_template.inl"
+
+#include <cblas.h>
+#include <lapack.h>
+#include <omp.h>
+
+namespace cunumeric {
+
+using namespace legate;
+
+template <Type::Code CODE>
+struct BatchedTransposeImplBody<VariantKind::OMP, CODE> {
+  using VAL = legate_type_of<CODE>;
+
+  static constexpr int tile_size = 64;
+
+  void operator()(VAL* out, int n) const
+  {
+
+    int nblocks = (n + tile_size - 1) / tile_size;
+
+#pragma omp parallel for
+    for (int rb=0; rb < nblocks; ++rb){
+      for (int cb=0; cb < nblocks; ++cb){
+        VAL tile[tile_size][tile_size];
+        int r_start = rb * tile_size;
+        int r_stop = std::min(r_start + tile_size, n);
+        int c_start = cb * tile_size;
+        int c_stop = std::min(c_start + tile_size, n);
+
+        for (int r=r_start, tr=0; r < r_stop; ++r){
+          for (int c=c_start, tc=0; c < c_stop; ++c){
+            if (r <= c){
+              tile[tr][tc] = out[r*n + c];
+            } else {
+              tile[tr][tc] = 0;
+            }
+          }
+        }
+
+        for (int r=c_start, tr=0; r < c_stop; ++r){
+          for (int c=r_start, tc=0; c < r_stop; ++c){
+            out[r*n+c] = tile[tr][tc];
+          }
+        }
+
+      }
+    }
+  }
+};
+
+/*static*/ void BatchedCholeskyTask::omp_variant(TaskContext& context)
+{
+  openblas_set_num_threads(omp_get_max_threads());
+  batched_cholesky_task_context_dispatch<VariantKind::OMP>(context);
+}
+
+}  // namespace cunumeric