
Commit

missing files
jjwilke committed Aug 17, 2023
1 parent d6f5e3d commit 0f2bc69
Showing 5 changed files with 415 additions and 0 deletions.
77 changes: 77 additions & 0 deletions src/cunumeric/matrix/batched_cholesky.cc
@@ -0,0 +1,77 @@
/* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#include "cunumeric/matrix/batched_cholesky.h"
#include "cunumeric/matrix/batched_cholesky_template.inl"

#include <algorithm>
#include <cblas.h>
#include <core/type/type_info.h>
#include <lapack.h>

namespace cunumeric {

using namespace legate;

template <Type::Code CODE>
struct BatchedTransposeImplBody<VariantKind::CPU, CODE> {
using VAL = legate_type_of<CODE>;

static constexpr int tile_size = 64;

void operator()(VAL* out, int n) const
{
  VAL tile[tile_size][tile_size];
  int nblocks = (n + tile_size - 1) / tile_size;

  for (int rb = 0; rb < nblocks; ++rb) {
    for (int cb = 0; cb < nblocks; ++cb) {
      int r_start = rb * tile_size;
      int r_stop  = std::min(r_start + tile_size, n);
      int c_start = cb * tile_size;
      int c_stop  = std::min(c_start + tile_size, n);
      // Stage this block: keep the upper triangle (r <= c), zero the rest
      for (int r = r_start, tr = 0; r < r_stop; ++r, ++tr) {
        for (int c = c_start, tc = 0; c < c_stop; ++c, ++tc) {
          if (r <= c) {
            tile[tr][tc] = out[r * n + c];
          } else {
            tile[tr][tc] = 0;
          }
        }
      }
      // Write the block back transposed into the mirrored block position
      for (int r = c_start, tr = 0; r < c_stop; ++r, ++tr) {
        for (int c = r_start, tc = 0; c < r_stop; ++c, ++tc) {
          out[r * n + c] = tile[tc][tr];
        }
      }
    }
  }
}
};

/*static*/ void BatchedCholeskyTask::cpu_variant(TaskContext& context)
{
#ifdef LEGATE_USE_OPENMP
openblas_set_num_threads(1);  // keep OpenBLAS single-threaded so it does not oversubscribe the CPU variant
#endif
batched_cholesky_task_context_dispatch<VariantKind::CPU>(context);
}

namespace // unnamed
{
static void __attribute__((constructor)) register_tasks(void) { BatchedCholeskyTask::register_variants(); }
} // namespace

} // namespace cunumeric
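
The CPU body above works tile by tile: the upper-triangular entries (r <= c) of each 64 x 64 block are staged in a stack buffer and then written back transposed into the mirrored block position, so the matrix ends up holding the transpose of its upper triangle with zeros above the diagonal. Below is a minimal standalone sketch of the same in-place operation, not part of this commit; the function name, the tile size of 4, and the small self-check are made up for illustration.

#include <algorithm>
#include <cassert>
#include <vector>

// Hypothetical standalone version of the tiled transpose used by the CPU body:
// keep the upper triangle of a row-major n x n matrix, write its transpose into
// the lower triangle, and zero everything strictly above the diagonal.
static void lower_from_upper_tiled(double* out, int n, int tile_size = 4)
{
  std::vector<double> tile(tile_size * tile_size);
  int nblocks = (n + tile_size - 1) / tile_size;
  for (int rb = 0; rb < nblocks; ++rb) {
    for (int cb = 0; cb < nblocks; ++cb) {
      int r_start = rb * tile_size, r_stop = std::min(r_start + tile_size, n);
      int c_start = cb * tile_size, c_stop = std::min(c_start + tile_size, n);
      // Stage this block: upper-triangular entries are kept, the rest become zero
      for (int r = r_start, tr = 0; r < r_stop; ++r, ++tr)
        for (int c = c_start, tc = 0; c < c_stop; ++c, ++tc)
          tile[tr * tile_size + tc] = (r <= c) ? out[r * n + c] : 0.0;
      // Write the block back transposed into the mirrored block (cb, rb).
      // Row-major block order keeps this in-place update safe: a mirrored block
      // is only overwritten after its values have been read, or when it lies
      // below the diagonal and its values are never needed.
      for (int r = c_start, tr = 0; r < c_stop; ++r, ++tr)
        for (int c = r_start, tc = 0; c < r_stop; ++c, ++tc)
          out[r * n + c] = tile[tc * tile_size + tr];
    }
  }
}

int main()
{
  const int n = 5;
  std::vector<double> a(n * n), ref(n * n);
  for (int i = 0; i < n * n; ++i) a[i] = i + 1;
  // Naive reference: result(i, j) = original(j, i) for j <= i, else 0
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j) ref[i * n + j] = (j <= i) ? a[j * n + i] : 0.0;
  lower_from_upper_tiled(a.data(), n);
  for (int i = 0; i < n * n; ++i) assert(a[i] == ref[i]);
  return 0;
}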
98 changes: 98 additions & 0 deletions src/cunumeric/matrix/batched_cholesky.cu
@@ -0,0 +1,98 @@
/* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#include "cunumeric/matrix/batched_cholesky.h"
#include "cunumeric/matrix/potrf.h"
#include "cunumeric/matrix/batched_cholesky_template.inl"

#include "cunumeric/cuda_help.h"

namespace cunumeric {

using namespace legate;

#define TILE_DIM 32
#define BLOCK_ROWS 8

template <typename VAL>
__global__ static void __launch_bounds__((TILE_DIM * BLOCK_ROWS), MIN_CTAS_PER_SM)
transpose_2d_lower(VAL* out, int n)
{
__shared__ VAL tile[TILE_DIM][TILE_DIM + 1 /*avoid bank conflicts*/];

// The y dim is the fast-moving index for coalescing
auto r = blockIdx.x * TILE_DIM + threadIdx.x;
auto c = blockIdx.y * TILE_DIM + threadIdx.y;
auto stride = BLOCK_ROWS;
// The tile coordinates
auto tr = threadIdx.x;
auto tc = threadIdx.y;
auto offset = r*n + c;
#pragma unroll
for (int i=0; i < TILE_DIM; i += BLOCK_ROWS, offset += stride){
if (r < n && (c+i) < n){
if (r <= (c+i)){
tile[tr][tc+i] = out[offset];
} else {
tile[tr][tc+i] = 0;
}
}
}

// Make sure all the data is in shared memory
__syncthreads();

// Transpose the global coordinates, keep y the fast-moving index
r = blockIdx.y * TILE_DIM + threadIdx.x;
c = blockIdx.x * TILE_DIM + threadIdx.y;
offset = r*n + c;

#pragma unroll
for (int i=0; i < TILE_DIM; i += BLOCK_ROWS, offset += stride){
// only store to the lower triangle
if (r < n && (c+i) < n){
out[offset] = tile[tc+i][tr];
}
}
}

template <Type::Code CODE>
struct BatchedTransposeImplBody<VariantKind::GPU, CODE> {
using VAL = legate_type_of<CODE>;

void operator()(VAL* out, int n) const
{
const dim3 blocks((n + TILE_DIM - 1) / TILE_DIM, (n + TILE_DIM - 1) / TILE_DIM, 1);
const dim3 threads(TILE_DIM, BLOCK_ROWS, 1);

auto stream = get_cached_stream();

// CUDA potrf produces the full matrix; we only want
// the lower triangle
transpose_2d_lower<VAL>
<<<blocks, threads, 0, stream>>>(out, n);

CHECK_CUDA_STREAM(stream);
}
};


/*static*/ void BatchedCholeskyTask::gpu_variant(TaskContext& context)
{
batched_cholesky_task_context_dispatch<VariantKind::GPU>(context);
}

} // namespace cunumeric
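
The kernel above assigns one TILE_DIM x TILE_DIM tile of the matrix to each thread block of TILE_DIM x BLOCK_ROWS threads, so each thread handles TILE_DIM / BLOCK_ROWS = 4 elements of its tile, and the extra column in the shared tile (TILE_DIM + 1) is the usual padding to avoid bank conflicts when the staged tile is read back transposed. As a hypothetical host-side check, not part of this commit, the post-condition the kernel is meant to establish can be stated directly: entry (i, j) of the result equals the original entry (j, i) when j <= i and is zero strictly above the diagonal. The helper name and the 3 x 3 values below are invented for illustration.

#include <cassert>
#include <vector>

// Returns true if `after` is the lower-triangular transpose of `before`'s upper
// triangle: after(i, j) == before(j, i) for j <= i, and 0 strictly above the diagonal.
template <typename VAL>
bool is_lower_transpose(const std::vector<VAL>& before, const std::vector<VAL>& after, int n)
{
  for (int i = 0; i < n; ++i)
    for (int j = 0; j < n; ++j) {
      VAL expected = (j <= i) ? before[j * n + i] : VAL(0);
      if (after[i * n + j] != expected) return false;
    }
  return true;
}

int main()
{
  // Tiny 3 x 3 example of the expected before/after pair.
  std::vector<double> before = {1, 2, 3, 0, 4, 5, 0, 0, 6};
  std::vector<double> after  = {1, 0, 0, 2, 4, 0, 3, 5, 6};
  assert(is_lower_transpose(before, after, 3));
  return 0;
}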
38 changes: 38 additions & 0 deletions src/cunumeric/matrix/batched_cholesky.h
@@ -0,0 +1,38 @@
/* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#pragma once

#include "cunumeric/cunumeric.h"
#include "cunumeric/cunumeric_c.h"

namespace cunumeric {

class BatchedCholeskyTask : public CuNumericTask<BatchedCholeskyTask> {
public:
static const int TASK_ID = CUNUMERIC_BATCHED_CHOLESKY;

public:
static void cpu_variant(legate::TaskContext& context);
#ifdef LEGATE_USE_OPENMP
static void omp_variant(legate::TaskContext& context);
#endif
#ifdef LEGATE_USE_CUDA
static void gpu_variant(legate::TaskContext& context);
#endif
};

} // namespace cunumeric
75 changes: 75 additions & 0 deletions src/cunumeric/matrix/batched_cholesky_omp.cc
@@ -0,0 +1,75 @@
/* Copyright 2021-2022 NVIDIA Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
*/

#include "cunumeric/matrix/batched_cholesky.h"
#include "cunumeric/matrix/batched_cholesky_template.inl"

#include <algorithm>
#include <cblas.h>
#include <lapack.h>
#include <omp.h>

namespace cunumeric {

using namespace legate;

template <Type::Code CODE>
struct BatchedTransposeImplBody<VariantKind::OMP, CODE> {
using VAL = legate_type_of<CODE>;

static constexpr int tile_size = 64;

void operator()(VAL* out, int n) const
{

  int nblocks = (n + tile_size - 1) / tile_size;

#pragma omp parallel for
  for (int rb = 0; rb < nblocks; ++rb) {
    for (int cb = 0; cb < nblocks; ++cb) {
      // Each OpenMP thread stages its block in a private stack tile
      VAL tile[tile_size][tile_size];
      int r_start = rb * tile_size;
      int r_stop  = std::min(r_start + tile_size, n);
      int c_start = cb * tile_size;
      int c_stop  = std::min(c_start + tile_size, n);

      // Stage this block: keep the upper triangle (r <= c), zero the rest
      for (int r = r_start, tr = 0; r < r_stop; ++r, ++tr) {
        for (int c = c_start, tc = 0; c < c_stop; ++c, ++tc) {
          if (r <= c) {
            tile[tr][tc] = out[r * n + c];
          } else {
            tile[tr][tc] = 0;
          }
        }
      }

      // Write the block back transposed into the mirrored block position
      for (int r = c_start, tr = 0; r < c_stop; ++r, ++tr) {
        for (int c = r_start, tc = 0; c < r_stop; ++c, ++tc) {
          out[r * n + c] = tile[tc][tr];
        }
      }
    }
  }
}
};

/*static*/ void BatchedCholeskyTask::omp_variant(TaskContext& context)
{
openblas_set_num_threads(omp_get_max_threads());
batched_cholesky_task_context_dispatch<VariantKind::OMP>(context);
}

} // namespace cunumeric
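
The transpose bodies in the three variants each operate on a single n x n matrix; the batching and the Cholesky dispatch live in batched_cholesky_template.inl, which is included at the top of each translation unit but is not shown above. Purely as a hypothetical illustration, not the commit's actual template, a driver could apply such a per-matrix body across a batch stored as contiguous row-major n x n matrices like this (the driver name and the stand-in lambda are invented):

#include <cstddef>
#include <vector>

// Hypothetical batched driver: applies a per-matrix body to each contiguous
// row-major n x n slab of the batch. The OMP body above already parallelizes
// over tiles internally, so this outer loop is kept serial.
template <typename VAL, typename Body>
void apply_to_batch(VAL* batch, std::size_t batch_size, int n, const Body& body)
{
  const std::size_t stride = static_cast<std::size_t>(n) * n;
  for (std::size_t b = 0; b < batch_size; ++b) {
    body(batch + b * stride, n);
  }
}

int main()
{
  std::vector<float> batch(2 * 3 * 3, 1.0f);
  // Stand-in body that zeroes the strict upper triangle of each matrix.
  auto body = [](float* m, int n) {
    for (int i = 0; i < n; ++i)
      for (int j = i + 1; j < n; ++j) m[i * n + j] = 0.0f;
  };
  apply_to_batch(batch.data(), 2, 3, body);
  return 0;
}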
(The remaining changed file did not load in this view.)
