Skip to content

Commit

Permalink
Merge format conversion, SpMV and write improvements
Browse files Browse the repository at this point in the history
This PR contains many small improvements to the format conversions, SpMVs and write functions

* Fix SpMV edge cases
* Simplify and unify matrix format conversions
* Initialize row_ptr like structures by default
* Guard against empty kernel launches
* remove unnecessary dim3 usage

Related PR: #905
  • Loading branch information
upsj authored Feb 1, 2022
2 parents f25b795 + a7379ad commit 0436f3e
Show file tree
Hide file tree
Showing 220 changed files with 5,479 additions and 9,514 deletions.
6 changes: 6 additions & 0 deletions benchmark/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,12 @@ endfunction()

function(ginkgo_benchmark_cusparse_linops type def)
add_library(cusparse_linops_${type} utils/cuda_linops.cu)
if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
# remove false positive CUDA warnings when calling one<T>() and zero<T>()
target_compile_options(cusparse_linops_${type}
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
endif()
# make the dependency public to catch issues
target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE})
Expand Down
75 changes: 69 additions & 6 deletions benchmark/utils/cuda_linops.cu
Original file line number Diff line number Diff line change
Expand Up @@ -76,12 +76,6 @@ public:
}

protected:
void apply_impl(const gko::LinOp*, const gko::LinOp*, const gko::LinOp*,
gko::LinOp*) const override
{
GKO_NOT_IMPLEMENTED;
}

CusparseBase(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::LinOp(exec, size)
Expand Down Expand Up @@ -143,6 +137,12 @@ class CusparseCsrmp
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
Expand Down Expand Up @@ -174,6 +174,10 @@ protected:
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsrmp(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseCsrmp, CusparseBase>(exec, size),
Expand Down Expand Up @@ -203,6 +207,12 @@ class CusparseCsr
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
Expand Down Expand Up @@ -234,6 +244,10 @@ protected:
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsr(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseCsr, CusparseBase>(exec, size),
Expand Down Expand Up @@ -264,6 +278,12 @@ class CusparseCsrmm
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
Expand Down Expand Up @@ -296,6 +316,10 @@ protected:
dense_x->get_size()[0]);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsrmm(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseCsrmm, CusparseBase>(exec, size),
Expand Down Expand Up @@ -329,6 +353,12 @@ class CusparseCsrEx
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
Expand Down Expand Up @@ -380,6 +410,9 @@ protected:
// DEVICE for Ginkgo
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsrEx(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
Expand Down Expand Up @@ -422,6 +455,12 @@ class CusparseHybrid
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
Expand Down Expand Up @@ -471,6 +510,10 @@ protected:
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseHybrid(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseHybrid, CusparseBase>(exec, size),
Expand Down Expand Up @@ -551,10 +594,16 @@ class CusparseGenericCsr
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
cusparseIndexType_t cu_index =
gko::kernels::cuda::cusparse_index_type<IndexType>();
cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
using gko::kernels::cuda::as_culibs_type;
Expand Down Expand Up @@ -598,6 +647,10 @@ protected:
Alg);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseGenericCsr(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseGenericCsr, CusparseBase>(exec, size),
Expand Down Expand Up @@ -629,10 +682,16 @@ class CusparseGenericCoo
public:
using coo = gko::matrix::Coo<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
cusparseIndexType_t cu_index =
gko::kernels::cuda::cusparse_index_type<IndexType>();
cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
using gko::kernels::cuda::as_culibs_type;
Expand Down Expand Up @@ -676,6 +735,10 @@ protected:
CUSPARSE_MV_ALG_DEFAULT);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseGenericCoo(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseGenericCoo, CusparseBase>(exec, size),
Expand Down
36 changes: 30 additions & 6 deletions benchmark/utils/hip_linops.hip.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -65,12 +65,6 @@ class HipsparseBase : public gko::LinOp {
const gko::HipExecutor* get_gpu_exec() const { return gpu_exec_.get(); }

protected:
void apply_impl(const gko::LinOp*, const gko::LinOp*, const gko::LinOp*,
gko::LinOp*) const override
{
GKO_NOT_IMPLEMENTED;
}

HipsparseBase(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::LinOp(exec, size)
Expand Down Expand Up @@ -130,6 +124,12 @@ class HipsparseCsr
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
Expand Down Expand Up @@ -161,6 +161,10 @@ class HipsparseCsr
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

HipsparseCsr(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<HipsparseCsr, HipsparseBase>(exec, size),
Expand Down Expand Up @@ -191,6 +195,12 @@ class HipsparseCsrmm
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
Expand Down Expand Up @@ -223,6 +233,10 @@ class HipsparseCsrmm
dense_x->get_size()[0]);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

HipsparseCsrmm(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<HipsparseCsrmm, HipsparseBase>(exec, size),
Expand Down Expand Up @@ -257,6 +271,12 @@ class HipsparseHybrid
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
Expand Down Expand Up @@ -306,6 +326,10 @@ class HipsparseHybrid
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

HipsparseHybrid(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<HipsparseHybrid, HipsparseBase>(exec, size),
Expand Down
3 changes: 2 additions & 1 deletion common/CMakeLists.txt
Original file line number Diff line number Diff line change
@@ -1,8 +1,9 @@
set(UNIFIED_SOURCES
base/index_set_kernels.cpp
components/device_matrix_data_kernels.cpp
components/absolute_array_kernels.cpp
components/device_matrix_data_kernels.cpp
components/fill_array_kernels.cpp
components/format_conversion_kernels.cpp
components/precision_conversion_kernels.cpp
components/reduce_array_kernels.cpp
distributed/partition_kernels.cpp
Expand Down
4 changes: 2 additions & 2 deletions common/cuda_hip/components/warp_blas.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform(
ValueType* __restrict__ row, bool& __restrict__ status)
{
auto key_col_elem = group.shfl(row[key_col], key_row);
if (key_col_elem == zero<ValueType>()) {
if (is_zero(key_col_elem)) {
// TODO: implement error handling for GPUs to be able to properly
// report it here
status = false;
Expand Down Expand Up @@ -105,7 +105,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform_with_rhs(
{
auto key_col_elem = group.shfl(row[key_col], key_row);
auto key_rhs_elem = group.shfl(rhs[0], key_row);
if (key_col_elem == zero<ValueType>()) {
if (is_zero(key_col_elem)) {
// TODO: implement error handling for GPUs to be able to properly
// report it here
status = false;
Expand Down
55 changes: 0 additions & 55 deletions common/cuda_hip/matrix/coo_kernels.hpp.inc
Original file line number Diff line number Diff line change
Expand Up @@ -218,58 +218,3 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmm(


} // namespace


namespace kernel {


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
const IndexType* __restrict__ idxs, size_type num_nonzeros,
IndexType* __restrict__ ptrs, size_type length)
{
const auto tidx = thread::get_thread_id_flat();

if (tidx == 0) {
ptrs[0] = 0;
ptrs[length - 1] = num_nonzeros;
}

if (0 < tidx && tidx < num_nonzeros) {
if (idxs[tidx - 1] < idxs[tidx]) {
for (auto i = idxs[tidx - 1] + 1; i <= idxs[tidx]; i++) {
ptrs[i] = tidx;
}
}
}
}


template <typename ValueType>
__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
size_type num_rows, size_type num_cols, size_type stride,
ValueType* __restrict__ result)
{
const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
if (tidx_x < num_cols && tidx_y < num_rows) {
result[tidx_y * stride + tidx_x] = zero<ValueType>();
}
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void fill_in_dense(
size_type nnz, const IndexType* __restrict__ row_idxs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values, size_type stride,
ValueType* __restrict__ result)
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < nnz) {
result[stride * row_idxs[tidx] + col_idxs[tidx]] = values[tidx];
}
}


} // namespace kernel
Loading

0 comments on commit 0436f3e

Please sign in to comment.