Format conversion, SpMV and write improvements #905

Merged Feb 1, 2022 (32 commits; changes shown from all commits)
Commits
b1c77eb  avoid calling empty GEMMs for CUDA/HIP (upsj, Oct 12, 2021)
3afdab6  simplify matrix write impl (upsj, Oct 12, 2021)
e4591d1  initialize Csr arrays to valid values (upsj, Oct 12, 2021)
e362d4f  fix CUDA compilation implicit include paths (upsj, Oct 13, 2021)
628e789  add CSR SpMM support, make `classical` fall-back (upsj, Oct 13, 2021)
04b404c  fix Csr initialization of empty load_balance mtx (upsj, Oct 18, 2021)
efd83d9  add is_zero/is_nonzero helpers (upsj, Nov 27, 2021)
5178a81  keep numeric zeros in matrix_data and conversions (upsj, Oct 18, 2021)
4a1d813  move row_ptr/row_idx conversion to components (upsj, Nov 29, 2021)
8a89975  unify convert_to(Dense) implementations (upsj, Nov 30, 2021)
e9803b9  unify convert_to(Csr) implementations (upsj, Nov 30, 2021)
ed864ac  fix benchmark LinOp warnings (upsj, Nov 30, 2021)
0462d4f  unify Dense/Csr::convert_to(...) (upsj, Dec 9, 2021)
03bd279  fix benchmark warnings (upsj, Dec 9, 2021)
89e5dac  fix sellp conversions (upsj, Dec 9, 2021)
ff833b4  fall back to classical in hip Csr SpMV (upsj, Dec 9, 2021)
57c56f9  fix sellp SpMV on GPUs (upsj, Dec 9, 2021)
eae6e48  fix const-correctness for Hybrid strategy (upsj, Dec 9, 2021)
b2dcca0  zero Fbcsr and Sellp pointer arrays (upsj, Dec 9, 2021)
72f0bbe  work around cuSPARSE issues with empty matrices (upsj, Dec 9, 2021)
77422ee  add Fbcsr conversion to Dense (upsj, Dec 9, 2021)
31ae3e6  guard empty CUDA kernel launches (upsj, Dec 9, 2021)
17891e5  improve sorting test output (upsj, Dec 9, 2021)
66b5ed2  remove unnecessary dim3 usages (upsj, Dec 9, 2021)
f732c3c  add conversions to Fbcsr (upsj, Dec 10, 2021)
f0a683d  review updates (upsj, Jan 24, 2022)
b4cfc27  simplify cuSPARSE type handling (upsj, Jan 25, 2022)
84ebf9d  fix accessor compilation issues (upsj, Jan 25, 2022)
b982ec6  review updates (upsj, Jan 30, 2022)
2d27767  add shortcuts for common reduction boilerplate (upsj, Jan 30, 2022)
f98170c  fix CUDA version detection for triangular solve (upsj, Jan 31, 2022)
a7379ad  fix dpcpp compilation (upsj, Jan 31, 2022)
6 changes: 6 additions & 0 deletions benchmark/CMakeLists.txt
@@ -12,6 +12,12 @@ endfunction()

function(ginkgo_benchmark_cusparse_linops type def)
add_library(cusparse_linops_${type} utils/cuda_linops.cu)
if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA")
# remove false positive CUDA warnings when calling one<T>() and zero<T>()
target_compile_options(cusparse_linops_${type}
PRIVATE
$<$<COMPILE_LANGUAGE:CUDA>:--expt-relaxed-constexpr>)
endif()
# make the dependency public to catch issues
target_compile_definitions(cusparse_linops_${type} PUBLIC ${def})
target_link_libraries(cusparse_linops_${type} Ginkgo::ginkgo ${CUDA_RUNTIME_LIBS} ${CUBLAS} ${CUSPARSE})
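Note on the new flag: --expt-relaxed-constexpr lets nvcc call constexpr functions that lack an explicit __device__ attribute from device code, which is exactly what the one<T>() and zero<T>() helpers mentioned in the comment trigger. A minimal sketch of the situation, not part of the PR; zero_like is a hypothetical stand-in for such helpers:

// compile with: nvcc --expt-relaxed-constexpr example.cu
template <typename T>
constexpr T zero_like()  // constexpr, but not annotated with __device__
{
    return T{};
}

template <typename T>
__global__ void fill_zero(T* data, int n)
{
    const int i = blockIdx.x * blockDim.x + threadIdx.x;
    if (i < n) {
        // nvcc warns about this call unless --expt-relaxed-constexpr is set
        data[i] = zero_like<T>();
    }
}

int main()
{
    double* data;
    cudaMalloc(&data, 128 * sizeof(double));
    fill_zero<<<1, 128>>>(data, 128);
    cudaDeviceSynchronize();
    cudaFree(data);
    return 0;
}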
75 changes: 69 additions & 6 deletions benchmark/utils/cuda_linops.cu
@@ -76,12 +76,6 @@ public:
}

protected:
-    void apply_impl(const gko::LinOp*, const gko::LinOp*, const gko::LinOp*,
-                    gko::LinOp*) const override
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
CusparseBase(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::LinOp(exec, size)
@@ -143,6 +137,12 @@ class CusparseCsrmp
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -174,6 +174,10 @@ protected:
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsrmp(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseCsrmp, CusparseBase>(exec, size),
@@ -203,6 +207,12 @@ class CusparseCsr
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -234,6 +244,10 @@ protected:
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsr(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseCsr, CusparseBase>(exec, size),
@@ -264,6 +278,12 @@ class CusparseCsrmm
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -296,6 +316,10 @@ protected:
dense_x->get_size()[0]);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsrmm(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseCsrmm, CusparseBase>(exec, size),
@@ -329,6 +353,12 @@ class CusparseCsrEx
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -380,6 +410,9 @@ protected:
// DEVICE for Ginkgo
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseCsrEx(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
@@ -422,6 +455,12 @@ class CusparseHybrid
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -471,6 +510,10 @@ protected:
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseHybrid(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseHybrid, CusparseBase>(exec, size),
@@ -551,10 +594,16 @@ class CusparseGenericCsr
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
cusparseIndexType_t cu_index =
gko::kernels::cuda::cusparse_index_type<IndexType>();
cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
using gko::kernels::cuda::as_culibs_type;
@@ -598,6 +647,10 @@ protected:
Alg);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseGenericCsr(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseGenericCsr, CusparseBase>(exec, size),
@@ -629,10 +682,16 @@ class CusparseGenericCoo
public:
using coo = gko::matrix::Coo<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;
cusparseIndexType_t cu_index =
gko::kernels::cuda::cusparse_index_type<IndexType>();
cudaDataType_t cu_value = gko::kernels::cuda::cuda_data_type<ValueType>();

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
using gko::kernels::cuda::as_culibs_type;
@@ -676,6 +735,10 @@ protected:
CUSPARSE_MV_ALG_DEFAULT);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

CusparseGenericCoo(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<CusparseGenericCoo, CusparseBase>(exec, size),
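Every wrapper class in this file receives the same two additions, so the pattern is worth stating once: a read(device_mat_data) overload that copies device data to the host and reuses the existing host-side read(), and the advanced-apply stub that moved from CusparseBase into each subclass. A condensed sketch of that pattern; CusparseWrapper is a placeholder name, and the constructors, cusparse handles, and Readable base classes are elided, so this fragment is illustrative rather than compilable on its own:

template <typename ValueType, typename IndexType>
class CusparseWrapper
    : public gko::EnableLinOp<CusparseWrapper<ValueType, IndexType>> {
public:
    using mat_data = gko::matrix_data<ValueType, IndexType>;
    using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

    // new overload: device_matrix_data input falls back to the host path
    void read(const device_mat_data& data)
    {
        this->read(data.copy_to_host());
    }

    void read(const mat_data& data)
    {
        // format-specific conversion and upload, unchanged by this PR
    }

protected:
    // simple apply (x = op(b)) remains the format-specific SpMV call
    void apply_impl(const gko::LinOp* b, gko::LinOp* x) const override
    {
        // cusparse SpMV dispatch, unchanged by this PR
    }

    // advanced apply (x = alpha * op(b) + beta * x) is not benchmarked;
    // previously stubbed once in CusparseBase, now declared per subclass
    void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
                    const gko::LinOp* beta,
                    gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;
};

The HIP wrappers in the next file receive the same treatment.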
36 changes: 30 additions & 6 deletions benchmark/utils/hip_linops.hip.cpp
@@ -65,12 +65,6 @@ class HipsparseBase : public gko::LinOp {
const gko::HipExecutor* get_gpu_exec() const { return gpu_exec_.get(); }

protected:
-    void apply_impl(const gko::LinOp*, const gko::LinOp*, const gko::LinOp*,
-                    gko::LinOp*) const override
-    {
-        GKO_NOT_IMPLEMENTED;
-    }
-
HipsparseBase(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::LinOp(exec, size)
@@ -130,6 +124,12 @@ class HipsparseCsr
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -161,6 +161,10 @@ class HipsparseCsr
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

HipsparseCsr(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<HipsparseCsr, HipsparseBase>(exec, size),
@@ -191,6 +195,12 @@ class HipsparseCsrmm
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -223,6 +233,10 @@ class HipsparseCsrmm
dense_x->get_size()[0]);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

HipsparseCsrmm(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<HipsparseCsrmm, HipsparseBase>(exec, size),
@@ -257,6 +271,12 @@ class HipsparseHybrid
public:
using csr = gko::matrix::Csr<ValueType, IndexType>;
using mat_data = gko::matrix_data<ValueType, IndexType>;
using device_mat_data = gko::device_matrix_data<ValueType, IndexType>;

void read(const device_mat_data& data) override
{
this->read(data.copy_to_host());
}

void read(const mat_data& data) override
{
@@ -306,6 +326,10 @@ class HipsparseHybrid
&scalars.get_const_data()[1], dx);
}

void apply_impl(const gko::LinOp* alpha, const gko::LinOp* b,
const gko::LinOp* beta,
gko::LinOp* x) const override GKO_NOT_IMPLEMENTED;

HipsparseHybrid(std::shared_ptr<const gko::Executor> exec,
const gko::dim<2>& size = gko::dim<2>{})
: gko::EnableLinOp<HipsparseHybrid, HipsparseBase>(exec, size),
3 changes: 2 additions & 1 deletion common/CMakeLists.txt
@@ -1,8 +1,9 @@
 set(UNIFIED_SOURCES
     base/index_set_kernels.cpp
-    components/device_matrix_data_kernels.cpp
     components/absolute_array_kernels.cpp
+    components/device_matrix_data_kernels.cpp
     components/fill_array_kernels.cpp
+    components/format_conversion_kernels.cpp
     components/precision_conversion_kernels.cpp
     components/reduce_array_kernels.cpp
     distributed/partition_kernels.cpp
4 changes: 2 additions & 2 deletions common/cuda_hip/components/warp_blas.hpp.inc
@@ -60,7 +60,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform(
ValueType* __restrict__ row, bool& __restrict__ status)
{
auto key_col_elem = group.shfl(row[key_col], key_row);
-    if (key_col_elem == zero<ValueType>()) {
+    if (is_zero(key_col_elem)) {
// TODO: implement error handling for GPUs to be able to properly
// report it here
status = false;
@@ -105,7 +105,7 @@ __device__ __forceinline__ void apply_gauss_jordan_transform_with_rhs(
{
auto key_col_elem = group.shfl(row[key_col], key_row);
auto key_rhs_elem = group.shfl(rhs[0], key_row);
-    if (key_col_elem == zero<ValueType>()) {
+    if (is_zero(key_col_elem)) {
// TODO: implement error handling for GPUs to be able to properly
// report it here
status = false;
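The is_zero call sites above come from commit efd83d9 ("add is_zero/is_nonzero helpers"). Plausibly the helpers are thin predicates over Ginkgo's existing zero<T>() constant; a sketch of that shape, where the exact Ginkgo definitions may differ in attributes and namespace, and zero<T>() is restated here only for self-containment:

// sketch only: Ginkgo's real definitions live in its math headers
template <typename T>
__host__ __device__ constexpr T zero()
{
    return T{};
}

template <typename T>
__host__ __device__ constexpr bool is_zero(T value)
{
    return value == zero<T>();
}

template <typename T>
__host__ __device__ constexpr bool is_nonzero(T value)
{
    return value != zero<T>();
}

A named predicate reads better at call sites like the ones above and avoids spelling out zero<ValueType>() in every comparison.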
55 changes: 0 additions & 55 deletions common/cuda_hip/matrix/coo_kernels.hpp.inc
@@ -218,58 +218,3 @@ __global__ __launch_bounds__(spmv_block_size) void abstract_spmm(


} // namespace


namespace kernel {


template <typename IndexType>
__global__ __launch_bounds__(default_block_size) void convert_row_idxs_to_ptrs(
const IndexType* __restrict__ idxs, size_type num_nonzeros,
IndexType* __restrict__ ptrs, size_type length)
{
const auto tidx = thread::get_thread_id_flat();

if (tidx == 0) {
ptrs[0] = 0;
ptrs[length - 1] = num_nonzeros;
}

if (0 < tidx && tidx < num_nonzeros) {
if (idxs[tidx - 1] < idxs[tidx]) {
for (auto i = idxs[tidx - 1] + 1; i <= idxs[tidx]; i++) {
ptrs[i] = tidx;
}
}
}
}


template <typename ValueType>
__global__ __launch_bounds__(config::max_block_size) void initialize_zero_dense(
size_type num_rows, size_type num_cols, size_type stride,
ValueType* __restrict__ result)
{
const auto tidx_x = threadIdx.x + blockDim.x * blockIdx.x;
const auto tidx_y = threadIdx.y + blockDim.y * blockIdx.y;
if (tidx_x < num_cols && tidx_y < num_rows) {
result[tidx_y * stride + tidx_x] = zero<ValueType>();
}
}


template <typename ValueType, typename IndexType>
__global__ __launch_bounds__(default_block_size) void fill_in_dense(
size_type nnz, const IndexType* __restrict__ row_idxs,
const IndexType* __restrict__ col_idxs,
const ValueType* __restrict__ values, size_type stride,
ValueType* __restrict__ result)
{
const auto tidx = thread::get_thread_id_flat();
if (tidx < nnz) {
result[stride * row_idxs[tidx] + col_idxs[tidx]] = values[tidx];
}
}


} // namespace kernel
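These kernels were removed rather than lost: per commit 4a1d813, the row_idx/row_ptr conversion moved into the shared components (note the new components/format_conversion_kernels.cpp entry in common/CMakeLists.txt above), while the dense initialization and fill kernels appear to be subsumed by the unified convert_to(Dense) work from commits 8a89975 and 0462d4f. For reference, one standard sequential way to compute the same result as the deleted convert_row_idxs_to_ptrs kernel, assuming idxs is sorted ascending; this is a reference sketch, not the code that replaced the kernel:

#include <vector>

template <typename IndexType>
std::vector<IndexType> row_idxs_to_ptrs(const std::vector<IndexType>& idxs,
                                        IndexType num_rows)
{
    std::vector<IndexType> ptrs(num_rows + 1, 0);
    // count the number of entries in each row
    for (auto row : idxs) {
        ptrs[row + 1]++;
    }
    // exclusive prefix sum turns the counts into row pointers,
    // so ptrs[num_rows] ends up equal to the number of nonzeros
    for (IndexType row = 0; row < num_rows; row++) {
        ptrs[row + 1] += ptrs[row];
    }
    return ptrs;
}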